#!/cygdrive/c/python34/python
# Program to check the data files.  This is the most frequently run program.  It checks
# that the birds in the inventory database have jpeg files and vice versa.
# It also makes sure that the birds/places in bird_class and place_class have entries
# in the inventory database and that the .jpg file mentioned points to a valid entry.
# Additionally, the bird_facts database is checked to make sure its entries are valid
# birds, and some error checking is done on its fields.
# Other data checking is also done on the various databases mentioned.  This program
# should be run each time the databases are updated.
import pdb
import subprocess
from bird_utils import ls_cmd
from bird_utils import get_image_size
import openpyxl
from openpyxl import load_workbook
import numbers
#import warnings

# I needed this ignore-warning for the bird_facts database, but I was able to fix it by
# writing the database out as a tab-delimited file and then reading it back in, making
# sure the egg column was text
#warnings.filterwarnings("ignore")
# I do this because of irritating warning error with openpyxl
# get
#     C:\python34\lib\site-packages\openpyxl\workbook\names\named_range.py:125:
#        UserWarning: Discarded range with reserved name
#     warnings.warn("Discarded range with reserved name")

# read in the bird_facts database and check for errors

# read in the inventory and place databases and check for errors
def read_excel (row):
    """Pull the leading cells out of a spreadsheet row and report bad values.

    Used on the rows of the inventory and place workbooks.  Problems are
    reported with print(); bad data never raises.

    Args:
        row: iterable of cell objects, each exposing a ``value`` attribute
            (for example a row yielded by an openpyxl worksheet's ``rows``).

    Returns:
        A list with the values of at most the first 8 cells, padded with ""
        to at least 6 entries.  Empty cells (value None) become "".
        Entries checked here: [0] date as an integer YYYYMMDD, [1] priority,
        [2] breeding code, [3] image name, [4] bird (or place) name,
        [5] location.  Entries 3-5 that are not strings are reported and then
        replaced by their str() form with trailing whitespace removed.
    """
    fields = []
            # the loop stops once 8 cells have been collected
    for num, cell in enumerate(row, start=1):
                    # if nothing in cell, make it an empty string
        fields.append("" if cell.value is None else cell.value)
        if num > 7:
            break

            # a short row would otherwise raise IndexError in the checks below
    while len(fields) < 6:
        fields.append("")

            # the image name prefixes every message; str() keeps the message
            # printable even when the cell holds a number
    label = str(fields[3])

            # is date field an integer and within 20000101..30000000
    if not isinstance(fields[0], numbers.Integral):
        print (label + ': not a digit in date field: ' + str(fields[0]))
    elif fields[0] < 20000101 or fields[0] > 30000000:
        print (label + ': not a valid date in date field: ' + str(fields[0]))

            # check the priority field, must not be < 1
    if not isinstance(fields[1], numbers.Integral):
        print (label + ': not an integer in priority field: ' + str(fields[1]))
    elif fields[1] < 1:
        print (label + ': priority field less than 1: ' + str(fields[1]))

            # check the breeding field -- 0 to 4 (0 means not a real bird - bird are 1-4)
    if not isinstance(fields[2], numbers.Integral):
        print (label + ': not a integer for breeding info: ' + str(fields[2]))
    elif fields[2] < 0 or fields[2] > 4:
        print (label + ': breeding data not between 1-4: ' + str(fields[2]))

            # check to make sure the image name, 'bird' name, and locations are text strings
            # and the length of the string is greater than 0
    for idx, what in ((3, 'image name'), (4, 'bird name'), (5, 'location')):
        if not isinstance(fields[idx], str):
            print (label + ': May have erroneous ' + what + ' -- not a string: ' + str(fields[idx]))
                    # hand the callers a real string so their strip()/concatenation works
            fields[idx] = str(fields[idx]).rstrip()
        if not fields[idx]:
            print (label + ': No text found in ' + what + ': ' + str(fields[idx]))

    return fields
#
# Main Routine
#

im = {}             # 'date:name' keys for .jpg files listed under an 'Images:' header
full = {}           # same keys, for files listed under a 'Full_res:' header
im_sm = {}          # same keys, for files listed under an 'Images_sm:' header
inv = {}            # holds the picture info from the inventory db
breed = {}          # holds the info on the breeding status
birds = {}          # from the inventory db, the name of the birds
places = {}         # from the places db, the name of the places
class_birds = {}    # from bird_class, all the birds
class_places = {}   # from place_class, all the places
comnd = 'ls -R'     # not referenced by the code in this file section; kept as-is
pathn = '../2*'

# First get a list of the files in the picture directories.
# What is returned by 'ls' is the various sub-directories
# on separate lines (each ending in ':') followed by the files in that
# directory.  Each file is on a separate line.
# Three dictionaries are filled, one for each of the tracked sub-directories.
# NOTE(review): assumes ls_cmd returns one listing line per element with no
# trailing newline -- confirm against bird_utils.
#pdb.set_trace()
_tracked_dirs = (('Images:', im), ('Full_res:', full), ('Images_sm:', im_sm))
date = None         # path components 1 and 2 of the current tracked header, joined
sub = None          # header suffix of the current tracked directory, or None
_target = None      # dictionary that receives files of the current directory
files = ls_cmd("-R", pathn)
for i in files:
    entry = i.rstrip()
    if entry.endswith(':'):
                # A directory header.  Forget the previous directory first so
                # that files of an untracked directory are never filed under
                # whatever tracked directory happened to come before it.
        sub = None
        _target = None
        fields = entry.split('/')
        if len(fields) > 2:
            for _suffix, _table in _tracked_dirs:
                if _suffix in entry:
                    date = fields[1] + fields[2]
                    sub = _suffix
                    _target = _table
                    break
                # looking for the .jpg files (suffix test, any letter case)
    elif entry.lower().endswith('.jpg'):
                # not inside a tracked directory: nothing to record
        if _target is None:
            continue
                # want to skip over files with '_mod' because they should have
                # the original file
        if '_mod' in entry:
            continue
                # drop the 4-character '.jpg' extension
        ii = date + ':' + entry[0:-4]
        _target[ii] = 1

imkeys = sorted(im)
flkeys = sorted(full)
smkeys = sorted(im_sm)

            # read in the inventory database, do some error checking
            # and make sure each entry has a corresponding .jpg file in 
            # Images, Images_sm, and Full_res.
print ('Checking the inventory database against the .jpg files:')
        # open the inventory workbook and walk the rows of its 'inventory' sheet
wb = openpyxl.load_workbook('inventory.xlsx', read_only=True)
active = wb.active
sheet = wb['inventory']
for row in sheet.rows:
    fields = read_excel (row)

            # The lookup key is the first 6 characters of the date field,
            # a ':' and the picture name (no .jpg), which is how the scanned
            # .jpg files are keyed.
    fields[0] = str(fields[0])
    value = fields[0][:6] + ':' + fields[3]
    shown = fields[0] + fields[3]
    for found, where in ((im, 'Images'), (full, 'Full_res'), (im_sm, 'Images_sm')):
        if value not in found:
            print (shown + ' not in ' + where)

            # the same key seen twice means a duplicated database entry
    if value in inv:
        print ('----Duplicate inventory database entry for: ' + value)

            # record 'name:breeding code' under the picture key
    fields[4] = fields[4].strip()
    inv[value] = fields[4] + ':' + str(fields[2])
            # remember the bird name for the later check against bird_class
    birds[fields[4]] = 1

#pdb.set_trace()
            # read in the place database, do some error checking
print ('Start checking the place database against the .jpg files:')
        # open the place workbook and walk the rows of its 'place' sheet
wb_p = openpyxl.load_workbook('place.xlsx', read_only=True)
active_p = wb_p.active
sheet_p = wb_p['place']
for row in sheet_p.rows:
    fields = read_excel (row)

            # lookup key: first 6 characters of the date field, ':' and the
            # picture name (no .jpg)
    fields[0] = str(fields[0])
    value = fields[0][0:6] + ':' + fields[3]
            # The '----' marker is part of the message text.  It used to sit
            # outside the quotes, where it was parsed as unary minus on a
            # string and raised TypeError.
    if value not in im:
        print ('----' + fields[0] + fields[3] + ' not in Images')
    if value not in full:
        print ('----' + fields[0] + fields[3] + ' not in Full_res')
    if value not in im_sm:
        print ('----' + fields[0] + fields[3] + ' not in Images_sm')
            # inv already holds the inventory entries, so this also catches a
            # picture that appears in both databases
    if value in inv:
        print ('----Duplicate property database entry for: ' + value)
            # record 'name:breeding code' under the picture key
    fields[4] = fields[4].strip()
    inv[value] = fields[4] + ':' + str(fields[2])
            # save the place name for later checking against place_class
    places[fields[4]] = 1


            # now check the .jpg files against the inv & place databases making
	    # sure that each image file (3 versions of each file) has an entry
	    # in the database
print ('Start checking the .jpg files against the database:')
        # every scanned picture key must have been recorded in inv by the
        # inventory or place pass
for heading, names in (
        ('  Checking files in Image subdirectory', imkeys),
        ('  Checking files in Full-res subdirectory', flkeys),
        ('  Checking files in Image-sm subdirectory', smkeys)):
    print (heading)
    for k in names:
        if k in inv:
            continue
        print ('----Not in inventory database: ' + k )

        # read in the bird_facts.txt database
print ('Checking bird_facts database')

wbf = openpyxl.load_workbook('bird_facts.xlsx', read_only=True)
activef = wbf.active
sheetf = wbf['bird_facts']

b_facts = []        # column 0 of every row, in sheet order
for line in sheetf.rows:
            # only want the first 10 cells in a row; empty cells become ""
    fields = []
    for cell in line:
        fields.append("" if cell.value is None else cell.value)
        if len(fields) > 9:
            break
            # a short row would otherwise raise IndexError below
    while len(fields) < 10:
        fields.append("")

    b_facts.append(fields[0])

            # first check out the 2 lengths and wingspan (columns 1-3)
    #pdb.set_trace()
    for i in range(1, 4):
                # str() so that a numeric cell is reported instead of crashing
        v = str(fields[i]).replace(' ', '')
        if not v: continue      # empty field
        if 'N/A' in v: continue
        if not ('in' in v or 'ft' in v or 'cm' in v or 'm' in v):
            print ('  Not in, ft, or cm in 1-3 fields: ', fields[0])
        for unit in ('in', 'ft', 'cm', 'm'):
            v = v.replace(unit, '')
        if '-' in v:
            sfield = v.split('-')
                    # None marks "did not parse" so the range comparison below
                    # never uses an unbound or left-over value
            x = None
            y = None
            try:
                x = float(sfield[0])
            except ValueError:
                print ('  ', sfield[0], '  is not a number in: ', fields[0])
            try:
                y = float(sfield[1])
            except ValueError:
                print ('  ', sfield[1], ' is not a number in: ', fields[0])
            if x is not None and y is not None and x > y:
                print ('  2nd length number less than 1st: ', fields[0])
        else:
                    # validate the value itself (not the column index)
            try:
                x = float(v)
            except ValueError:
                print (' ', v, '  is not a number in: ', fields[0])

            # handle the two weight fields (columns 4-5)
    for i in range(4, 6):
        v = str(fields[i]).replace(' ', '')
        if not v: continue      # empty field
        if not ('g' in v or 'oz' in v or 'lb' in v or 'NoInfo' in v):
            print ('  Not g, kg, oz, or lb in 4-5 fields: ', fields[0])
                # 'kg' must be removed before 'g'
        for unit in ('kg', 'g', 'oz', 'lb'):
            v = v.replace(unit, '')
        if '-' in v:
            sfield = v.split('-')
            x = None
            y = None
            try:
                x = float(sfield[0])
            except ValueError:
                print ('  ', sfield[0], '  is not a number in: ', fields[0])
            try:
                y = float(sfield[1])
            except ValueError:
                print ('  ', sfield[1], ' is not a number in: ', fields[0])
                    # only compare when both ends parsed; a failed end was
                    # already reported above
            if x is not None and y is not None and x > y:
                print ('  2nd weight number less than 1st: ', fields[0])
        else:
            try:
                x = float(v)
            except ValueError:
                if 'NoInfo' not in v:
                    print (' ', v, '  is not a number in: ', fields[0])

            # strip all whitespace from column 9 (str.replace takes a literal,
            # not a regex, so '\s' never matched whitespace).
            # NOTE(review): column 9 is not read again in this loop.
    fields[9] = ''.join(str(fields[9]).split())
    aname = str(fields[0]).strip()

            # check on number of eggs (column 6): after dropping '+' signs the
            # field must be blank, a single number, or two numbers separated
            # by '--' with the second larger than the first.  A field whose
            # part before '--' is empty is accepted as-is.
    tmp = str(fields[6]).replace('+', '')
    eggs = tmp.split('--')
    #pdb.set_trace()
    if len(eggs) == 1:
        if not (eggs[0] == "" or eggs[0].isdigit()):
            print ('Problem with the egg field on line: ', line)
    elif len(eggs) == 2:
        if not eggs[0] == "":
            low_ok = eggs[0].isdigit()
            high_ok = eggs[1].isdigit()
            if not low_ok:
                print ('Problem with the egg field on line: ', line)
            if not high_ok:
                print ('Problem with the egg field on line: ', line)
                    # int() is only safe once both parts are known to be digits
            if low_ok and high_ok and not (int(eggs[1]) > int(eggs[0])):
                print ('Problem with the egg field on line: ', line)
    else:
        print ('Problem with the egg field on line: ', line)

            # check the Altricial/Precocial field (column 7).
            # NOTE(review): this is a substring test, so a blank cell or any
            # fragment of the text below is accepted -- confirm that is wanted.
    if str(fields[7]) not in 'Precocial Altricial Semi-precocial Semi-altricial':
        print ('Problem with Alricial/Precocial field: ', line)
            # check the endangered status (column 8); same substring caveat
    if str(fields[8]) not in 'Least Concern Vulnerable Not Evaluated Data Deficient Not recognized Endangered (by U.S.) Critically Endangered Near Threatened Not determined Extinct in the Wild':
        print ('Problem with endangered field: ', line)
            # remember the bird for the later breeding check.
            # NOTE(review): the test is on column 8 (endangered status), not on
            # a breeding column -- confirm which column was intended.
    if (fields[8]):
        breed[aname] = 1

#pdb.set_trace()
        # now run through the inventory and make sure that the breeding dates are correct
# Walk every inv entry ('name:breeding code').  The report itself is still
# switched off; the assignment to t is only a placeholder statement.
keys = list(inv)
for k in keys:
    fields = inv[k].split(':')
    name, status = fields[0], fields[1]
    listed = name in breed
    if (status in ('2', '3') and not listed) or (status == '4' and listed):
        t = 1	# just a garbage line until I fix the breeding status
        #print ('----Incorrect breeding status: ' + inv[k])



        # now read in and check the class databases (bird and place) against the
        # inventory database
print ('Checking bird_class database')
# 'with' closes the file even when a line fails part-way through the loop
with open('bird_class') as filein:
    for line in filein:
        line = line.rstrip('\n')
                # only look at bird lines
        if not line.startswith('bird:'):
            continue
        fields = line.split(':')

                # save the names for later checking against birds in inventory:
                # key is the trimmed name, value the name without spaces
        aname = fields[1].strip()
        bname = aname.replace(' ','')
        class_birds[aname] = bname

                # fields[3] hold the picture path info; report a line that has
                # no usable path instead of dying with IndexError
        ff = fields[3].split ('/') if len(fields) > 3 else []
        if len(ff) < 2:
            print ('----Malformed bird line in bird_class (no picture path): ' + line)
            continue
                # first two path components plus ':' plus the file name (-1),
                # matching the keys stored in inv
        pic = ff[0]+ff[1]+':'+ff[-1]
                # get rid of the 4-character .jpg extension
        pic = pic[0:-4]
                # get rid of spaces
        pic = pic.replace(' ','')
                # check for it in inventory database
        if pic not in inv:
            print ('----Following bird picture not in inventory database: ' + line)

	# now check that the birds in the facts database have an entry in bird_class
# every bird in the facts database needs an entry in bird_class
for abird in b_facts:
    if abird not in class_birds:
        #pdb.set_trace()
        print ('Following bird from bird_facts database not in bird_class: ', abird)
# ... and every bird in bird_class needs an entry in the facts database,
# except the two pseudo-entries named below.  The report used to be preceded
# by a live pdb.set_trace(), which stopped the whole run at the first mismatch.
_facts_known = set(b_facts)
for abird in class_birds:
    if abird in _facts_known:
        continue
    if abird == 'Just for Fun' or abird == 'Oil Spill':
        continue
    print ('Following bird from bird_class database not in bird_facts: ', abird)

        # Now check the place class database against the inventory database
print ('Checking place database')
# 'with' closes the file even when a line fails part-way through the loop
with open('place_class') as filein:
    for line in filein:
        line = line.rstrip('\n')
                # only look at place lines
        if not line.startswith('place:'):
            continue
        fields = line.split(':')

                # save the trimmed name for later checking of the place
                # database entries against place_class
        fields[1] = fields[1].strip()
        class_places[fields[1]] = 1

                # fields[2] hold the picture path info; report a line that has
                # no usable path instead of dying with IndexError
        ff = fields[2].split ('/') if len(fields) > 2 else []
        if len(ff) < 2:
            print ('----Malformed place line in place_class (no picture path): ' + line)
            continue
                # first two path components plus ':' plus the file name (-1),
                # matching the keys stored in inv
        pic = ff[0]+ff[1]+':'+ff[-1]
                # get rid of the 4-character .jpg extension
        pic = pic[0:-4]
                # get rid of spaces
        pic = pic.replace(' ','')
                # check for it in inventory database
        if pic not in inv:
            print ('----Following place picture not in inventory database: ' + line)

	# check to make sure the feet entries have an entry in bird_class
        # first get the list of names of birds without spaces from bird_class
# The feet and tongue pictures are named '<bird name without spaces>_<n>.jpg'.
# Both directories get the same two checks: the part before the last '_' must
# be one of the space-free names stored as values of class_birds, and a file
# name without any '_' is reported.
_part_checks = (
    ('Checking feet files against bird_class',
     '../feet/*.jpg',
     "  Following foot not in bird_class database",
     " Following foot does not have '_n' extension: "),
    ('Checking tongue files against bird_class',
     '../tongue/*.jpg',
     "  Following tongue not in bird_class database",
     " Following tongue does not have '_n' extension: "),
)
for _banner, pathn, _unknown_msg, _no_suffix_msg in _part_checks:
    print (_banner)
    birdsl = class_birds.values()
    files = ls_cmd(None, pathn)
    for line in files:
                # keep only what follows the last '/'
        line = line.rsplit('/', 1)[-1]
        _stem, _sep, _rest = line.rpartition('_')
        if _sep:
            line = _stem
            if line not in birdsl:
                print (_unknown_msg, line)
        elif line:
            print (_no_suffix_msg, line)


        # Now I need to make sure all the birds and places in the inventory
        # and place database have an entry in the in_class database
print ('Checking to make sure all bird and place in class databases')
        # names collected from the inventory rows must appear in bird_class
keys = list(birds)
for k in [name for name in keys if name not in class_birds]:
    print ('----Birds not in birds_class database: ' + k)
        # names collected from the place rows must appear in place_class
keys = list(places)
for k in [name for name in keys if name not in class_places]:
    #pdb.set_trace()
    print ('----Place not in place_class database: ' + k)

print ('Checking to make sure that bird entries in bird_class have pictures in inventory')
        # and the other direction: every bird_class name needs an inventory row
akeys = list(class_birds)
for k in [name for name in akeys if name not in birds]:
    print ('----Bird in bird_class but not in ventory:' + k)

        # Check for large files which probably means that the resolution is higher
        # than I want.  Usually, the maximum pixels in the longest side is 2500 pixels.
print ('Checking for large files in Images subdirectories')
pathn = '../2*/*/Images/*.jpg'
#pdb.set_trace()
files = ls_cmd('-s', pathn)

# Each useful listing line is '<size> <path>'.
# NOTE(review): assumes ls_cmd returns the 'ls -s' output one line per list
# element -- confirm against bird_utils.
# Parse the size as a number so the ordering is numeric (sorting the raw text
# puts '999 ...' ahead of '2700 ...'), split on the first run of whitespace so
# padded sizes and paths containing spaces survive, and skip lines that carry
# no size (blank lines, 'total N').
_sized = []
for line in files:
    fields = line.split(None, 1)
    if len(fields) != 2:
        continue
    try:
        _blocks = int(fields[0])
    except ValueError:
        continue
    _sized.append((_blocks, fields[1].rstrip(), line))
_sized.sort(reverse=True)

# only the 31 largest files are examined
for _blocks, _path, line in _sized[:31]:
    if _blocks > 2700:
        print ('----File too large: ', line)

            # the longest side should not exceed 2500 pixels
    width, height = get_image_size(_path)
    if width > 2500 : print ('File: ', _path, ' width:', width, ' (> 2500)')
    if height > 2500 : print ('File: ', _path, ' height:', height, ' (> 2500)')
