#!/usr/bin/python
import glob
import os
import os.path
import random
import re
import string
import subprocess
import sys
import tempfile

# FIXME
# -- better documentation
# -- better command line error checking
# -- use temporary file names instead of _truth, _ocr
# -- more informative error messages
# -- record machine, processing times
# -- get rid of ./distance
# -- output data into sqlite database
# -- use MD5 checksums on executables, page images

# Paths to the external command-line tools this script drives.  Both are
# relative paths, so they are resolved against the current working directory.
#   ocropus  -- run with the arguments "ocr <page>" to recognize a page image;
#               its standard output is taken as the OCR text.
#   distance -- run with the arguments "<truth-file> <ocr-file> <cost>"; its
#               standard output is parsed as a float.
ocropus = "../ocropus-cmd/ocropus"
distance = "../ocropus-cmd/ocr-distance"
# Matches every run of characters that is NOT an ASCII letter, a digit, one of
# ",.+=/*?!", a space, a newline or a hyphen.
simp_re = re.compile(r'[^a-zA-Z0-9,.+=/*?! \n-]+')

def simplify(s):
    """Normalize text before truth and OCR output are compared.

    The substitutions are applied in order:
      1. delete every run of characters matched by simp_re;
      2. delete whitespace at the very start of the string;
      3. replace each remaining run of whitespace with a single newline.

    Trailing whitespace is therefore reduced to one trailing newline rather
    than removed.
    """
    stages = (
        (simp_re, ''),
        (r'^\s+', ''),
        (r'\s+', '\n'),
    )
    for pattern, replacement in stages:
        s = re.sub(pattern, replacement, s)
    return s

def die(s):
    """Write "FATAL: <s>" to stderr and terminate with exit status 1."""
    message = "FATAL: %s\n" % s
    sys.stderr.write(message)
    # Equivalent to sys.exit(1): the interpreter exits with status 1 unless a
    # caller catches SystemExit.
    raise SystemExit(1)
def ensure(s):
    """Abort the program via die() unless the path `s` exists."""
    if os.path.exists(s):
        return
    die("%s: file not found" % s)

def _write_temp_text(text, suffix):
    """Write `text` to a newly created temporary file and return its path."""
    handle, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(handle, "w") as stream:
            stream.write(text)
    except Exception:
        # Do not leave a half-written scratch file behind.
        os.unlink(path)
        raise
    return path

def compare_ocr(truth,ocr,cost):
    """Return the distance between ground truth and OCR output as a float.

    Both texts are normalized with simplify(), written to temporary files and
    handed to the external `distance` program, which is run as
    "<distance> <truth-file> <ocr-file> <cost>".  Whatever the program prints
    on standard output is parsed as a float; its exit status is not checked.

    truth -- ground-truth text
    ocr   -- OCR output text
    cost  -- passed through (as str(cost)) as the program's third argument

    Raises ValueError if the program prints nothing or something that is not
    a number.
    """
    paths = []
    try:
        # Unique temporary files instead of fixed "_truth"/"_ocr" names in the
        # working directory, so concurrent runs cannot overwrite each other.
        paths.append(_write_temp_text(simplify(truth), "_truth"))
        paths.append(_write_temp_text(simplify(ocr), "_ocr"))
        # Argument list rather than a shell string: nothing is re-parsed by a
        # shell, so unusual characters in the paths are harmless.
        process = subprocess.Popen([distance] + paths + [str(cost)],
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        output = process.communicate()[0]
    finally:
        # Clean up even when writing or running the tool fails.
        for path in paths:
            os.unlink(path)
    # strip() instead of chopping the last character: the latter silently
    # drops a digit when the output has no trailing newline.
    text = output.strip()
    if not text:
        raise ValueError("%s produced no output for cost %s" % (distance, cost))
    return float(text)

# ---------------------------------------------------------------------------
# Script body.
#
# Usage: <script> [directory]      (directory defaults to "../data/pages")
#
# For every "<name>.png" in the directory that has a "<name>.txt" next to it,
# the page is run through ocropus and the OCR text is compared against the
# contents of the .txt file for the costs 1, 5, 10, ..., 50.
#
# Standard output receives one record per line:
#   directory <directory>
#   page <page>
#   cost <page> <cost> <result>
# Progress messages go to standard error.
# ---------------------------------------------------------------------------

# Stop immediately if either external tool is missing.
ensure(ocropus)
ensure(distance)

# (cost, result) pairs accumulated over all pages.
data = []

directory = "../data/pages"
if len(sys.argv) > 1:
    directory = sys.argv[1]

# Keep only the page images that come with a ground-truth text file.
pages = [page for page in glob.glob(directory + "/*.png")
         if os.path.exists(os.path.splitext(page)[0] + ".txt")]
sys.stderr.write("evaluating the %d pages with ground truth in %s\n" % (len(pages), directory))

# ocropus is configured through environment variables; build the environment
# once instead of prefixing every command with "env ...".
ocr_env = dict(os.environ, hocr="0", remove_hyphens="0")

# sys.stdout.write() produces the same bytes as the former print statements
# while also being valid on Python 3.
sys.stdout.write("directory %s\n" % (directory,))
for page in pages:
    sys.stderr.write("=== %s ===\n" % page)
    sys.stdout.write("page %s\n" % (page,))
    root = os.path.splitext(page)[0]
    # Close the ground-truth file deterministically.
    with open(root + ".txt", "r") as stream:
        truth = stream.read()
    # Argument list rather than a shell string, so page paths containing
    # spaces or shell metacharacters are passed through intact.
    ocr = subprocess.Popen([ocropus, "ocr", page],
                           stdout=subprocess.PIPE,
                           env=ocr_env,
                           universal_newlines=True).communicate()[0]
    # list(range(...)) keeps the concatenation valid on Python 3 as well.
    for cost in [1] + list(range(5, 51, 5)):
        result = compare_ocr(truth, ocr, cost)
        sys.stdout.write("cost %s %s %s\n" % (page, cost, result))
        data.append((cost, result))
        # Flush per record so partial results survive an interrupted run.
        sys.stdout.flush()
