python source code of Form

from __future__ import absolute_import
import re
import requests
from requests.exceptions import RequestException
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
    LTTextBox, LTTextLine, LTTextBoxHorizontal, LAParams, LTChar,
    LTTextLineHorizontal
)
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdftypes import resolve1
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

from . import ut, irs
from .ut import log, ntuple, logg, stdout, Qnty, NL, pathjoin
from .config import cfg
from .xmp import xmp_to_dict
from .cmds import CommandParser, normalize, adjustNegativeField, CannotParse

# global so that theyre pickle-able
PageInfo = ntuple('PageInfo', 'pagenum pagewidth pageheight textpoz')
TextPozStruct = ntuple('TextPozStruct', 'text bbox chars charobjs')


class Form(object):
    def __init__(self, name, recurselevel):
        self.formName = name
        self.name = name
        self.recurselevel = recurselevel
        self.fields = []
        self.draws = []
        self.refs = []
        self.computedFields = ut.odict()
        self.upstreamFields = set()
        self.isCryptic = False
        try:
            form,sched = name
            self.nameAsTuple = name
        except ValueError:
            try:
                form,sched = name.split('s',1)
                assert form and sched
                self.nameAsTuple = form,sched
            except (ValueError,AttributeError):
                self.nameAsTuple = name, None

    def __eq__(self,o):
        return self.nameAsTuple==o.nameAsTuple

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return '<Form %s>' % (self.name, )

    def getFile(self, failurls):
        if hasattr(self.name, 'endswith') and self.name.endswith('.pdf'):
            # pdf suffix means the file is local
            fname = self.name
            url = None
        else:
            fname, url = self.download(cfg.formyear, failurls)
        log.name = fname
        self.fname = fname
        self.url = url

    def readInfo(self):
        prefix = self.fname.rsplit('.', 1)[0]
        log.name = prefix
        self.prefix = prefix
        pathPlusPrefix = pathjoin(cfg.pdfDir, prefix)
        self.fpath = pathPlusPrefix + '.pdf'
        cacheprefix = pathPlusPrefix + '-pdfinfo'
        infocache = None if not cfg.useCaches else ut.unpickle(cacheprefix)
        if infocache is None:
            self.docinfo, self.pageinfo = self.pdfInfo()
            ut.pickle((self.docinfo, self.pageinfo), cacheprefix)
        else:
            self.docinfo, self.pageinfo = infocache
        # todo should store this separately from self.name?
        self.name = self.docinfo['formName']

    def fixBugs(self):
        if cfg.formyear in ('2012', '2013'):
            for f in self.fields:
                # fix an error in 2012,2013 f1040 [in which there is no line
                # 59, only 59a and 59b] todo autodetect such errors by matching
                # draw text w/ field text?
                if 'Line 59. Cents.' in f['speak']:
                    f['speak'] = f['speak'].replace('Line 59', 'Line 60')

    def download(self, year, failurls):
        # download form from irs.gov into cfg.pdfDir if not already there
        formName = self.name
        year = int(year)
        formNamesToTry = irs.possibleFilePrefixes(formName)
        msgs = []
        foundfile = False
        url = ''
        if year < cfg.latestTaxYear:
            fnametmpl = '%(formName)s--%(year)s.pdf'
        else:
            fnametmpl = '%(formName)s.pdf'
        for formName in formNamesToTry:
            fname = fnametmpl % dict(formName=formName, year=year)
            destfname = formName + '.pdf'
            destfpath = pathjoin(cfg.pdfDir, destfname)
            if ut.exists(destfpath):
                foundfile = True
                break
        if not foundfile:
            for formName in formNamesToTry:
                fname = fnametmpl % dict(formName=formName, year=year)
                destfname = formName + '.pdf'
                destfpath = pathjoin(cfg.pdfDir, destfname)
                if not cfg.okToDownload:
                    msg = 'oops no ' + destfpath + ' and not okToDownload'
                    logg(msg, [log.error, stdout])
                    exit()
                try:
                    if year < cfg.latestTaxYear:
                        url = irs.prevurltmpl % (fname, )
                    else:
                        url = irs.currurltmpl % (fname, )
                    url = url.encode('utf-8')
                    if url in failurls:
                        continue
                    log.warn(
                        'downloading: ' + formName + ' from ' + str(url))
                    response = requests.get(url)
                    if response.status_code != 200:
                        # not a pdf, just an html error page
                        continue
                    pdf = response.content
                    fout = open(destfpath, 'wb')
                    fout.write(pdf)
                    fout.close()
                    foundfile = True
                    break
                except RequestException as e:
                    msgs.append('RequestException at ' + url)
                    msgs.append(str(e))
                    failurls.add(url)
                    log.error(e)
        if not foundfile:
            if not msgs:
                msgs.append('url does not exist:  {}'.format(url))
            raise Exception('\n'.join(msgs))
        return destfname, url

    def pdfInfo(self):
        # collect metadata from pdf file at document and page levels
        with open(self.fpath, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            docinfo = {}
            orgnIsIrs = True
            if 'Metadata' in doc.catalog:
                metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmpdict = xmp_to_dict(metadata)
                docinfo['titl'] = xmpdict['dc']['title']['x-default']
                docinfo['desc'] = xmpdict['dc'].get('description',{}).get('x-default')
                docinfo['isfillable'] = (
                    xmpdict['pdf'].get('Keywords', '').lower() == 'fillable')
                anyMonth = 'Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec'
                titlePttn1 = re.compile(ut.compactify(
                    r'''(?:(\d\d\d\d) )?   # 2016
                    Form ([\w-]+           # Form 1040
                    (?: \w\w?)?)           # AS
                    (?: or ([\w-]+))?      # or 1040A
                    (?:  ?\(?(?:Schedule ([\w-]+))\)?)?  # (Schedule B)
                    (?:  ?\((?:Rev|'''+anyMonth+''').+?\))?\s*$'''
                    ))
                # eg 2016 Form W-2 AS
                # eg 2015 Form 1120 S (Schedule D)
                # eg 2015 Form 990 or 990-EZ (Schedule E)
                # eg Form 8818  (Rev. December 2007)
                # eg Form 8849  (Schedule 2)  (Rev. January 2009)
                # eg Form 1066 (Schedule Q) (Rev. December 2013)
                # eg Form 1120S Schedule B-1 (December 2013)
                # 'Rev' means 'revised'
                m = re.search(titlePttn1, docinfo['titl'])
                if m:
                    taxyr, form1, form2, sched = m.groups()
                else:
                    titlePttn2 = re.compile(ut.compactify(
                        r'''(?:(\d\d\d\d) )?   # 2016
                        Schedule ([\w-]+)[ ]   # Schedule B
                        \(Form ([\w-]+)        # (Form 1040
                        (?: or ([\w-]+))? ?\)  # or 1040A)
                        (?: \((?:Rev|'''+anyMonth+''').+?\))?\s*$''',
                        ))
                    # eg 2015 Schedule M-3 (Form 1065)
                    # eg 2015 Schedule O (Form 990 or 990-EZ)
                    # eg Schedule O (Form 1120) (Rev. December 2012)
                    # eg Schedule C (Form 1065 ) (Rev. December 2014)
                    m = re.search(titlePttn2, docinfo['titl'])
                    if m:
                        taxyr, sched, form1, form2 = m.groups()
                    else:
                        msg = docinfo['titl'] + ' dont fit form title templates'
                        log.error(msg)
                        #raise Exception(msg)
                        orgnIsIrs = False
                if orgnIsIrs:
                    docinfo['taxyr'] = taxyr
                    form = form1 if not form2 or len(form1) < len(form2) else form2
                    docinfo['form'] = form
                    docinfo['sched'] = sched
                    docinfo['formName'] = form if not sched else (form, sched)
                    docinfo['fpath'] = self.fpath
                else:
                    # experimental, for CRA forms
                    docinfo['formName'] = docinfo['titl'].replace(' ', '')
            # Check if the document allows text extraction. If not, abort.
            if not doc.is_extractable:
                raise Exception('PDFTextExtractionNotAllowed')
            pageinfo = {}
            rr = Renderer()
            # for ipage,page in enumerate(doc.get_pages()):
            for ipage, page in enumerate(PDFPage.create_pages(doc)):
                pagenum = 1 + ipage
                if page.cropbox != page.mediabox:
                    log.warn(
                        'boxesDontMatch: cropbox!=mediabox on page %d:'
                        ' cropbox=%s; mediabox=%s',
                        pagenum, page.cropbox, page.mediabox)
                pagewidth = Qnty(
                    page.cropbox[2] - page.cropbox[0], 'printers_point')
                pageheight = Qnty(
                    page.cropbox[3] - page.cropbox[1], 'printers_point')
                pageinfo[pagenum] = PageInfo(
                    pagenum, pagewidth, pageheight, rr.renderPage(page))
        return docinfo, pageinfo

    def orderDependencies(self):
        # reorder by deps to avoid undefined vars
        computedFields = self.computedFields
        self.upstreamFields.difference_update(computedFields.keys())
        delays = []
        upstreamFieldsList = list(self.upstreamFields)
        for name, f in computedFields.items():
            for depfield in f['deps']:
                if depfield['uniqname'] not in upstreamFieldsList:
                    delays.append(name)
        for name in delays:
            val = computedFields[name]
            del computedFields[name]
            computedFields[name] = val

    def computeMath(self):
        # determines which fields are computed from others
        # 'dep' means dependency
        fields = self.fields if 'm' in cfg.steps else []
        for field in fields:
            math = CommandParser(field, self)
            speak = normalize(field['speak'])
            adjustNegativeField(field, speak)
            colinstruction = normalize(field['colinstruction'])
            instruction = colinstruction if colinstruction else speak
            sentences = re.split(r'\.\s*', instruction)
            for s in sentences:
                try:
                    math.parseInstruction(s, field)
                    log.debug('found [%s] in sentence [%s] in field %s',math,s,field['uniqname'])
                except CannotParse as e:
                    log.debug('%s',e)
            if math and math.terms:
                # todo checkbox instructions refer to the named textbox
                # eg 2016/8814/line15
                #    p1-cb2    Line 15. Tax. Is the amount on line 14 less than $1,050? No. Enter $105 here and see the Note below. Note. If you checked the box on line C above, see the instructions. Otherwise, include the amount from line 15 in the tax you enter on Form 1040, line 44, or Form 1040N R, line 42. Be sure to check box a on Form 1040, line 44, or Form 1040N R, line 42.
                #    p1-cb2L1T Line 15. Yes. Multiply line 14 by 10 percent (.10). Enter the result here and see the Note below. Note: If you checked the box on line C above, see the instructions. Otherwise, include the amount from line 15 in the tax you enter on Form 1040, line 44, or Form 1040N R, line 42. Be sure to check box a on Form 1040, line 44, or Form 1040N R, line 42.
                #    p1-t37    Line 15. Tax. Dollars.
                #    p1-t38    Line 15. Cents.
                # for now we just suppress the math here
                if field['typ']!='checkbox':
                    math.assembleFields()
            field['math'] = math
        self.orderDependencies()
        self.bfields = [ut.Bag(f) for f in fields]
        # just to shorten field['a'] to field.a


class Renderer(object):
    def __init__(self):
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # la=layout analysis
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        self.textPoz = None

    def renderPage(self, page):
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        # http://denis.papathanasiou.org/2010/08/04/extracting-text-images-
        # from-pdf-files/
        textPoz = TextPoz()
        for lt in layout:
            if isinstance(lt, (LTTextBoxHorizontal, LTTextBox, LTTextLine)):
                textPoz.add(lt)
        return textPoz


class TextPoz(object):
    # text positions
    FormPos = ntuple('FormPos', 'itxt ichar chrz bbox')

    def __init__(self):
        self.textPoz = []

    def add(self, ltobj):
        def quantify(tupl, unit):
            return [Qnty(qnty, unit) for qnty in tupl]

        def accum(ltobj, ltchars, chars):
            for lto in ltobj:
                if isinstance(lto, LTChar):
                    ltchartext = lto.get_text()
                    ltchars.append(
                        (ltchartext,
                         ut.Bbox(*quantify(lto.bbox, 'printers_point'))))
                    chars.append(ltchartext)
                elif isinstance(lto, LTTextLineHorizontal):
                    accum(lto, ltchars, chars)
        ltchars = []
        chars = []
        accum(ltobj, ltchars, chars)
        self.textPoz.append(TextPozStruct(
            ltobj.get_text(),
            ut.Bbox(*quantify(ltobj.bbox, 'printers_point')),
            ''.join(chars),
            ltchars))

    def optimize(self):
        # todo ensure objs are arranged by line (ie ypos) and left-to-right
        # within line
        pass

    def find(self, s):
        if not s:
            raise Exception('empty string')

        def findstr(sl, found):
            for itxt, (txt, bbx, chrz, charobjs) in enumerate(self.textPoz):
                chrz = chrz.lower()
                # require target to be bordered by start/end of string,
                # whitespace, or punctuation to avoid matching a mere subset of
                # the actual form referenced [eg to avoid finding '1040' in
                # '1040EZ']
                slsafe = re.escape(sl)
                slsafeExact = r'(?:^|[\s\W])(' + slsafe + r')(?:$|[\s\W])'
                log.info('slsafeExact=%s  chrz=%s', slsafeExact, chrz)
                for m in re.finditer(slsafeExact, chrz):
                    if m:
                        ichar = m.start()
                        bbox1 = charobjs[ichar][1]
                        bbox2 = charobjs[ichar + len(sl) - 1][1]
                        if not (bbox1.y0 == bbox2.y0 and bbox1.y1 == bbox2.y1):
                            log.info('bbox.y coords dont match for [%s]', sl)
                        bbox = ut.merge(bbox1, bbox2)
                        found.append(
                            TextPoz.FormPos(
                                itxt, ichar,
                                chrz[ichar: ichar + len(sl)], bbox))
                    else:
                        break
            return found
        sl = s.lower()
        found = []
        while not found:
            found = findstr(sl, found)
            if not found and ' ' in sl:
                sl = sl.rsplit(None, 1)[-1]
            else:
                break
        if not found:
            log.warn('textNotFound: ' + s + ' in ' + self.alltext().replace(
                NL, ' [newline] '))
        if len(found) > 1:
            msgtmpl = 'textRepeats: found too many (returning all of them),' \
                ' seeking %s in %s ... [run in debug mode for fulltext]: %s'
            log.info(
                msgtmpl, s, self.alltext().replace(NL, '  ')[:60], str(found))
            log.debug(' fulltext: seeking %s in %s',
                      s, self.alltext().replace(NL, '  '))
        return found

    def alltext(self):
        return NL.join(o.text for o in self.textPoz)