# Python source code of Converter

#!/usr/bin/env python
import logging
import docx2txt
from zipfile import ZipFile
# from pptx import Presentation
from subprocess import Popen, PIPE, STDOUT
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


class Converter(object):
    """
    Converts common document formats into text for parsing.

    Every public convert_* method is best effort: when anything goes
    wrong the failure is reported through the "SimplyEmail.Converter"
    logger and an empty string is returned instead of raising.
    """

    def __init__(self, verbose=False):
        """
        Set up the logger used by every conversion method.

        verbose = Flag stored on the instance as self.verbose
        """
        self.logger = logging.getLogger("SimplyEmail.Converter")
        self.verbose = verbose

    def _run_command(self, cmd):
        """
        Run an external tool and return everything it printed as text.

        cmd = The argument list handed to Popen (no shell is involved)

        stderr is merged into stdout, so diagnostics printed by the
        tool are part of the returned text. Bytes that are not ASCII
        are dropped while decoding.
        """
        p = Popen(cmd, stdout=PIPE, stderr=STDOUT)
        stdout, _ = p.communicate()
        return stdout.decode('ascii', 'ignore')

    def convert_docx_to_txt(self, path):
        """
        A very simple conversion function
        which returns unicode text for
        parsing.

        path = The path to the file

        Returns "" when the conversion fails.
        """
        # https://github.com/ankushshah89/python-docx2txt
        try:
            text = docx2txt.process(path)
            # Lazy %-args: the logging module formats the path itself, so
            # an unusual path can never abort the conversion.
            self.logger.debug("Converted docx to text: %s", path)
            return unicode(text)
        except Exception as e:
            # Report first, then return, so the failure is not silent.
            self.logger.error("Failed to DOCX to text: %s", e)
            return ""

    def convert_doc_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing.

        path = The path to the file

        The work is done by the external "antiword" command.
        Returns "" when the command cannot be run.
        """
        try:
            return self._run_command(['antiword', path])
        except Exception as e:
            self.logger.error("Failed to DOC to text: %s", e)
            return ""

    # def convert_pptx_to_txt(self, path):
    #     prs = Presentation(path)
    #     # text_runs will be populated with a list of strings,
    #     # one for each text run in presentation
    #     text_runs = ""
    #     try:
    #         for slide in prs.slides:
    #             try:
    #                 for shape in slide.shapes:
    #                     if not shape.has_text_frame:
    #                         continue
    #                     for paragraph in shape.text_frame.paragraphs:
    #                         for run in paragraph.runs:
    #                             text_runs += str(run.text) + ' '
    #             except:
    #                 pass
    #         return text_runs
    #     except Exception as e:
    #         if text_runs:
    #             return text_runs
    #         else:
    #             text_runs = ""
    #             return text_runs
    #         self.logger.error("Failed to convert pptx: " + str(e))

    def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file

        Every page is run through pdfminer with an empty password and
        the extracted text is encoded as utf-8.
        Returns "" when the conversion fails.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # The with block closes the file even if a page fails.
                with open(path, 'rb') as fp:
                    # Empty page set and maxpages=0: no page filtering.
                    pages = PDFPage.get_pages(
                        fp, set(), maxpages=0, password="", caching=True,
                        check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
                # Read the buffer before the finally clause closes it.
                return retstr.getvalue()
            finally:
                device.close()
                retstr.close()
        except Exception as e:
            self.logger.error("Failed to PDF to text: %s", e)
            return ""

    def convert_Xlsx_to_Csv(self, path):
        """
        Convert an xlsx workbook to csv text.

        path = The path to the file

        The work is done by the external "xlsx2csv" command.
        Returns "" when the command cannot be run.
        """
        # Using the Xlsx2csv tool seemed easy and was in python anyhow
        # it also supported custom delim :)
        self.logger.debug("convert_Xlsx_to_Csv on file: %s", path)
        try:
            return self._run_command(['xlsx2csv', path])
        except Exception as e:
            self.logger.error(
                "Failed to convert_Xlsx_to_Csv to text: %s", e)
            return ""

    def convert_zip_to_text(self, path, rawtext=True):
        """
        Read every member of a zip archive into memory.

        path = The path to the file
        rawtext = True to get all member contents joined into a single
                  string, False to get a dict of member name -> content

        Returns "" when the archive cannot be read, whatever the value
        of rawtext.
        """
        # http://stackoverflow.com/questions/10908877/extracting-a-zipfile-to-memory
        try:
            self.logger.debug("Attempting to unzip file: %s", path)
            # The with block releases the archive handle on every path.
            with ZipFile(path) as input_zip:
                contents = {
                    name: input_zip.read(name)
                    for name in input_zip.namelist()}
            if not rawtext:
                return contents
            text = "".join(str(data) for data in contents.values())
            self.logger.debug("Unzip of file complted (raw text): %s", path)
            return text
        except Exception as e:
            self.logger.error("Failed unzip file: %s", e)
            return ""