import os
import re
import subprocess
from datetime import datetime
from functools import wraps

import requests
from bs4 import BeautifulSoup
from docx import Document
from PyPDF2 import PdfFileReader

from osp.common import config


def requires_attr(attr):
    """
    Skip a method (returning None) unless the instance has a truthy
    value for the given attribute.

    Args:
        attr (str): The required attribute.

    Returns:
        function: A decorator that guards the wrapped method.
    """
    def decorator(func):
        # Preserve the wrapped method's name / docstring for introspection.
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # Missing attributes and falsy values (None, '', 0, ...) are
            # treated alike: the method is not called.
            if getattr(self, attr, None):
                return func(self, *args, **kwargs)
            return None
        return wrapper
    return decorator


def int_to_dir(i):
    """
    Convert an integer offset to a segment name.

    The offset is rendered as lowercase hex, left-padded with zeros to at
    least three characters (e.g. 10 -> '00a', 4096 -> '1000').

    Args:
        i (int): The integer offset.

    Returns:
        str: The segment directory name.
    """
    # format() avoids slicing off the '0x' prefix by position, which
    # mangles negative numbers ('-0x1'[2:] == 'x1').
    return format(i, 'x').zfill(3)


def html_text(path, exclude=None):
    """
    Convert HTML to text.

    Args:
        path (str): The file path.
        exclude (list): A list of tags to ignore. Defaults to
            ['script', 'style'] when omitted.

    Returns:
        str: The extracted text.
    """
    # Build the default per call rather than sharing one mutable list
    # across all invocations.
    if exclude is None:
        exclude = ['script', 'style']

    with open(path, 'rb') as fh:
        soup = BeautifulSoup(fh, 'lxml')

    # Remove the excluded elements from the tree before reading the text.
    for tag in soup(exclude):
        tag.extract()

    return soup.get_text()


def pdf_text(path):
    """
    Convert a PDF to text by running the `pdf2txt.py` script found in the
    configured `osp.bin` directory.

    Args:
        path (str): The file path.

    Returns:
        str: The extracted text.

    Raises:
        subprocess.CalledProcessError: If the script exits non-zero.
    """
    cmd = os.path.join(config['osp']['bin'], 'pdf2txt.py')

    # Argument list (no shell), so the path is passed through verbatim.
    txt = subprocess.check_output([cmd, path])

    return txt.decode('utf8')


def docx_text(path):
    """
    Convert a document to plaintext by PUTting its bytes to the server
    configured under `tika.server` and requesting a `text/plain` response.

    Args:
        path (str): The file path.

    Returns:
        str: The extracted text (the response body).
    """
    with open(path, 'rb') as fh:
        r = requests.put(
            config['tika']['server'],
            headers={'Accept': 'text/plain'},
            data=fh.read(),
        )

    return r.text


def pdf_date(path):
    """
    Extract a date from PDF file metadata.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.

    Raises:
        AttributeError: If the creation stamp contains no digits.
        ValueError: If the digits don't form a `%Y%m%d%H%M%S` timestamp.
    """
    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone: keep only the first run of
    # digits in the stamp.
    stamp = reader.documentInfo['/CreationDate']
    match = re.search(r'\d+', stamp)

    return datetime.strptime(match.group(), '%Y%m%d%H%M%S')


def docx_date(path):
    """
    Extract a date from DOCX file metadata.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date, read from the document's core
            properties.
    """
    return Document(path).core_properties.created