import os
import subprocess
import requests
import re

from osp.common import config
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileReader
from docx import Document
from datetime import datetime


def requires_attr(attr):

    """
    Only run the decorated method when an instance attribute is set.

    The wrapped method is called only if `getattr(self, attr, None)` is
    truthy. If the attribute is missing or holds a falsy value (None, 0,
    an empty string / container, ...), the method is skipped and None is
    returned instead.

    Args:
        attr (str): The required attribute.

    Returns:
        function: A decorator that guards a method on `attr`.
    """

    from functools import wraps

    def decorator(func):

        # Keep the wrapped method's __name__ / __doc__ so that decorated
        # methods stay introspectable.
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if getattr(self, attr, None):
                return func(self, *args, **kwargs)
            return None

        return wrapper

    return decorator


def int_to_dir(i):

    """
    Build a segment directory name from an integer offset.

    The offset is rendered as lowercase hexadecimal without the `0x`
    prefix and left-padded with zeros to at least three characters.

    Args:
        i (int): The integer offset.

    Returns:
        str: The segment directory name.
    """

    # Drop the leading `0x` that hex() emits.
    digits = hex(i)[2:]

    return digits.rjust(3, '0')


def html_text(path, exclude=None):

    """
    Convert HTML to text.

    Args:
        path (str): The file path.
        exclude (list): Names of tags to remove before the text is
            extracted. Defaults to `['script', 'style']`.

    Returns:
        str: The extracted text.
    """

    # Build the default on each call instead of sharing a single mutable
    # list object across every invocation.
    if exclude is None:
        exclude = ['script', 'style']

    with open(path, 'rb') as fh:

        soup = BeautifulSoup(fh, 'lxml')

        # Detach the excluded elements from the tree so that their
        # contents are not included in the extracted text.
        for tag in soup(exclude):
            tag.extract()

        return soup.get_text()


def pdf_text(path):

    """
    Extract the text of a PDF by running the `pdf2txt.py` script.

    The script is looked up in the directory given by
    `config['osp']['bin']`, and its standard output is decoded as UTF-8.

    Args:
        path (str): The file path.

    Returns:
        str: The extracted text.
    """

    bin_dir = config['osp']['bin']
    script = os.path.join(bin_dir, 'pdf2txt.py')

    # check_output raises if the script exits with a non-zero status.
    raw = subprocess.check_output([script, path])

    return raw.decode('utf8')


def docx_text(path):

    """
    Convert a document to plaintext via the configured text server.

    The file's bytes are sent in a PUT request to the URL stored at
    `config['tika']['server']` (presumably an Apache Tika server), asking
    for a `text/plain` response.

    Args:
        path (str): The file path.

    Returns:
        str: The extracted text (the body of the response).
    """

    # Read the whole document up front; it is sent as the request body.
    with open(path, 'rb') as fh:
        payload = fh.read()

    response = requests.put(
        config['tika']['server'],
        headers={'Accept': 'text/plain'},
        data=payload
    )

    return response.text


def pdf_date(path):

    """
    Extract a date from PDF file metadata.

    Reads the `/CreationDate` entry of the document info, takes the first
    run of digits in it, and parses that as `YYYYMMDDHHMMSS`.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date. The value is naive: anything after
            the first run of digits (e.g. a timezone offset) is ignored.

    Raises:
        AttributeError: If the creation date contains no digits.
        ValueError: If the digits do not match `%Y%m%d%H%M%S`.
    """

    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone: the first run of digits is the
    # timestamp itself.
    # NOTE(review): presumably this lookup fails when the PDF carries no
    # info dictionary or no creation date - confirm against PyPDF2.
    stamp = reader.documentInfo['/CreationDate']

    # Raw string, so that `\d` reaches the regex engine rather than being
    # treated as an (invalid) string escape sequence.
    match = re.search(r'\d+', stamp)

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )


def docx_date(path):

    """
    Read the creation date stored in a DOCX file's core properties.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.
    """

    document = Document(path)
    properties = document.core_properties

    return properties.created