import os
import wget
import pytest
from cdqa.utils.converters import pdf_converter, md_converter


@pytest.fixture(scope="session")
def download_test_assets(tmpdir_factory):
    """Download the converter test documents into a temporary directory.

    The fixture is session-scoped, so the downloads happen once per test
    session. Every URL is fetched with ``wget.download`` into a directory
    created through ``tmpdir_factory.mktemp("assets_data")``.

    Returns:
        The temporary directory object that holds the downloaded files.
    """
    # Documents labelled as PDF assets.
    pdf_urls = [
        "https://invest.bnpparibas.com/documents/1q19-pr-12648",
        "https://invest.bnpparibas.com/documents/4q18-pr-18000",
        "https://invest.bnpparibas.com/documents/4q17-pr",
    ]
    # Documents labelled as Markdown assets.
    md_urls = [
        "https://raw.githubusercontent.com/cdqa-suite/cdQA/master/README.md",
        "https://raw.githubusercontent.com/huggingface/pytorch-transformers/master/docs/source/quickstart.md",
        "https://raw.githubusercontent.com/huggingface/pytorch-transformers/master/docs/source/migration.md",
    ]

    print("\nDownloading assets...")
    target_dir = tmpdir_factory.mktemp("assets_data")
    # wget is given the destination as a plain string path.
    destination = str(target_dir)
    for asset_url in pdf_urls + md_urls:
        wget.download(url=asset_url, out=destination)
    return target_dir


class Test_converter:
    """Tests running ``md_converter`` and ``pdf_converter`` on the downloaded assets."""

    @pytest.fixture(autouse=True)
    def get_assets_folder(self, download_test_assets):
        """Store the directory returned by ``download_test_assets`` on the instance."""
        self.assets_folder = download_test_assets

    def df_converter_check(self, df, include_line_breaks=False):
        """Validate a converter result and fail with every problem found.

        Problems are accumulated instead of asserted one by one, so a single
        failure message lists all of them.

        Args:
            df: Object returned by a converter; it is read through ``shape``,
                ``paragraphs`` and ``title``.
            include_line_breaks: When true, additionally compare the sorted
                per-row paragraph counts with the expected ``[58, 80, 87]``.

        Raises:
            AssertionError: If at least one check failed.
        """
        problems = []

        # Expected layout: three rows, two columns.
        if df.shape != (3, 2):
            problems.append("resulting dataframe has unexpected shape.")

        # First row only: the first paragraph and the title must be strings.
        if not isinstance(df.paragraphs[0][0], str) or not isinstance(
            df.title[0], str
        ):
            problems.append("paragraph column content has wrong format.")

        if include_line_breaks:
            # Counts are sorted so the comparison ignores row order.
            paragraph_counts = sorted(
                len(df.paragraphs[row]) for row in range(df.shape[0])
            )
            if paragraph_counts != [58, 80, 87]:
                problems.append(
                    f"error in number of paragraphs : {paragraph_counts}"
                )

        # A single assertion reports every collected message at once.
        assert not problems, "errors occured:\n{}".format("\n".join(problems))

    def test_md_converter(self):
        """Convert the assets folder with ``md_converter`` and validate the result."""
        converted = md_converter(directory_path=self.assets_folder)
        self.df_converter_check(converted)

    def test_pdf_converter(self):
        """Convert the assets folder with ``pdf_converter``, without and with line breaks."""
        converted = pdf_converter(directory_path=self.assets_folder)
        self.df_converter_check(converted)

        converted_with_breaks = pdf_converter(
            directory_path=self.assets_folder, include_line_breaks=True
        )
        self.df_converter_check(converted_with_breaks, include_line_breaks=True)