import os
import pickle
from tqdm import tqdm
import wikipedia
from requests.exceptions import ConnectionError
from wikipedia.exceptions import DisambiguationError, PageError

from qanta.util.environment import BONUS_ANSWER_PAGES
from qanta.util.multiprocess import _multiprocess
from qanta.datasets.quiz_bowl import BonusQuestionDatabase

def download_pages():
    bonus_questions = BonusQuestionDatabase().all_questions()
    train_answers = set()
    for q in bonus_questions.values():
        train_answers.update(q.pages)
    
    if os.path.isfile(BONUS_ANSWER_PAGES):
        with open(BONUS_ANSWER_PAGES, 'rb') as f:
            try:
                pages = pickle.load(f)
                print('loaded {} pages'.format(len(pages)))
            except EOFError:
                pages = dict()
    else:
        pages = dict()

    train_answers = train_answers - set(pages.keys())

    for answer in tqdm(train_answers):
        if answer in pages:
            continue
        try:
            page = wikipedia.page(answer)
        except (DisambiguationError, PageError, ConnectionError) as e:
            if isinstance(e, DisambiguationError):
                pages[answer] = None
                continue
            if isinstance(e, PageError):
                pages[answer] = None
                continue
            if isinstance(e, ConnectionError):
                break
        try:
            pages[answer] = [page.title, page.content, page.links,
                    page.summary, page.categories, page.url, page.pageid]
        except ConnectionError:
            break

    with open(BONUS_ANSWER_PAGES, 'wb') as f:
        pickle.dump(pages, f)
    
if __name__ == '__main__':
    download_pages()