#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Loaders for the keyphrase extraction datasets (INSPEC, NUS, SemEval, KRAPIVIN,
KDD, WWW, UMD, DUC, KP20k etc.), plus helpers for POS-tagging and padding.
"""
from nltk.internals import find_jars_within_path
from nltk.tag import StanfordPOSTagger
from nltk import word_tokenize

import os
import re
import string
import shutil
import nltk
import xml.etree.ElementTree as ET
import numpy as np

import keyphrase.dataset.dataset_utils as utils
from keyphrase.dataset import dataset_utils
# cut_zero (used by the POS-tagging helpers below) lives in the project's
# keyphrase_utils module; this import path is an assumption based on the
# package layout
from keyphrase import keyphrase_utils
from emolga.utils.generic_utils import get_from_module
from emolga.dataset.build_dataset import deserialize_from_file, serialize_to_file

__author__ = "Rui Meng"
__email__ = "rui.meng@pitt.edu"

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')


class Document(object):
    def __init__(self):
        self.name = ''
        self.title = ''
        self.text = ''
        self.phrases = []

    def __str__(self):
        return '%s\n\t%s\n\t%s' % (self.title, self.text, str(self.phrases))


class DataLoader(object):
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
        self.name = self.__class__.__name__
        self.basedir = self.basedir  # raises AttributeError early if basedir was not passed in
        self.doclist = []

    def get_docs(self, return_dict=True):
        '''
        :return: a list of dicts instead of Document objects
        '''
        class_name = self.__class__.__name__.lower()
        if class_name == 'kdd' or class_name == 'www' or class_name == 'umd':
            self.load_xml(self.textdir)
        else:
            self.load_text(self.textdir)
            self.load_keyphrase(self.keyphrasedir)

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if return_dict:
            return doclist
        else:
            return self.doclist

    def __call__(self, idx2word, word2idx, type=1):
        self.get_docs()

        pairs = []
        for doc in self.doclist:
            try:
                title = utils.get_tokens(doc.title, type)
                text = utils.get_tokens(doc.text, type)
                if type == 0:
                    title.append('<eos>')
                elif type == 1:
                    title.append('.')
                title.extend(text)
                text = title

                # truncate: many texts are too long and would lead to out-of-memory
                if len(text) > 1500:
                    text = text[:1500]

                keyphrases = [utils.get_tokens(k, type) for k in doc.phrases]
                pairs.append((text, keyphrases))
            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % doc.name)
                # print(text)
                # print(keyphrases)
                # print('*' * 50)

        dataset = utils.build_data(pairs, idx2word, word2idx)

        return dataset, self.doclist

    def load_xml(self, xmldir):
        '''
        for KDD/WWW/UMD only
        :return: doclist
        '''
        for filename in os.listdir(xmldir):
            with open(xmldir + filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.xml')]

                printable = set(string.printable)
                # print((filename))
                try:
                    lines = textfile.readlines()
                    # keep printable characters only, otherwise the XML parser may choke
                    xml = ''.join([''.join(filter(lambda x: x in printable, l)) for l in lines])
                    root = ET.fromstring(xml)

                    doc.title = root.findall("title")[0].text
                    # was doc.abstract, but Document (and get_docs) expect .text
                    doc.text = root.findall("abstract")[0].text
                    doc.phrases = [n.text for n in root.findall("*/tag")]

                    self.doclist.append(doc)
                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename)

    def load_text(self, textdir):
        for fid, filename in enumerate(os.listdir(textdir)):
            with open(textdir + filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.txt')]

                printable = set(string.printable)
                # print((textdir+filename))
                try:
                    lines = textfile.readlines()
                    lines = [list(filter(lambda x: x in printable, l)) for l in lines]

                    # the 1st line is the title, the rest is the abstract
                    title = ''.join(lines[0]).encode('ascii', 'ignore').decode('ascii', 'ignore')
                    text = (' '.join([''.join(l).strip() for l in lines[1:]])).encode('ascii', 'ignore').decode('ascii', 'ignore')
                    # if lines[1].strip().lower() != 'abstract':
                    #     print('Wrong title detected : %s' % (filename))

                    doc.title = title
                    doc.text = text
                    self.doclist.append(doc)
                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename)

    def load_keyphrase(self, keyphrasedir):
        for did, doc in enumerate(self.doclist):
            phrase_set = set()
            if os.path.exists(self.keyphrasedir + doc.name + '.keyphrases'):
                with open(keyphrasedir + doc.name + '.keyphrases') as keyphrasefile:
                    phrase_set.update([phrase.strip() for phrase in keyphrasefile.readlines()])
            # else:
            #     print(self.keyphrasedir + doc.name + '.keyphrases Not Found')

            if os.path.exists(self.keyphrasedir + doc.name + '.keywords'):
                with open(keyphrasedir + doc.name + '.keywords') as keyphrasefile:
                    phrase_set.update([phrase.strip() for phrase in keyphrasefile.readlines()])
            # else:
            #     print(self.keyphrasedir + doc.name + '.keywords Not Found')

            doc.phrases = list(phrase_set)

    def load_testing_data_postag(self, word2idx):
        print('Loading testing dataset %s from %s' % (self.name, self.postag_datadir))

        text_file_paths = [self.text_postag_dir + n_ for n_ in os.listdir(self.text_postag_dir)]
        keyphrase_file_paths = [self.keyphrase_postag_dir + n_ for n_ in os.listdir(self.keyphrase_postag_dir)]

        def load_postag_text_(path):
            with open(path, 'r') as f:
                # each token is stored as "word_POS"
                tokens = ' '.join(f.readlines()).split(' ')
                text = [t.split('_')[0] for t in tokens]
                postag = [t.split('_')[1] for t in tokens]
                return text, postag

        def load_keyphrase_(path):
            with open(path, 'r') as f:
                keyphrase_str = ';'.join([l.strip() for l in f.readlines()])
                return dataset_utils.process_keyphrase(keyphrase_str)

        texts = [load_postag_text_(f_) for f_ in text_file_paths]
        keyphrases = [load_keyphrase_(f_) for f_ in keyphrase_file_paths]

        instance = dict(source_str=[], target_str=[], source=[], source_postag=[], target=[], target_c=[])
        for (source, postag), target in zip(texts, keyphrases):
            A = [word2idx[w] if w in word2idx else word2idx['<unk>'] for w in source]
            B = [[word2idx[w] if w in word2idx else word2idx['<unk>'] for w in p] for p in target]

            instance['source_str'] += [source]
            instance['target_str'] += [target]
            instance['source'] += [A]
            instance['source_postag'] += [postag]
            instance['target'] += [B]

        return instance

    def load_testing_data(self, word2idx):
        print('Loading testing dataset %s from %s' % (self.name, self.datadir))

        text_file_paths = [self.textdir + n_ for n_ in os.listdir(self.textdir) if n_.endswith('.txt')]
        # This is problematic: keep phrases in either '.txt' or '.keyphrases', but
        # don't keep both files (kp20k uses both; deleting either one would be fine).
        # Keep '.txt' only in the future.
        keyphrase_file_paths = [self.keyphrasedir + n_ for n_ in os.listdir(self.keyphrasedir) if n_.endswith('.txt')]
        if len(keyphrase_file_paths) == 0:
            keyphrase_file_paths = [self.keyphrasedir + n_ for n_ in os.listdir(self.keyphrasedir) if n_.endswith('.keyphrases')]

        def _load_text(path):
            with open(path, 'r') as f:
                text = ' '.join(f.readlines()).split(' ')
                return text

        def _load_keyphrase(path):
            with open(path, 'r') as f:
                keyphrase_str = [l.strip().split(' ') for l in f.readlines()]
                return keyphrase_str

        texts = [_load_text(f_) for f_ in text_file_paths]
        keyphrases = [_load_keyphrase(f_) for f_ in keyphrase_file_paths]

        instance = dict(source_str=[], target_str=[], source=[], source_postag=[], target=[], target_c=[])
        for source, target in zip(texts, keyphrases):
            A = [word2idx[w] if w in word2idx else word2idx['<unk>'] for w in source]
            B = [[word2idx[w] if w in word2idx else word2idx['<unk>'] for w in p] for p in target]

            instance['source_str'] += [source]
            instance['target_str'] += [target]
            instance['source'] += [A]
            instance['source_postag'] += []  # set to be empty
            instance['target'] += [B]

        return instance
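
# A minimal sketch of the XML record shape load_xml() above expects, inferred from
# its findall() calls (a <title>, an <abstract>, and keyphrases in <tag> elements
# one level down). The element names come from the code; the sample record and the
# <document>/<keywords> wrappers are invented for illustration.
def _example_load_xml_format():
    sample = (
        '<document>'
        '<title>A sample title</title>'
        '<abstract>A sample abstract.</abstract>'
        '<keywords><tag>sample phrase</tag><tag>another phrase</tag></keywords>'
        '</document>'
    )
    root = ET.fromstring(sample)
    title = root.findall("title")[0].text
    abstract = root.findall("abstract")[0].text
    phrases = [n.text for n in root.findall("*/tag")]
    return title, abstract, phrases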
class INSPEC(DataLoader):
    def __init__(self, **kwargs):
        super(INSPEC, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/INSPEC'
        # self.textdir = self.datadir + '/all_texts/'
        # self.keyphrasedir = self.datadir + '/gold_standard_keyphrases_2/'
        self.textdir = self.datadir + '/test_texts/'
        self.keyphrasedir = self.datadir + '/gold_standard_test/'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/inspec/'
        self.text_postag_dir = self.postag_datadir + 'text/'
        self.keyphrase_postag_dir = self.postag_datadir + 'keyphrase/'


class NUS(DataLoader):
    def __init__(self, **kwargs):
        super(NUS, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/NUS'
        self.textdir = self.datadir + '/all_texts/'
        self.keyphrasedir = self.datadir + '/gold_standard_keyphrases/'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/nus/'
        self.text_postag_dir = self.postag_datadir + 'text/'
        self.keyphrase_postag_dir = self.postag_datadir + 'keyphrase/'

    def export(self):
        '''
        parse the original dataset into two folders: text and gold_standard_keyphrases
        :return:
        '''
        originaldir = self.datadir + '/original'
        for paper_id in os.listdir(originaldir):
            if os.path.isfile(originaldir + '/' + paper_id):
                continue

            # copy text to all_texts/
            text_file = originaldir + '/' + paper_id + '/' + paper_id + '.txt'
            shutil.copy2(text_file, self.textdir + '/' + paper_id + '.txt')

            # load keyphrases: the author's .kwd file plus one file per reader in KEY/
            keyphrases = set()
            keyphrase_files = [originaldir + '/' + paper_id + '/' + paper_id + '.kwd']
            reader_phrase_dir = originaldir + '/' + paper_id + '/KEY/'
            for key_file in os.listdir(reader_phrase_dir):
                keyphrase_files.append(reader_phrase_dir + key_file)
            for key_file in keyphrase_files:
                with open(key_file, 'r') as f:
                    keyphrases.update(set([l.strip() for l in f.readlines()]))

            # write into gold_standard_keyphrases/: multi-word phrases go to
            # .keyphrases, single words to .keywords (only overwrites existing files)
            if os.path.exists(self.keyphrasedir + paper_id + '.keyphrases'):
                with open(self.keyphrasedir + paper_id + '.keyphrases', 'w') as f:
                    for key in list(keyphrases):
                        if key.find(' ') != -1:
                            f.write(key + '\n')
            # else:
            #     print(self.keyphrasedir + paper_id + '.keyphrases Not Found')

            if os.path.exists(self.keyphrasedir + paper_id + '.keywords'):
                with open(self.keyphrasedir + paper_id + '.keywords', 'w') as f:
                    for key in list(keyphrases):
                        if key.find(' ') == -1:
                            f.write(key + '\n')
            # else:
            #     print(self.keyphrasedir + paper_id + '.keywords Not Found')

    def get_docs(self, only_abstract=True, return_dict=True):
        '''
        :return: a list of dicts instead of Document objects
        '''
        for filename in os.listdir(self.keyphrasedir):
            if not filename.endswith('keyphrases'):
                continue
            doc = Document()
            doc.name = filename[:filename.find('.keyphrases')]

            phrase_set = set()
            if os.path.exists(self.keyphrasedir + doc.name + '.keyphrases'):
                with open(self.keyphrasedir + doc.name + '.keyphrases') as keyphrasefile:
                    phrase_set.update([phrase.strip() for phrase in keyphrasefile.readlines()])
            # else:
            #     print(self.keyphrasedir + doc.name + '.keyphrases Not Found')

            if os.path.exists(self.keyphrasedir + doc.name + '.keywords'):
                with open(self.keyphrasedir + doc.name + '.keywords') as keyphrasefile:
                    phrase_set.update([phrase.strip() for phrase in keyphrasefile.readlines()])

            doc.phrases = list(phrase_set)
            self.doclist.append(doc)

        for d in self.doclist:
            with open(self.textdir + d.name + '.txt', 'r') as f:
                printable = set(string.printable)
                try:
                    lines = f.readlines()
                    lines = [''.join(filter(lambda x: x in printable, l)) for l in lines]

                    # 1st line is the title
                    title = lines[0].encode('ascii', 'ignore').decode('ascii', 'ignore')

                    # find abstract
                    index_abstract = None
                    for id, line in enumerate(lines):
                        if line.lower().strip().endswith('abstract') or line.lower().strip().startswith('abstract'):
                            index_abstract = id
                            break
                    if index_abstract is None:
                        print('abstract not found: %s' % d.name)
                        index_abstract = 1

                    # find introduction: if only_abstract, keep only the text before it
                    # (bug fix: index_introduction was undefined when only_abstract=False)
                    index_introduction = len(lines)
                    if only_abstract:
                        for id, line in enumerate(lines):
                            if line.lower().strip().endswith('introduction'):
                                index_introduction = id
                                break
                        if index_introduction == len(lines):
                            print('Introduction not found: %s' % d.name)

                    # 2nd line is the abstract heading, so the body starts at line 3
                    text = (' '.join(lines[2:index_introduction])).encode('ascii', 'ignore').decode('ascii', 'ignore')
                    # if lines[1].strip().lower() != 'abstract':
                    #     print('Wrong title detected : %s' % (filename))

                    d.title = title
                    d.text = text
                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % (self.textdir + d.name + '.txt'))

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if return_dict:
            return doclist
        else:
            return self.doclist
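
# A minimal usage sketch for the loaders above, assuming the directory layout they
# expect; '/path/to/project' is a placeholder for the project root holding dataset/.
def _example_loader_usage():
    loader = NUS(basedir='/path/to/project')
    docs = loader.get_docs(only_abstract=True, return_dict=True)
    for doc in docs[:3]:
        print(doc['name'], doc['title'], doc['keyword'])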
class SemEval(DataLoader):
    def __init__(self, **kwargs):
        super(SemEval, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/SemEval'
        # self.textdir = self.datadir + '/all_texts/'
        # self.keyphrasedir = self.datadir + '/gold_standard_keyphrases_3/'
        self.textdir = self.datadir + '/test/'
        self.keyphrasedir = self.datadir + '/test_answer/test.combined.stem.final'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/semeval/'
        self.text_postag_dir = self.postag_datadir + 'text/'
        self.keyphrase_postag_dir = self.postag_datadir + 'keyphrase/'

    """
    def get_docs(self, only_abstract=True, returnDict=True):
        '''
        :return: a list of dicts instead of Document objects
            The keyphrases in SemEval are already stemmed
        '''
        if self.keyphrasedir.endswith('test.combined.stem.final'):
            with open(self.keyphrasedir, 'r') as kp:
                lines = kp.readlines()
                for line in lines:
                    d = Document()
                    d.name = line[:line.index(':')].strip()
                    d.phrases = line[line.index(':') + 1:].split(',')
                    self.doclist.append(d)

        for d in self.doclist:
            with open(self.textdir + d.name + '.txt', 'r') as f:
                printable = set(string.printable)
                try:
                    lines = f.readlines()
                    lines = [''.join(filter(lambda x: x in printable, l)) for l in lines]

                    # 1st line is the title
                    title = lines[0].encode('ascii', 'ignore').decode('ascii', 'ignore')

                    # find abstract
                    index_abstract = None
                    for id, line in enumerate(lines):
                        if line.lower().strip().endswith('abstract') or line.lower().strip().startswith('abstract'):
                            index_abstract = id
                            break
                    if index_abstract is None:
                        print('abstract not found: %s' % d.name)
                        index_abstract = 1

                    # find introduction
                    index_introduction = len(lines)
                    if only_abstract:
                        for id, line in enumerate(lines):
                            if line.lower().strip().endswith('introduction'):
                                index_introduction = id
                                break
                        if index_introduction == len(lines):
                            print('Introduction not found: %s' % d.name)

                    # 2nd line is the abstract heading, so the body starts at line 3
                    text = (' '.join(lines[2:index_introduction])).encode('ascii', 'ignore').decode('ascii', 'ignore')

                    d.title = title
                    d.text = text
                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % (self.textdir + d.name + '.txt'))

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if returnDict:
            return doclist
        else:
            return self.doclist
    """


class KRAPIVIN(DataLoader):
    def __init__(self, **kwargs):
        super(KRAPIVIN, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/KRAPIVIN'
        self.textdir = self.datadir + '/all_texts/'
        self.keyphrasedir = self.datadir + '/gold_standard_keyphrases/'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/krapivin/'
        self.text_postag_dir = self.postag_datadir + 'text/'
        self.keyphrase_postag_dir = self.postag_datadir + 'keyphrase/'

    def load_text(self, textdir):
        for filename in os.listdir(textdir):
            with open(textdir + filename) as textfile:
                doc = Document()
                doc.name = filename[:filename.find('.txt')]

                printable = set(string.printable)
                # print((filename))
                try:
                    lines = textfile.readlines()
                    lines = [list(filter(lambda x: x in printable, l)) for l in lines]

                    # the 2nd line is the title and the 4th line is the abstract
                    title = ''.join(lines[1]).encode('ascii', 'ignore').decode('ascii', 'ignore')
                    text = ''.join(lines[3]).encode('ascii', 'ignore').decode('ascii', 'ignore')

                    doc.title = title
                    doc.text = text
                    self.doclist.append(doc)
                except UnicodeDecodeError:
                    print('UnicodeDecodeError detected! %s' % filename)
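
# A minimal sketch of the per-file layout KRAPIVIN.load_text assumes (title on the
# 2nd line, abstract on the 4th). The '--T'/'--A' marker lines shown here are an
# assumption about what the surrounding lines contain; only the line indices come
# from the code above.
def _example_krapivin_layout():
    lines = [
        '--T\n',
        'A sample title\n',
        '--A\n',
        'A sample abstract.\n',
    ]
    title = ''.join(lines[1]).strip()
    text = ''.join(lines[3]).strip()
    return title, text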
class KDD(DataLoader):
    def __init__(self, **kwargs):
        super(KDD, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/KDD'
        self.xmldir = self.datadir + '/acmparsed/'
        self.textdir = self.datadir + '/acmparsed/'
        self.keyphrasedir = self.datadir + '/acmparsed/'


class WWW(DataLoader):
    def __init__(self, **kwargs):
        super(WWW, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/WWW'
        self.xmldir = self.datadir + '/acmparsed/'
        self.textdir = self.datadir + '/acmparsed/'
        self.keyphrasedir = self.datadir + '/acmparsed/'


class UMD(DataLoader):
    def __init__(self, **kwargs):
        super(UMD, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/UMD'
        self.xmldir = self.datadir + '/acmparsed/'
        self.textdir = self.datadir + '/contentsubset/'
        self.keyphrasedir = self.datadir + '/gold/'
class DUC(DataLoader):
    def __init__(self, **kwargs):
        super(DUC, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/DUC/'
        self.textdir = self.datadir + '/all_texts/'
        self.keyphrasedir = self.datadir + '/gold_standard_keyphrases/'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/duc/'
        self.text_postag_dir = self.postag_datadir + 'text/'
        self.keyphrase_postag_dir = self.postag_datadir + 'keyphrase/'

    def export_text_phrase(self):
        all_phrase_file = self.basedir + '/dataset/keyphrase/testing-data/DUC/DUC2001LabeledKeyphrase.txt'
        duc_set = set()
        with open(all_phrase_file, 'r') as all_pf:
            for line in all_pf.readlines():
                line = line.strip()
                duc_id = line[:line.find('@')].strip()
                phrases = filter(lambda x: len(x.strip()) > 0, line[line.find('@') + 1:].split(';'))
                # print(duc_id)
                # print(phrases)
                with open(self.keyphrasedir + duc_id + '.keyphrases', 'w') as pf:
                    pf.write('\n'.join(phrases))
                duc_set.add(duc_id)

        # keep only the files that have labeled keyphrases (deleting the rest is commented out below)
        count = 0
        for text_file in os.listdir(self.datadir + '/original/'):
            if text_file in duc_set:
                if text_file.startswith('AP'):
                    print('*' * 50)
                    print(text_file)

                count += 1
                with open(self.datadir + '/original/' + text_file, 'r') as tf:
                    source = ' '.join([l.strip() for l in tf.readlines()])

                    # the title may appear under several tag names depending on the source
                    m = re.search(r'<HEAD>(.*?)</HEAD>', source, flags=re.IGNORECASE)
                    if m is None:
                        m = re.search(r'<HEADLINE>(.*?)</HEADLINE>', source, flags=re.IGNORECASE)
                    if m is None:
                        m = re.search(r'<HL>(.*?)</HL>', source, flags=re.IGNORECASE)
                    if m is None:
                        m = re.search(r'<H3>(.*?)</H3>', source, flags=re.IGNORECASE)
                    title = m.group(1)
                    title = re.sub('<.*?>', '', title).strip()
                    if text_file.startswith('FT') and title.find('/') > 0:
                        title = title[title.find('/') + 1:].strip()

                    m = re.search(r'<TEXT>(.*?)</TEXT>', source, flags=re.IGNORECASE)
                    text = m.group(1)
                    text = re.sub('<.*?>', '', text).strip()

                    if text_file.startswith('AP'):
                        print(title)
                        print(text)

                    with open(self.textdir + text_file + '.txt', 'w') as target_tf:
                        target_tf.write(title + '\n' + text)
            # else:
            #     print('Delete!')
            #     os.remove(self.textdir + text_file)
        print(count)
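
# A minimal sketch of the SGML-style title/body extraction used by
# DUC.export_text_phrase above; the sample document string is invented.
def _example_duc_extraction():
    source = '<DOC><HEAD>A sample headline</HEAD><TEXT>Body of the <P>article</P>.</TEXT></DOC>'
    m = re.search(r'<HEAD>(.*?)</HEAD>', source, flags=re.IGNORECASE)
    title = re.sub('<.*?>', '', m.group(1)).strip()
    m = re.search(r'<TEXT>(.*?)</TEXT>', source, flags=re.IGNORECASE)
    text = re.sub('<.*?>', '', m.group(1)).strip()
    return title, text  # ('A sample headline', 'Body of the article.')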
class KP20k(DataLoader):
    def __init__(self, **kwargs):
        super(KP20k, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/baseline-data/kp20k/'
        self.textdir = self.datadir + '/text/'
        self.keyphrasedir = self.datadir + '/keyphrase/'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/kp20k/'
        self.text_postag_dir = self.postag_datadir + 'text/'
        self.keyphrase_postag_dir = self.postag_datadir + 'keyphrase/'

    def get_docs(self, return_dict=True):
        '''
        :return: a list of dicts instead of Document objects
        '''
        for fname in os.listdir(self.textdir):
            d = Document()
            d.name = fname

            # text file: 1st line is the title, the rest is the abstract;
            # keyphrase file: one phrase per line
            with open(self.textdir + fname, 'r') as textfile:
                lines = textfile.readlines()
                d.title = lines[0].strip()
                d.text = ' '.join(lines[1:])
            with open(self.keyphrasedir + fname, 'r') as phrasefile:
                d.phrases = [l.strip() for l in phrasefile.readlines()]
            self.doclist.append(d)

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if return_dict:
            return doclist
        else:
            return self.doclist


class KP2k_NEW(DataLoader):
    '''
    18,716 docs after filtering (no keyword etc.)
    '''
    def __init__(self, **kwargs):
        super(KP2k_NEW, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/new_kp2k_for_theano_model/'
        self.textdir = self.datadir + '/text/'
        self.keyphrasedir = self.datadir + '/keyphrase/'

    def get_docs(self, return_dict=True):
        '''
        :return: a list of dicts instead of Document objects
        '''
        for fname in os.listdir(self.textdir):
            d = Document()
            d.name = fname

            with open(self.textdir + fname, 'r') as textfile:
                lines = textfile.readlines()
                d.title = lines[0].strip()
                d.text = ' '.join(lines[1:])
            with open(self.keyphrasedir + fname, 'r') as phrasefile:
                d.phrases = [l.strip() for l in phrasefile.readlines()]
            self.doclist.append(d)

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if return_dict:
            return doclist
        else:
            return self.doclist
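
# A minimal sketch of the parallel text/keyphrase file pair KP20k.get_docs reads;
# the contents are invented, only the layout (title line plus abstract lines, one
# phrase per line) comes from the code above.
def _example_kp20k_file_pair():
    text_lines = ['A sample title\n', 'First abstract line.\n', 'Second abstract line.\n']
    phrase_lines = ['sample phrase\n', 'another phrase\n']

    title = text_lines[0].strip()
    abstract = ' '.join(text_lines[1:])
    phrases = [l.strip() for l in phrase_lines]
    return title, abstract, phrases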
class IRBooks(DataLoader):
    def __init__(self, **kwargs):
        super(IRBooks, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/IRbooks/'
        self.textdir = self.datadir + '/ir_textbook.txt'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/IRbooks/'
        self.text_postag_dir = self.postag_datadir + 'text/'

    def get_docs(self, return_dict=True):
        '''
        :return: a list of dicts instead of Document objects
        '''
        with open(self.textdir, 'r') as textfile:
            # one document per line: "<id>\t<title> <text>"
            for line in textfile.readlines():
                d = Document()
                text_id = line[:line.index('\t')]
                text_content = line[line.index('\t'):]
                d.name = text_id.strip()
                d.title = text_content[:text_content.index(' ')].strip()
                d.text = text_content[text_content.index(' '):].strip()
                d.phrases = []

                if len(d.text.split()) > 2000:
                    print('[length=%d]%s - %s' % (len(d.text.split()), d.name, d.title))

                self.doclist.append(d)
                # with open(self.keyphrasedir + fname, 'r') as phrasefile:
                #     d.phrases = [l.strip() for l in phrasefile.readlines()]

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if return_dict:
            return doclist
        else:
            return self.doclist


class Quora(DataLoader):
    def __init__(self, **kwargs):
        super(Quora, self).__init__(**kwargs)
        self.datadir = self.basedir + '/dataset/keyphrase/testing-data/Quora/'
        self.textdir = self.datadir + '/'

        self.postag_datadir = self.basedir + '/dataset/keyphrase/baseline-data/Quora/'
        self.text_postag_dir = self.postag_datadir + 'text/'

    def get_docs(self, return_dict=True):
        '''
        :return: a list of dicts instead of Document objects
        '''
        for textfile_name in os.listdir(self.textdir):
            with open(self.textdir + textfile_name, 'r') as textfile:
                d = Document()
                d.name = textfile_name[:textfile_name.find('.')].strip()
                d.title = ''
                d.text = ' '.join([l.strip() for l in textfile.readlines()])
                d.phrases = []
                self.doclist.append(d)

        doclist = []
        for d in self.doclist:
            newd = {}
            newd['name'] = d.name
            newd['abstract'] = re.sub('[\r\n]', ' ', d.text).strip()
            newd['title'] = re.sub('[\r\n]', ' ', d.title).strip()
            newd['keyword'] = ';'.join(d.phrases)
            doclist.append(newd)

        if return_dict:
            return doclist
        else:
            return self.doclist


# aliases, so loaders can be looked up by their lower-case names
inspec = INSPEC
nus = NUS
semeval = SemEval
krapivin = KRAPIVIN
kdd = KDD
www = WWW
umd = UMD
kp20k = KP20k
kp2k_new = KP2k_NEW
duc = DUC
irbooks = IRBooks
quora = Quora  # for Runhua's data


def testing_data_loader(identifier, kwargs=None):
    '''
    load a testing data loader dynamically by name
    :return:
    '''
    data_loader = get_from_module(identifier, globals(), 'data_loader', instantiate=True, kwargs=kwargs)
    return data_loader
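
# A minimal usage sketch for the dynamic lookup above; '/path/to/project' is a
# placeholder for the project root.
def _example_testing_data_loader():
    loader = testing_data_loader('inspec', kwargs=dict(basedir='/path/to/project'))
    return loader.get_docs()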
def load_additional_testing_data(testing_names, idx2word, word2idx, config, postagging=True, process_type=1):
    test_sets = {}

    # rule out the ones that appear in the training data
    for dataset_name in testing_names:
        pkl_path = config['path'] + '/dataset/keyphrase/' + config['data_process_name'] + dataset_name + '.testing.pkl'
        if os.path.exists(pkl_path):
            test_set = deserialize_from_file(pkl_path)
            print('Loading testing dataset %s from %s' % (dataset_name, pkl_path))
        else:
            print('Creating testing dataset %s: %s' % (dataset_name, pkl_path))
            dataloader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
            records = dataloader.get_docs()
            records, pairs, _ = utils.load_pairs(records, process_type=process_type, do_filter=False)
            test_set = utils.build_data(pairs, idx2word, word2idx)
            test_set['record'] = records

            if postagging:
                tagged_sources = get_postag_with_record(records, pairs)
                test_set['tagged_source'] = [[t[1] for t in s] for s in tagged_sources]

                if hasattr(dataloader, 'text_postag_dir') and dataloader.__getattribute__('text_postag_dir') is not None:
                    print('Exporting postagged data to %s' % (dataloader.text_postag_dir))
                    if not os.path.exists(dataloader.text_postag_dir):
                        os.makedirs(dataloader.text_postag_dir)
                    for r_, p_, s_ in zip(records, pairs, tagged_sources):
                        with open(dataloader.text_postag_dir + '/' + r_['name'] + '.txt', 'w') as f:
                            output_str = ' '.join([w + '_' + t for w, t in s_])
                            f.write(output_str)
                else:
                    print('text_postag_dir not found, no export of postagged data')

            serialize_to_file(test_set, pkl_path)

        test_sets[dataset_name] = test_set

    return test_sets
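
# A minimal sketch of calling load_additional_testing_data. The config keys shown
# ('path', 'data_process_name') are the ones the function reads; the values and
# the vocabulary dicts are placeholders.
def _example_load_additional_testing_data(idx2word, word2idx):
    config = {'path': '/path/to/project', 'data_process_name': 'experiment.'}
    test_sets = load_additional_testing_data(['inspec', 'nus'], idx2word, word2idx, config, postagging=False)
    return test_sets['inspec']['source'][0]  # word indices of the first document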
from nltk.stem.porter import PorterStemmer


def check_data():
    from keyphrase.config import setup_keyphrase_stable
    config = setup_keyphrase_stable()
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    for dataset_name in config['testing_datasets']:
        print('*' * 50)
        print(dataset_name)
        number_groundtruth = 0
        number_present_groundtruth = 0

        loader = testing_data_loader(dataset_name, kwargs=dict(basedir=config['path']))
        if dataset_name == 'nus':
            docs = loader.get_docs(only_abstract=True, return_dict=False)
        else:
            docs = loader.get_docs(return_dict=False)

        stemmer = PorterStemmer()
        for id, doc in enumerate(docs):
            text_tokens = dataset_utils.get_tokens(doc.title.strip() + ' ' + doc.text.strip())
            # if len(text_tokens) > 1500:
            #     text_tokens = text_tokens[:1500]

            print('[%d] length = %d' % (id, len(doc.text)))

            stemmed_input = [stemmer.stem(t).strip().lower() for t in text_tokens]

            phrase_str = ';'.join([l.strip() for l in doc.phrases])
            phrases = dataset_utils.process_keyphrase(phrase_str)
            targets = [[stemmer.stem(w).strip().lower() for w in target] for target in phrases]

            present_targets = []
            for target in targets:
                # whether to filter groundtruth phrases;
                # if config['target_filter'] == None, do nothing
                keep = True
                match = None
                for i in range(len(stemmed_input) - len(target) + 1):
                    match = None
                    for j in range(len(target)):
                        if target[j] != stemmed_input[i + j]:
                            match = False
                            break
                    if j == len(target) - 1 and match is None:
                        match = True
                        break

                if match == True:
                    # the phrase appears in the text: keep it under 'appear-only'
                    if config['target_filter'] == 'appear-only':
                        keep = keep and True
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and False
                elif match == False:
                    # the phrase does not appear: discard it under 'appear-only'
                    if config['target_filter'] == 'appear-only':
                        keep = keep and False
                    # keep it under 'non-appear-only'
                    elif config['target_filter'] == 'non-appear-only':
                        keep = keep and True

                if not keep:
                    continue

                present_targets.append(target)

            number_groundtruth += len(targets)
            number_present_groundtruth += len(present_targets)

        print('number_groundtruth=' + str(number_groundtruth))
        print('number_present_groundtruth=' + str(number_present_groundtruth))

    '''
    test_set, doclist = loader(idx2word, word2idx, type=0)
    test_data_plain = list(zip(*(test_set['source'], test_set['target'], doclist)))

    for idx in range(len(test_data_plain)):  # len(test_data_plain)
        test_s_o, test_t_o, doc = test_data_plain[idx]
        target = doc.phrases

        if len(doc.text) < 50000:
            print('%d - %d : %d \n\tname=%s, \n\ttitle=%s, \n\ttext=%s, \n\tlen(keyphrase)=%d' % (idx, len(test_s_o), len(test_t_o), doc.name, doc.title, doc.text, len(''.join(target))))
            print(doc)

        if (len(target) != 0 and len(''.join(target)) / len(target) < 3):
            print('!' * 100)
            print('Error found')
            print('%d - %d : %d name=%s, title=%d, text=%d, len(keyphrase)=%d' % (idx, len(test_s_o), len(test_t_o), doc.name, len(doc.title), len(doc.text), len(''.join(target))))
    '''
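
# A standalone sketch of the stemmed-token matching used inside check_data above:
# a phrase counts as "present" when its token sequence occurs contiguously in the
# source tokens. The toy tokens are invented; the logic mirrors the loop above.
def _example_phrase_present():
    stemmed_input = ['we', 'studi', 'keyphras', 'gener', 'model']
    target = ['keyphras', 'gener']

    present = False
    for i in range(len(stemmed_input) - len(target) + 1):
        if stemmed_input[i:i + len(target)] == target:
            present = True
            break
    return present  # True: the phrase appears at position 2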
def add_padding(data):
    shapes = [np.asarray(sample).shape for sample in data]
    lengths = [shape[0] for shape in shapes]

    # make sure there's at least one zero at the end to indicate the end of sentence <eol>
    max_sequence_length = max(lengths) + 1
    rest_shape = shapes[0][1:]

    padded_batch = np.zeros((len(data), max_sequence_length) + rest_shape, dtype='int32')
    for i, sample in enumerate(data):
        padded_batch[i, :len(sample)] = sample

    return padded_batch


def split_into_multiple_and_padding(data_s_o, data_t_o):
    # replicate each source once per target phrase, so source/target align one-to-one
    data_s = []
    data_t = []
    for s, t in zip(data_s_o, data_t_o):
        for p in t:
            data_s += [s]
            data_t += [p]

    data_s = add_padding(data_s)
    data_t = add_padding(data_t)
    return data_s, data_t


def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)

    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))

        tagged_source.append(text)

    return tagged_source
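
# A minimal sketch of the padding helpers above on toy index sequences.
def _example_padding():
    sources = [[4, 5, 6], [7, 8]]
    targets = [[[1, 2]], [[3], [9, 10]]]  # one list of phrases per source

    padded = add_padding(sources)
    # padded.shape == (2, 4): longest source (3) plus one trailing zero for <eol>
    data_s, data_t = split_into_multiple_and_padding(sources, targets)
    # data_s has 3 rows: source 0 once, source 1 twice (one per phrase)
    return padded.shape, data_s.shape, data_t.shape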
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)

    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # predict on testing data
    for idx in range(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source


def check_postag(config):
    train_set, validation_set, test_set, idx2word, word2idx = deserialize_from_file(config['dataset'])

    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)

    for dataset_name in config['testing_datasets']:
        # override the original test_set
        # test_set = load_testing_data(dataset_name, kwargs=dict(basedir=config['path']))(idx2word, word2idx, config['preprocess_type'])
        test_sets = load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)
        test_set = test_sets[dataset_name]

        # print(dataset_name)
        # print('Avg length=%d, Max length=%d' % (np.average([len(s) for s in test_set['source']]), np.max([len(s) for s in test_set['source']])))
        test_data_plain = list(zip(*(test_set['source'], test_set['target'])))
        test_size = len(test_data_plain)

        # alternatively to setting the CLASSPATH, add the jar and model via their paths:
        jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
        # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
        model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'
        pos_tagger = StanfordPOSTagger(model, jar)

        for idx in range(len(test_data_plain)):  # len(test_data_plain)
            test_s_o, test_t_o = test_data_plain[idx]
            source = keyphrase_utils.cut_zero(test_s_o, idx2word)
            print(source)

            # add other jars from the Stanford directory
            stanford_dir = jar.rpartition('/')[0]
            stanford_jars = find_jars_within_path(stanford_dir)
            pos_tagger._stanford_jar = ':'.join(stanford_jars)

            text = pos_tagger.tag(source)
            print(text)


if __name__ == '__main__':
    # config = setup_keyphrase_all()
    #
    # loader = testing_data_loader('duc', kwargs=dict(basedir=config['path']))
    # loader.export_text_phrase()
    # docs = loader.get_docs()

    check_data()

    # check_postag(config)
    # train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(config['dataset'])
    # load_additional_testing_data(config['testing_datasets'], idx2word, word2idx, config)

    # if len(test_t_o) < 3:
    #     # doc.text = re.sub(r'[\r\n\t]', ' ', doc.text)
    #     print('name:\t%s' % doc.name)
    #     print('text:\t%s' % doc.text)
    #     print('phrase:\t%s' % str(doc.phrases))

    # if idx % 100 == 0:
    #     print(test_data_plain[idx])