# Python source code for POS tagging

# Copyright (c) 2017-present, Swisscom (Schweiz) AG.
# All rights reserved.
#
#Authors: Kamil Bennani-Smires, Yann Savary

import argparse
import os
import re
import warnings
from abc import ABC, abstractmethod

# NLTK imports
import nltk
from nltk.tag.util import tuple2str
from nltk.parse import CoreNLPParser

import swisscom_ai.research_keyphrase.preprocessing.custom_stanford as custom_stanford
from swisscom_ai.research_keyphrase.util.fileIO import read_file, write_string

# If you want to use spacy , install it and uncomment the following import
# import spacy


class PosTagging(ABC):
    """
    Abstract base class of the POS taggers.

    Subclasses implement pos_tag_raw_text; the file and corpus helpers below are built on top of it.
    """

    @abstractmethod
    def pos_tag_raw_text(self, text, as_tuple_list=True):
        """
        Tokenize and POS tag a string.

        Sentence level is kept in the result :
        Either we have a list of list (for each sentence a list of tuple (word, tag))
        Or a single string in which sentences are separated by [ENDSENT],
        when requesting a string by putting as_tuple_list = False

        Illustrative example (the exact tokens, tags and token/tag separator depend on the concrete tagger) :

        pt.pos_tag_raw_text('Write your python code in a file. Thank you.')
        [
            [('Write', 'VB'), ('your', 'PRP$'), ('python', 'NN'),
             ('code', 'NN'), ('in', 'IN'), ('a', 'DT'), ('file', 'NN'), ('.', '.')
            ],
            [('Thank', 'VB'), ('you', 'PRP'), ('.', '.')]
        ]

        pt.pos_tag_raw_text('Write your python code in a file. Thank you.', as_tuple_list=False)
        'Write/VB your/PRP$ python/NN code/NN in/IN a/DT file/NN ./.[ENDSENT]Thank/VB you/PRP ./.'

        :param text: String to POS tag
        :param as_tuple_list: Return result as list of list (word, Pos_tag)
        :return: POS Tagged string or Tuple list
        """

        pass

    def pos_tag_file(self, input_path, output_path=None):
        """
        POS Tag a file.

        Either we return a list of list (for each sentence a list of tuple (word, tag))
        Or we write a file with the POS tagged text.
        The written text is exactly the string returned by
        self.pos_tag_raw_text with as_tuple_list = False.

        :param input_path: path of the source file
        :param output_path: If set write POS tagged text with separator (self.pos_tag_raw_text with as_tuple_list False)
                            If not set, return list of list of tuple (self.pos_tag_raw_text with as_tuple_list = True)

        :return: resulting POS tagged text as a list of list of tuple or nothing if output path is set.
        """

        original_text = read_file(input_path)

        if output_path is None:
            return self.pos_tag_raw_text(original_text, as_tuple_list=True)

        # Write to the output the POS-Tagged text.
        tagged_text = self.pos_tag_raw_text(original_text, as_tuple_list=False)
        write_string(tagged_text, output_path)
        return None

    def pos_tag_and_write_corpora(self, list_of_path, suffix):
        """
        POS tag a list of files.

        It writes the resulting file in the same directory with the same name + suffix
        e.g
        pos_tag_and_write_corpora(['/Users/user1/text1', '/Users/user1/direct/text2'] , suffix = _POS)
        will create
        /Users/user1/text1_POS
        /Users/user1/direct/text2_POS

        A path that is not an existing file is skipped and reported with a warning.

        :param list_of_path: list containing the path (as string) of each file to POS Tag
        :param suffix: suffix to append at the end of the original filename for the resulting pos_tagged file.

        """
        for path in list_of_path:
            if os.path.isfile(path):
                self.pos_tag_file(path, path + suffix)
            else:
                # Report the missing *input* file, not the output file that would have been written.
                warnings.warn('file ' + path + ' does not exist')


class PosTaggingStanford(PosTagging):
    """
    Concrete class of PosTagging using the Stanford POS tagger wrappers of custom_stanford.

    sent_tokenizer contains the nltk punkt sentence tokenizer loaded for the requested language.
    tagger contains the language specific tagger object built by custom_stanford.
    NOTE(review): per the original authors' note the tagger also triggers word tokenization
    (see the -tokenize option in Java) -- confirm against custom_stanford.
    """

    def __init__(self, jar_path, model_path_directory, separator='|', lang='en'):
        """
        :param jar_path: path of the jar for StanfordPOSTagger (override the configuration file)
        :param model_path_directory: path of the model directory
        :param separator: Separator between a token and a tag in the resulting string (default : |)
        :param lang: language of the texts to tag, one of 'en', 'de' or 'fr' (default : en)

        :raises ValueError: if lang is not one of the handled languages
        """

        # Only the selected branch touches custom_stanford, so an unused language costs nothing.
        if lang == 'en':
            model_file = 'english-left3words-distsim.tagger'
            punkt_resource = 'tokenizers/punkt/english.pickle'
            tagger_class = custom_stanford.EnglishStanfordPOSTagger
        elif lang == 'de':
            model_file = 'german-hgc.tagger'
            punkt_resource = 'tokenizers/punkt/german.pickle'
            tagger_class = custom_stanford.GermanStanfordPOSTagger
        elif lang == 'fr':
            model_file = 'french.tagger'
            punkt_resource = 'tokenizers/punkt/french.pickle'
            tagger_class = custom_stanford.FrenchStanfordPOSTagger
        else:
            raise ValueError(f'Language {lang} not handled')

        model_path = os.path.join(model_path_directory, model_file)
        self.sent_tokenizer = nltk.data.load(punkt_resource)
        self.tagger = tagger_class(model_path, jar_path, java_options='-mx2g')
        self.separator = separator

    def pos_tag_raw_text(self, text, as_tuple_list=True):
        """
        Implementation of abstract method from PosTagging
        @see PosTagging

        :param text: String to POS tag
        :param as_tuple_list: Return result as list of list (word, Pos_tag)
        :return: the result of the tagger as is, or the [ENDSENT] separated string built from it
        """
        # The sentence strings are handed over as one list wrapped in an outer list.
        # NOTE(review): this presumably relies on the custom_stanford taggers tokenizing
        # each sentence string themselves -- confirm before changing this call.
        tagged_text = self.tagger.tag_sents([self.sent_tokenizer.sentences_from_text(text)])

        if as_tuple_list:
            return tagged_text
        return '[ENDSENT]'.join(
            [' '.join([tuple2str(tagged_token, self.separator) for tagged_token in sent]) for sent in tagged_text])


class PosTaggingSpacy(PosTagging):
    """
        Concrete class of PosTagging using a spaCy pipeline.

        The pipeline is supplied by the caller: this module does not import spaCy itself.
    """

    def __init__(self, nlp=None, separator='|', lang='en'):
        """
        :param nlp: loaded spaCy pipeline used to tokenize, split sentences and tag
        :param separator: Separator between a token and a tag in the resulting string (default : |)
        :param lang: language requested by the caller; only reported in the warning emitted
                     when no pipeline is given
        """
        self.nlp = nlp
        self.separator = separator
        if self.nlp is None:
            # Automatic loading is disabled because spaCy is an optional dependency that is not imported here.
            # To load a model in this constructor, uncomment the module-level `import spacy` and assign
            # self.nlp = spacy.load(lang, entity=False) in place of this warning.
            warnings.warn(f'No spaCy pipeline provided for lang {lang} and automatic loading is disabled; '
                          'pass a loaded pipeline as `nlp` before tagging')

    def pos_tag_raw_text(self, text, as_tuple_list=True):
        """
            Implementation of abstract method from PosTagging
            @see PosTagging

            :raises RuntimeError: if no spaCy pipeline is available on this instance
        """
        if self.nlp is None:
            # Fail with an explicit message instead of an opaque error on the call below.
            raise RuntimeError('PosTaggingSpacy has no spaCy pipeline: pass a loaded pipeline as `nlp`')

        # This step is not necessary in the stanford tokenizer.
        # This is used to avoid such tags :  ('      ', 'SP')
        text = re.sub(r'[ ]+', ' ', text).strip()  # Convert multiple whitespaces into one

        doc = self.nlp(text)
        if as_tuple_list:
            return [[(token.text, token.tag_) for token in sent] for sent in doc.sents]
        return '[ENDSENT]'.join(' '.join(self.separator.join([token.text, token.tag_]) for token in sent) for sent in doc.sents)
    

class PosTaggingCoreNLP(PosTagging):
    """
    Concrete class of PosTagging that delegates the work to a CoreNLP server.
    Handy when several documents are processed, since the model is not reloaded for each of them.
    """

    def __init__(self, host='localhost', port=9000, separator='|'):
        """
        :param host: host name of the CoreNLP server
        :param port: port of the CoreNLP server
        :param separator: Separator between a token and a tag in the resulting string (default : |)
        """
        server_url = f'http://{host}:{port}'
        self.parser = CoreNLPParser(url=server_url)
        self.separator = separator

    def pos_tag_raw_text(self, text, as_tuple_list=True):
        """
        Implementation of abstract method from PosTagging
        @see PosTagging
        """
        # nltk.parse.corenlp offers no call doing sentence splitting and POS tagging at once:
        # raw_tag_sents expects sentences that are already split. The server is therefore queried
        # directly, in the spirit of raw_tag_sents, and the sentence structure of the answer is kept.
        annotation = self.parser.api_call(text, properties={'annotators':'tokenize,ssplit,pos'})

        sentences = []
        for sentence in annotation['sentences']:
            sentences.append([(tok['word'], tok['pos']) for tok in sentence['tokens']])

        if as_tuple_list:
            return sentences

        rendered = (' '.join(tuple2str(pair, self.separator) for pair in sentence) for sentence in sentences)
        return '[ENDSENT]'.join(rendered)
        



if __name__ == '__main__':
    # Command line entry point: POS tag every file listed in a text file, one path per row.

    # Suffix appended to the name of each tagged file, keyed by the tagger chosen on the command line.
    suffix_by_tagger = {'stanford': 'STANFORD', 'spacy': 'SPACY', 'corenlp': 'CoreNLP'}

    parser = argparse.ArgumentParser(description='Write POS tagged files, the resulting file will be written'
                                                 ' at the same location with the tagger name'
                                                 ' (STANFORD, SPACY or CoreNLP) appended at the end of the filename')

    # `choices` rejects an unknown tagger up front instead of failing later on an undefined variable.
    parser.add_argument('tagger', choices=sorted(suffix_by_tagger),
                        help='which pos tagger to use [stanford, spacy, corenlp]')
    parser.add_argument('listing_file_path', help='path to a text file '
                                                  'containing in each row a path to a file to POS tag')
    parser.add_argument('--jar_path', default=None,
                        help='path of the Stanford POS tagger jar (required by the stanford tagger)')
    parser.add_argument('--model_path_directory', default=None,
                        help='path of the Stanford model directory (required by the stanford tagger)')
    args = parser.parse_args()

    if args.tagger == 'stanford':
        # PosTaggingStanford cannot be built without these two locations.
        if args.jar_path is None or args.model_path_directory is None:
            parser.error('the stanford tagger requires --jar_path and --model_path_directory')
        pt = PosTaggingStanford(args.jar_path, args.model_path_directory)
    elif args.tagger == 'spacy':
        # NOTE: PosTaggingSpacy needs a spaCy pipeline; this only works once model loading is enabled in that class.
        pt = PosTaggingSpacy()
    else:
        pt = PosTaggingCoreNLP()
    suffix = suffix_by_tagger[args.tagger]

    # Blank rows of the listing file are not paths: ignore them.
    list_of_path = [row for row in read_file(args.listing_file_path).splitlines() if row.strip()]
    print('POS Tagging and writing ', len(list_of_path), 'files')
    pt.pos_tag_and_write_corpora(list_of_path, suffix)