Python nltk.stem.lancaster.LancasterStemmer() Examples

The following are 7 code examples of nltk.stem.lancaster.LancasterStemmer(), drawn from open-source projects. Each example links to its original project and source file. You may also want to check out all available functions/classes of the module nltk.stem.lancaster, or try the search function.
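Before the project excerpts, here is a minimal, self-contained sketch of typical LancasterStemmer usage; the sample words are illustrative only, and the exact stems depend on the Lancaster rule set:

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

# The Lancaster (Paice/Husk) stemmer is aggressive and often trims words
# further than the Porter stemmer would.
for word in ["running", "maximum", "presumably", "friendship"]:
    print(word, "->", stemmer.stem(word))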
Example #1
Source File: Data.py    From NLU with MIT License
def __init__(self):

        ###############################################################
        #
        # Sets up all default requirements and placeholders 
        # needed for the NLU engine to run. 
        #
        # - Helpers: Useful global functions
        # - Logging: Logging class
        # - LancasterStemmer: Word stemmer
        #
        ###############################################################
        
        self.ignore  = [',','.','!','?']
        
        self.Helpers = Helpers()
        self._confs  = self.Helpers.loadConfigs()
        self.LogFile = self.Helpers.setLogFile(self._confs["aiCore"]["Logs"]+"JumpWay/")
        
        self.LancasterStemmer = LancasterStemmer() 
Example #2
Source File: Users.py    From NLU with MIT License
def __init__(self, Logging, LogFile):
        
        self.LancasterStemmer = LancasterStemmer()

        self.Logging          = Logging
        self.LogFile          = LogFile
        
        self.ignore  = [
            '?',
            '!'
        ]
        
        self.Logging.logMessage(
            self.LogFile,
            "Data",
            "INFO",
            "Data Helper Ready") 
Example #3
Source File: matcher.py    From text-matcher with GNU General Public License v3.0
def getTokens(self, removeStopwords=True):
        """ Tokenizes the text, breaking it up into words, removing punctuation. """
        tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*") # A custom regex tokenizer; a raw string avoids invalid-escape warnings for \w.
        spans = list(tokenizer.span_tokenize(self.text))
        # Record the end offset of the last token, i.e. the effective length of the text
        self.length = spans[-1][-1]
        tokens = tokenizer.tokenize(self.text)
        tokens = [ token.lower() for token in tokens ] # make them lowercase
        stemmer = LancasterStemmer()
        tokens = [ stemmer.stem(token) for token in tokens ]
        if not removeStopwords:
            self.spans = spans
            return tokens
        tokenSpans = list(zip(tokens, spans)) # zip it up
        stopwords = nltk.corpus.stopwords.words('english') # get stopwords
        tokenSpans = [ token for token in tokenSpans if token[0] not in stopwords ] # remove stopwords from zip
        self.spans = [ x[1] for x in tokenSpans ] # unzip; get spans
        return [ x[0] for x in tokenSpans ] # unzip; get tokens 
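The same pipeline (tokenize, lowercase, stem, then drop stopwords while keeping character spans aligned) can be sketched as a standalone function. This is illustrative: the function name stemmed_tokens is hypothetical, and it assumes the NLTK stopwords corpus has been downloaded:

import nltk
from nltk.stem.lancaster import LancasterStemmer

def stemmed_tokens(text, remove_stopwords=True):
    # Same pattern as above: a word starts with a letter and may contain
    # an apostrophe (e.g. "don't").
    tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")
    spans = list(tokenizer.span_tokenize(text))
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(t.lower()) for t in tokenizer.tokenize(text)]
    if not remove_stopwords:
        return tokens, spans
    # Note: like the original, this filters the *stemmed* token against
    # the unstemmed stopword list, so stems that no longer match a
    # stopword ("having" -> "hav") slip through.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    kept = [(t, s) for t, s in zip(tokens, spans) if t not in stopwords]
    return [t for t, _ in kept], [s for _, s in kept]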
Example #4
Source File: Data.py    From NLU with MIT License
def extract(self, data=None, splitIt=False):

        ###############################################################
        #
        # Extracts words from sentences, skipping any token that 
        # exactly matches an entry in the ignore list above
        # 
        # https://www.nltk.org/_modules/nltk/stem/lancaster.html
        # http://insightsbot.com/blog/R8fu5/bag-of-words-algorithm-in-python-introduction
        #
        ###############################################################
        
        return [self.LancasterStemmer.stem(word) for word in (data.split() if splitIt else data) if word not in self.ignore] 
Example #5
Source File: Users.py    From NLU with MIT License
def extract(self, data=None, lowerIt=True, splitIt=False, ignoreWords=False):
        
        words = data.split() if splitIt else data
        if ignoreWords:
            words = [word for word in words if word not in self.ignore]
        return [self.LancasterStemmer.stem(word.lower() if lowerIt else word) for word in words] 
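A hedged usage illustration for this extract helper (the users instance is hypothetical, and the exact stems depend on the Lancaster rules):

# Illustrative only: assumes `users` is an instance of the Users class above.
tokens = users.extract("Hello , how are you ?", lowerIt=True,
                       splitIt=True, ignoreWords=True)
print(tokens)
# Design note: self.ignore is matched against whole tokens, so '?' is
# dropped only when it is split off as its own token; a token like
# "you?" would pass through and be stemmed with the '?' attached.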
Example #6
Source File: Mitie.py    From NLU with MIT License
def __init__(self):

        ###############################################################
        #
        # Sets up all default requirements
        #
        # - Helpers: Useful global functions
        # - LancasterStemmer: Word stemmer
        #
        ###############################################################
        
        self.Helpers = Helpers()
        self._confs  = self.Helpers.loadConfigs()

        self.stemmer = LancasterStemmer() 
Example #7
Source File: adversarial_squad.py    From adversarial-squad with MIT License
def get_vocabularies(dataset, vocab_file, nearby_file):
  """Create map from example ID to (basic_words, nearby_words."""
  with open(vocab_file) as f:
    basic_vocab = [line.strip() for line in f]
  with open(nearby_file) as f:
    nearby_words = json.load(f)
  stemmer = LancasterStemmer()
  vocabs = {}
  for a in dataset['data']:
    for p in a['paragraphs']:
      for q in p['qas']:
        q_words = [w.lower() for w in word_tokenize(q['question'])]
        if OPTS.mode == 'basic':
          vocabs[q['id']] = (basic_vocab, [])
        elif OPTS.mode == 'add-question-words':
          vocabs[q['id']] = (basic_vocab, q_words)
        elif OPTS.mode.endswith('-nearby'):
          q_stems = [stemmer.stem(qw) for qw in q_words]
          cur_vocab = [w for w in basic_vocab if w not in q_stems]
          cur_nearby = []
          for q_word, q_stem in zip(q_words, q_stems):
            if q_word in nearby_words:
              qw_nearby = []
              for nearby_word in nearby_words[q_word]:
                if len(qw_nearby) == OPTS.num_nearby: break
                if nearby_word['word'] in PUNCTUATION: continue
                nearby_stem = stemmer.stem(nearby_word['word'])
                if nearby_stem != q_stem:
                  qw_nearby.append(nearby_word['word'])
              cur_nearby.extend(qw_nearby)
          vocabs[q['id']] = (cur_vocab, cur_nearby)
  return vocabs
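The stem comparison is the key design choice here: a nearby word is kept only if its Lancaster stem differs from the question word's stem, so the adversarial vocabulary is not padded with trivial morphological variants of words already in the question. A minimal sketch of that filter in isolation (the function name and word lists are illustrative):

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()

def filter_nearby(q_word, candidates, limit=3):
    # Keep up to `limit` candidates whose stem differs from q_word's stem.
    q_stem = stemmer.stem(q_word)
    kept = []
    for cand in candidates:
        if len(kept) == limit:
            break
        if stemmer.stem(cand) != q_stem:
            kept.append(cand)
    return kept

# Under the Lancaster rules, "run" and "runs" share a stem with
# "running" and are skipped.
print(filter_nearby("running", ["run", "runs", "jogging", "sprint"]))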