# TextRank, based on:
# http://web.eecs.umich.edu/~mihalcea/papers/mihalcea.emnlp04.pdf

from itertools import tee, izip
from nltk import stem
from text.blob import TextBlob as tb
from textblob_aptagger import PerceptronTagger
import nltk.data
import numpy as np
import sys

TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
TAGGER = PerceptronTagger()
STEMMER = stem.porter.PorterStemmer()


def pos_tag (s):
    """high-performance part-of-speech tagger"""
    global TAGGER
    return TAGGER.tag(s)


def wrap_words (pair):
    """wrap each (word, tag) pair as an object with fully indexed metadata"""
    global STEMMER
    index = pair[0]
    result = []

    for word, tag in pair[1]:
        word = word.lower()
        stem = STEMMER.stem(word)

        if stem == "":
            stem = word

        # keep only adjectives and nouns as candidate keywords
        keep = tag in ('JJ', 'NN', 'NNS', 'NNP',)

        result.append({ "id": 0, "index": index, "stem": stem, "word": word, "tag": tag, "keep": keep })
        index += 1

    return result


######################################################################
## build a graph from raw text

TEXT = """
Compatibility of systems of linear constraints over the set of natural numbers.
Criteria of compatibility of a system of linear Diophantine equations, strict
inequations, and nonstrict inequations are considered. Upper bounds for
components of a minimal set of solutions and algorithms of construction of
minimal generating sets of solutions for all types of systems are given. These
criteria and the corresponding algorithms for constructing a minimal supporting
set of solutions can be used in solving all the considered types systems and
systems of mixed types.
"""

from pyspark import SparkContext
sc = SparkContext(appName="TextRank", master="local[*]")

# split the text into sentences, then POS-tag each sentence in parallel
sent = sc.parallelize(TOKENIZER.tokenize(TEXT)).map(pos_tag).cache()
sent.collect()

# cumulative sentence lengths give the starting word offset of each sentence
base = list(np.cumsum(np.array(sent.map(len).collect())))
base.insert(0, 0)
base.pop()

sent_length = sc.parallelize(base)
tagged_doc = sent_length.zip(sent).map(wrap_words)


######################################################################
## persist the word nodes as a Parquet table

from pyspark.sql import SQLContext, Row
sqlCtx = SQLContext(sc)


def enum_words (s):
    for word in s:
        yield word


words = tagged_doc.flatMap(enum_words)
pair_words = words.keyBy(lambda w: w["stem"])

# assign a unique id to each distinct stem, then broadcast the lookup table
uniq_words = words.map(lambda w: w["stem"]).distinct().zipWithUniqueId()
uniq = sc.broadcast(dict(uniq_words.collect()))


def id_words (pair):
    (key, val) = pair
    word = val[0]
    id = val[1]
    word["id"] = id
    return word


id_doc = pair_words.join(uniq_words).map(id_words)
id_words = id_doc.map(lambda w: (w["id"], w["index"], w["word"], w["stem"], w["tag"]))

wordSchema = id_words.map(lambda p: Row(id=long(p[0]), index=int(p[1]), word=p[2], stem=p[3], tag=p[4]))
wordTable = sqlCtx.inferSchema(wordSchema)
wordTable.registerTempTable("word")
wordTable.saveAsParquetFile("word.parquet")


######################################################################
## persist the co-occurrence edges as a Parquet table

def sliding_window (iterable, size):
    """apply a sliding window to produce 'size' tiles"""
    iters = tee(iterable, size)

    for i in xrange(1, size):
        for each in iters[i:]:
            next(each, None)

    return list(izip(*iters))


def keep_pair (pair):
    """filter the relevant linked word pairs"""
    return pair[0]["keep"] and pair[1]["keep"] and (pair[0]["word"] != pair[1]["word"])


def link_words (seq):
    """attempt to link words in a sentence"""
    return [ (seq[0], word) for word in seq[1:] ]


# link words that co-occur within a window of 3 tokens, in both directions
tiled = tagged_doc.flatMap(lambda s: sliding_window(s, 3)).flatMap(link_words).filter(keep_pair)

t0 = tiled.map(lambda l: (uniq.value[l[0]["stem"]], uniq.value[l[1]["stem"]],))
t1 = tiled.map(lambda l: (uniq.value[l[1]["stem"]], uniq.value[l[0]["stem"]],))
neighbors = t0.union(t1)

edgeSchema = neighbors.map(lambda p: Row(n0=long(p[0]), n1=long(p[1])))
edgeTable = sqlCtx.inferSchema(edgeSchema)
edgeTable.registerTempTable("edge")
edgeTable.saveAsParquetFile("edge.parquet")
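
######################################################################
## a minimal sketch of how the persisted word graph might be ranked,
## assuming the "edge.parquet" file written above: a few unweighted
## PageRank iterations over the neighbor pairs, which is the scoring
## step TextRank prescribes for keyword extraction. Variable names
## here (edges, links, ranks, contribs) are illustrative only.

edges = sqlCtx.parquetFile("edge.parquet").map(lambda row: (row.n0, row.n1))
links = edges.distinct().groupByKey().cache()

# start every node with an equal rank
ranks = links.map(lambda (node, neighbors): (node, 1.0))

for _ in xrange(10):
    # each node splits its rank evenly across its neighbors
    contribs = links.join(ranks).flatMap(
        lambda (node, (neighbors, rank)): [(n, rank / len(neighbors)) for n in neighbors])
    # recombine contributions with the usual 0.85 damping factor
    ranks = contribs.reduceByKey(lambda x, y: x + y).mapValues(lambda r: 0.15 + 0.85 * r)

# top-ranked node ids; join against the "word" table to resolve them back to stems
print ranks.takeOrdered(10, key=lambda (node, rank): -rank)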