########################################################################## # Copyright 2018 Kata.ai # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ########################################################################## from collections import defaultdict from typing import Collection, Dict, List, Mapping, Optional, Set, Sequence, Tuple import math from nltk.classify import BinaryMaxentFeatureEncoding, MaxentClassifier, NaiveBayesClassifier from nltk.probability import ConditionalFreqDist, ConditionalProbDist, ConditionalProbDistI, \ FreqDist, LidstoneProbDist, ProbDistI from scipy.special import logsumexp from scipy.stats import multivariate_normal import numpy as np from data import Document, Sentence, Word from models import AbstractSummarizer class HMMSummarizer(AbstractSummarizer): """Summarizer using hidden Markov model (Conroy and O'Leary, 2001). In this formulation of HMM, the initial and transition probability are multinomial whereas the emission probability is Gaussian. The Gaussian mean is estimated for every possible tag while the covariance matrix is estimated using the whole samples. In other words, the covariance matrix is shared. There is a difference from the original paper: we do not use QR decomposition for sentence selection. Args: init_pdist (nltk.probability.ProbDistI): Initial state probability. trans_pdist (nltk.probability.ConditionalProbDistI): Transition probability. emit_pdist (nltk.probability.ConditionalProbDistI): Emission probability. states (Sequence[int]): A sequence of possible states. gamma (float): Smoothing value for the "word probability in a document" feature. tf_table (Mapping[Word, float]): A precomputed term-frequency table that is already normalized. """ def __init__(self, init_pdist: ProbDistI, trans_pdist: ConditionalProbDistI, emit_pdist: ConditionalProbDistI, states: Sequence[int], gamma: float = 0.1, tf_table: Optional[Mapping[Word, float]] = None, ) -> None: self.init_pdist = init_pdist self.trans_pdist = trans_pdist self.emit_pdist = emit_pdist self.states = states self.gamma = gamma self.tf_table = tf_table self._start_transitions = None self._transitions = None @classmethod def train(cls, docs: Collection[Document], gamma_word: float = 0.1, gamma_init: float = 0.1, gamma_trans: float = 0.1, tf_table: Optional[Mapping[Word, float]] = None, ) -> 'HMMSummarizer': """Train the model on a collection of documents. Args: docs (Collection[Document]): The collection of documents to train on. gamma_word (float): Smoothing value for the "word probability in a document" feature. gamma_init (float): Smoothing value for the initial probability. gamma_trans (float): Smoothing value for the transition probability. tf_table (Mapping[Word, float]): A precomputed term-frequency table that is already normalized. Returns: HMM: The trained model. """ init_fdist = FreqDist() trans_fdist = ConditionalFreqDist() tagged_vecs: list = [] states = set() for doc in docs: tags = cls._get_tags(doc.sentences) if not tags: continue init_fdist[tags[0]] += 1 for prev, tag in zip(tags, tags[1:]): trans_fdist[prev][tag] += 1 vecs = cls._get_feature_vectors(doc, gamma_word, tf=tf_table) tagged_vecs.extend(zip(vecs, tags)) states.update(tags) # Initial probability init_pdist = LidstoneProbDist(init_fdist, gamma_init, bins=len(states)) # Transition probability trans_pdist = ConditionalProbDist( trans_fdist, LidstoneProbDist, gamma_trans, bins=len(states)) # Emission probability emit_pdist = _GaussianEmission.train(tagged_vecs) return cls( init_pdist, trans_pdist, emit_pdist, list(states), gamma=gamma_word, tf_table=tf_table) def summarize(self, doc: Document, size: int) -> List[int]: """Summarize a given document. Args: doc (Document): The document to summarize. size (int): Maximum number of sentences that the summary should have. Returns: list: The indices of the extracted sentences that form the summary, sorted ascending. """ size = min(size, len(doc.sentences)) vecs = self._get_feature_vectors(doc, self.gamma, tf=self.tf_table) gamma = self._compute_gamma(vecs) summ_states = [i for i, st in enumerate(self.states) if st % 2 == 0] scores = gamma[:, summ_states].sum(axis=1) summary = sorted( range(len(doc.sentences)), key=lambda k: scores[k], reverse=True)[:size] return summary @classmethod def _get_tags(cls, sents: Sequence[Sentence]) -> List[int]: if not sents: return [] tags = [2 if sents[0].label else 1] for sent in sents[1:]: if tags[-1] % 2: next_tag = tags[-1] + (1 if sent.label else 0) else: next_tag = tags[-1] + (2 if sent.label else 1) tags.append(next_tag) return tags @classmethod def _get_feature_vectors(cls, doc: Document, gamma: float, tf: Optional[Mapping[Word, float]] = None, ) -> List[np.ndarray]: word_fdist = FreqDist(doc.words) word_pdist = LidstoneProbDist(word_fdist, gamma) vecs = [] for para in doc: for i, sent in enumerate(para): vec = [] # Sentence position in paragraph if i == 0: vec.append(1.) elif i == len(para) - 1: vec.append(2. if len(para) == 2 else 3.) else: vec.append(2.) # Number of terms vec.append(math.log(len(sent) + 1)) # Probability of terms in document vec.append(sum(math.log(word_pdist.prob(w)) for w in sent)) # Probability of terms in a baseline document if tf is not None: vec.append(sum(math.log(tf[w]) for w in sent if w in tf)) vecs.append(np.array(vec)) return vecs def _build_transitions(self): if self._start_transitions is None: self._start_transitions = np.log( np.array([self.init_pdist.prob(st) for st in self.states])) if self._transitions is None: n = len(self.states) self._transitions = np.zeros((n, n)) for i, st_i in enumerate(self.states): for j, st_j in enumerate(self.states): try: pdist = self.trans_pdist[st_i] except ValueError: # state st_i never occurred in training data self._transitions[i, j] = -np.inf else: self._transitions[i, j] = np.log(pdist.prob(st_j)) def _forward(self, obs: Sequence[np.ndarray]) -> np.ndarray: assert all(ob.ndim == 1 and ob.shape[0] == self.emit_pdist.ndim for ob in obs) self._build_transitions() if not obs: assert self._start_transitions is not None return self._start_transitions.reshape(1, -1) m, n = len(obs), len(self.states) # Build emission matrix emissions = np.zeros((m, n)) for i, ob in enumerate(obs): for j, st in enumerate(self.states): emissions[i, j] = np.log(self.emit_pdist[st].prob(ob)) alpha = np.zeros((m, n)) # shape: (n,) alpha[0] = self._start_transitions + emissions[0] for t in range(1, m): # shape: (n, 1) a = alpha[t - 1].reshape(-1, 1) # shape: (1, n) e = emissions[t].reshape(1, -1) # shape: (n, n) s = a + self._transitions + e # shape: (n,) alpha[t] = logsumexp(s, axis=0) return alpha def _backward(self, obs: Sequence[np.ndarray]) -> np.ndarray: assert all(ob.ndim == 1 and ob.shape[0] == self.emit_pdist.ndim for ob in obs) if not obs: return np.zeros((1, len(self.states))) self._build_transitions() m, n = len(obs), len(self.states) # Build emission matrix emissions = np.zeros((m, n)) for i, ob in enumerate(obs): for j, st in enumerate(self.states): emissions[i, j] = np.log(self.emit_pdist[st].prob(ob)) beta = np.zeros((m, n)) for t in range(m - 2, -1, -1): # shape: (1, n) b = beta[t + 1].reshape(1, -1) # shape: (1, n) e = emissions[t + 1].reshape(1, -1) # shape: (n, n) s = self._transitions + e + b # shape: (n,) beta[t] = logsumexp(s, axis=0) return beta def _compute_gamma(self, obs: Sequence[np.ndarray]) -> np.ndarray: alpha = self._forward(obs) beta = self._backward(obs) omega = logsumexp(alpha[-1]) return alpha + beta - omega class MaxentSummarizer(AbstractSummarizer): """Summarizer using maximum entropy classifier (Osborne, 2002). There is a difference from the original paper: we put Gaussian prior on the classifier weights while the original paper puts the prior on the class labels distribution. This difference is fine because our classifier has a bias feature which is able to capture the prior class labels distribution from the training data. Args: classifier (nltk.classify.MaxentClassifier): The underlying classifier object used. stopwords (Collection[Word]): Collection of stopwords. word_pairs (Collection[Tuple[Word, Word]]): Collection of word pairs, where a word pair is defined as two consecutive words found in a sentence. trim_length (int): Trim words to this length (measured by the number of characters). Attributes: STOPWORD_TOKEN (str): Special token for stopwords. Stopwords will be converted into this token beforehand. """ STOPWORD_TOKEN = '<stopword>' def __init__(self, classifier: MaxentClassifier, stopwords: Optional[Collection[Word]] = None, word_pairs: Optional[Collection[Tuple[Word, Word]]] = None, trim_length: int = 10, ) -> None: if stopwords is None: stopwords = set() if word_pairs is None: word_pairs = set() self.classifier = classifier self.stopwords = stopwords self.word_pairs = word_pairs self.trim_length = trim_length @classmethod def train(cls, docs: Collection[Document], stopwords: Optional[Collection[Word]] = None, algorithm: str = 'iis', cutoff: int = 4, sigma: float = 0., trim_length: int = 10, ) -> 'MaxentSummarizer': """Train the model on a collection of documents. Args: docs (Collection[Document]): The collection of documents to train on. stopwords (Collection[Word]): Collection of stopwords. algorithm (str): Optimization algorithm for training. Possible values are 'iis', 'gis', or 'megam' (requires `megam`_ to be installed). cutoff (int): Features that occur fewer than this value in the training data will be discarded. sigma (float): Standard deviation for the Gaussian prior. Default is no prior. trim_length (int): Trim words to this length. Returns: MaxEntropy: The trained model. .. _megam: https://www.umiacs.umd.edu/~hal/megam/ """ if stopwords is None: stopwords = set() word_pairs = {pair for doc in docs for sent in doc.sentences for pair in cls._get_word_pairs(sent, stopwords, trim_len=trim_length)} train_data: list = [] for doc in docs: featuresets = cls._extract_featuresets(doc, stopwords, word_pairs, trim_length) labels = [sent.label for sent in doc.sentences] train_data.extend(zip(featuresets, labels)) encoding = BinaryMaxentFeatureEncoding.train( train_data, count_cutoff=cutoff, alwayson_features=True) classifier = MaxentClassifier.train( train_data, algorithm=algorithm, encoding=encoding, gaussian_prior_sigma=sigma) return cls(classifier, stopwords=stopwords, word_pairs=word_pairs) def summarize(self, doc: Document, size: int = 4) -> List[int]: """Summarize a given document. Args: doc (Document): The document to summarize. size (int): Maximum number of sentences that the summary should have. Returns: list: The indices of the extracted sentences that form the summary, sorted ascending. """ size = min(size, len(doc.sentences)) featuresets = self._extract_featuresets( doc, self.stopwords, self.word_pairs, self.trim_length) summary = [k for k, fs in enumerate(featuresets) if self.classifier.classify(fs)][:size] return summary @classmethod def _extract_featuresets(cls, doc: Document, stopwords: Collection[Word], word_pairs: Collection[Tuple[Word, Word]], trim_length: int, ) -> List[dict]: featuresets = [] for i, para in enumerate(doc): for j, sent in enumerate(para): fs: dict = {} # Word pair for pair in cls._get_word_pairs(sent, stopwords, trim_length): if pair in word_pairs: fs[f'has-pair({pair[0]},{pair[1]})'] = True # Sentence length if len(sent) < 6: fs['length'] = 'short' elif len(sent) > 20: fs['length'] = 'long' # Previous sentence length if j > 0 and len(para[j - 1]) < 5: fs['prev-length<5'] = True # Sentence position if i < 8: fs['pos-para'] = 'first' elif i >= len(doc) - 3: fs['pos-para'] = 'last' # Limited discourse feat if i == 0: fs['para-start'] = True featuresets.append(fs) return featuresets @classmethod def _get_word_pairs(cls, sent: Sentence, stopwords: Collection[Word], trim_len: int, ) -> Set[Tuple[Word, Word]]: words = [cls.STOPWORD_TOKEN if word in stopwords else word[:trim_len] for word in sent] return {(w1, w2) for w1, w2 in zip(words, words[1:]) if w1 != cls.STOPWORD_TOKEN and w2 != cls.STOPWORD_TOKEN} class NaiveBayesSummarizer(AbstractSummarizer): """Summarizer using naive Bayes method (Aone et al., 1998). There is a difference from the original paper: when computing TF-IDF, we operate on word token level, while the original paper operates on multi-word tokens that are discovered using mutual information. Args: classifier (nltk.classify.NaiveBayesClassifier): The underlying classifier object used. signature_words (Collection[Word]): Collection of words that are deemed important. """ def __init__(self, classifier: NaiveBayesClassifier, signature_words: Optional[Collection[Word]] = None, ) -> None: if signature_words is None: signature_words = set() self.classifier = classifier self.signature_words = signature_words @classmethod def train(cls, docs: Collection[Document], cutoff: float = 0.1, idf_table: Optional[Mapping[Word, float]] = None, ) -> 'NaiveBayesSummarizer': """Train the model on a collection of documents. Args: docs (Collection[Document]): The collection of documents to train on. cutoff (float): Cutoff for signature words. idf_table (Mapping[Word, float]): Precomputed IDF table. If not given, the IDF will be computed from ``docs``. Returns: NaiveBayes: The trained model. """ # Find signature words idf = cls._compute_idf(docs) if idf_table is None else idf_table n_cutoff = int(cutoff * len(idf)) signature_words = set(sorted( idf.keys(), key=lambda w: idf[w], reverse=True)[:n_cutoff]) train_data = [] # type: list for doc in docs: featuresets = cls._extract_featuresets(doc, signature_words) labels = [sent.label for sent in doc.sentences] train_data.extend(zip(featuresets, labels)) return cls( NaiveBayesClassifier.train(train_data), signature_words=signature_words) def summarize(self, doc: Document, size: int = 3) -> List[int]: """Summarize a given document. Args: doc (Document): The document to summarize. size (int): Maximum number of sentences that the summary should have. Returns: list: The indices of the extracted sentences that form the summary, sorted ascending. """ size = min(size, len(doc.sentences)) featuresets = self._extract_featuresets(doc, self.signature_words) summary = [k for k, fs in enumerate(featuresets) if self.classifier.classify(fs)][:size] return summary @classmethod def _extract_featuresets(cls, doc: Document, signature_words: Collection[Word], ) -> List[dict]: n_sents = len(doc.sentences) featuresets = [] sent_pos = 0 for para in doc: for i, sent in enumerate(para): fs: dict = { # A short sentence 'short': len(sent) < 5, # Has signature word 'has-signature-word': any(w in signature_words for w in sent), } # Position in document fs['pos-doc'] = math.floor(4 * sent_pos / n_sents) # Position in paragraph fs['pos-para'] = math.floor(3 * i / len(para)) featuresets.append(fs) sent_pos += 1 return featuresets @staticmethod def _compute_idf(docs: Collection[Document]) -> Dict[Word, float]: freq_table = defaultdict(int) # type: ignore for doc in docs: for word in set(doc.words): freq_table[word] += 1 return {word: math.log(len(docs) / freq_table[word]) for word in freq_table} class _Gaussian(ProbDistI): def __init__(self, mean: np.ndarray, cov: np.ndarray) -> None: self.mean = mean self.cov = cov self._rv = multivariate_normal(mean=mean, cov=cov) def prob(self, sample: np.ndarray) -> float: return self._rv.pdf(sample) def max(self) -> np.ndarray: return self.mean def samples(self) -> list: raise NotImplementedError( 'all samples have non-zero probability in Gaussian distribution') class _GaussianEmission(ConditionalProbDistI): def __init__(self, mean_dict: Dict[int, np.ndarray], cov: np.ndarray) -> None: self.mean_dict = mean_dict self.cov = cov self.update({tag: _Gaussian(mean, cov) for tag, mean in mean_dict.items()}) @property def ndim(self) -> int: return len(self.cov) @classmethod def train(cls, tagged_vecs: Collection[Tuple[np.ndarray, int]]) -> '_GaussianEmission': by_tag: dict = defaultdict(list) for vec, tag in tagged_vecs: by_tag[tag].append(vec) mean_dict = {} matrices = [] for tag, vecs in by_tag.items(): mean = mean_dict[tag] = np.mean(vecs, axis=0) for vec in vecs: v = (vec - mean).reshape(-1, 1) matrices.append(v.dot(v.T)) cov = np.mean(matrices, axis=0) return cls(mean_dict, cov)