# Python source code of dcs

"""
Disambiguated Core Semantics (DCS) [EXPERIMENTAL]
----------------------------------

As described in:

    Tingting Wei, Yonghe Lu, Huiyou Chang, Qiang Zhou, Xianyu Bao (2015).
    A semantic approach for text clustering using WordNet and lexical chains.
    <http://www.sciencedirect.com/science/article/pii/S0957417414006472>

Overview:

    DCS disambiguates word senses (for nouns, verbs, adverbs, and adjectives) by
    selecting the word sense most appropriate given the other word senses in a document,
    where "most appropriate" is judged by semantic similarity (calculated using a combination of
    Wu-Palmer similarity and a novel "implicit" semantic similarity; see code for details).

    Lexical chains are constructed out of these senses ("concepts"), which are then scored
    according to the frequencies of a chain's concepts in each document. The n highest-scoring lexical
    chains are selected to represent the document.

    The aggregate set of qualifying concepts for all documents defines the feature space
    for vector representations.

    Here "concept" is synonymous with "synset".
    "Concept" is how the paper refers to synsets, so that's used here.

Limitations:

    - some words may not be in WordNet, e.g. entities or neologisms
    - some semantic relationships are not represented in WordNet
    - there's a lot going on here, so it can be quite slow

Notes:

    - This code is more or less a sketch; it could be optimized
    - I have had limited success using this representation for clustering

- Francis
"""

import math
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from scipy.sparse.csgraph import connected_components
from broca.distance.sift4 import sift4
from broca.vectorize import Vectorizer
from broca.common.util import penn_to_wordnet
from broca.common.shared import spacy


# English stopword list from NLTK, loaded once at import time.
# Used by `DCSVectorizer._description` to drop stopwords from concept descriptions.
stops = stopwords.words('english')


class DCSVectorizer(Vectorizer):
    def __init__(self, alpha=1.5, relation_weights=[0.8, 0.5, 0.3], n_chains=10):
        self.alpha = 1.5
        self.relation_weights = relation_weights
        self.n_chains = n_chains

        # Cache concept => description
        # and (c1, c2) => similarity
        self.descriptions = {}
        self.concept_sims = {}


    def vectorize(self, docs):
        """
        Vectorizes a list of documents using their DCS representations.
        """
        doc_core_sems, all_concepts = self._extract_core_semantics(docs)

        shape = (len(docs), len(all_concepts))
        vecs = np.zeros(shape)
        for i, core_sems in enumerate(doc_core_sems):
            for con, weight in core_sems:
                j = all_concepts.index(con)
                vecs[i,j] = weight

        # Normalize
        return vecs/np.max(vecs)


    def _process_doc(self, doc):
        """
        Runs the DCS pipeline on a single document: tag, disambiguate,
        weight, build lexical chains and select the core ones.

        Returns a list of `(concept, weight)` pairs, one for each concept
        belonging to a selected lexical chain.
        """
        lowered = doc.lower()

        # Pair every token with its tag mapped through `penn_to_wordnet`
        tagged = []
        words = []
        for tok in spacy(lowered, tag=True, parse=False, entity=False):
            tagged.append((tok, penn_to_wordnet(tok.tag_)))
            words.append(tok)

        senses = self._disambiguate_doc(tagged)
        weights = self._weight_concepts(words, senses)

        # Core semantics: chains first, then the ones that score high enough
        chains = self._lexical_chains(lowered, senses)
        selected = self._core_semantics(chains, weights)

        weighted = []
        for chain in selected:
            for concept in chain:
                weighted.append((concept, weights[concept]))
        return weighted


    def _disambiguate_doc(self, tagged_tokens):
        """
        Disambiguates every token of a document, one part of speech at a time.

        `tagged_tokens` is a list of the form:

            [(token, tag), ...]

        Tokens whose tag is not one of the WordNet noun, verb, adjective or
        adverb tags are ignored.

        Returns a mapping of terms to their disambiguated concepts (synsets).
        """
        # One bucket per part of speech that gets disambiguated
        by_pos = {wn.NOUN: [], wn.VERB: [], wn.ADJ: [], wn.ADV: []}
        for token, pos in tagged_tokens:
            bucket = by_pos.get(pos)
            if bucket is not None:
                bucket.append(token)

        # Merge the per-PoS results into a single term -> concept mapping;
        # a term seen under several PoS keeps the result of the last one processed
        term_concepts = {}
        for pos, tokens in by_pos.items():
            term_concepts.update(self._disambiguate_pos(tokens, pos))

        return term_concepts


    def _disambiguate_pos(self, terms, pos):
        """
        Disambiguates a list of tokens of a given PoS.

        For each term, the candidate concept chosen is the one with the highest
        aggregate similarity against the other terms, where the similarity to
        another term is the best similarity to any of that term's candidates.

        Returns a mapping of term -> chosen concept. Terms with no candidate
        concepts (e.g. words missing from WordNet) are left out.
        """
        # Map the terms to candidate concepts
        # Consider only the top 3 most common senses.
        # `dict.fromkeys` drops repeated terms so each one is looked up only once.
        candidate_map = {term: wn.synsets(term, pos=pos)[:3] for term in dict.fromkeys(terms)}

        # Filter to unique concepts, as a list for consistent ordering
        concepts = list({c for cons in candidate_map.values() for c in cons})
        sim_mat = self._similarity_matrix(concepts)

        # Row/column of each concept in `sim_mat`, built once so the loops
        # below don't have to search `concepts` linearly
        position = {c: i for i, c in enumerate(concepts)}

        # Matrix indices of every term's candidates; terms without candidates
        # take no part in the scoring
        term_positions = {
            term: [position[c] for c in cons]
            for term, cons in candidate_map.items() if cons
        }

        # Final map of terms to their disambiguated concepts
        disambiguated = {}
        for term, own_positions in term_positions.items():
            scores = []
            for i in own_positions:
                score = 0
                for term_, other_positions in term_positions.items():
                    if term == term_:
                        continue
                    # Best match between this candidate and the other term's candidates
                    score += max(sim_mat[i, other_positions])
                scores.append(score)

            # On ties `argmax` picks the first, i.e. the more common sense
            disambiguated[term] = candidate_map[term][np.argmax(scores)]

        return disambiguated


    def _similarity_matrix(self, concepts):
        """
        Computes a semantic similarity matrix for a set of concepts.
        """
        n_cons = len(concepts)
        sim_mat = np.zeros((n_cons, n_cons))
        for i, c1 in enumerate(concepts):
            for j, c2 in enumerate(concepts):
                # Just build the lower triangle
                if i >= j:
                    sim_mat[i,j] = self._semsim(c1, c2) if i != j else 1.
        return sim_mat + sim_mat.T - np.diag(sim_mat.diagonal())


    def _semsim(self, c1, c2):
        """
        Computes the semantic similarity between two concepts.

        The semantic similarity is a combination of two sem sims:

            1. An "explicit" sem sim metric, that is, one which is directly
            encoded in the WordNet graph. Here it is just Wu-Palmer similarity.

            2. An "implicit" sem sim metric. See `_imp_semsim`.

        Note we can't use the NLTK Wu-Palmer similarity implementation because we need to
        incorporate the implicit sem sim, but it's fairly straightforward --
        leaning on <http://www.nltk.org/_modules/nltk/corpus/reader/wordnet.html#Synset.wup_similarity>,
        see that for more info. Though...the formula in the paper includes an extra term in the denominator,
        which is wrong, so we leave it out.
        """
        if c1 == c2:
            return 1.

        if (c1, c2) in self.concept_sims:
            return self.concept_sims[(c1, c2)]

        elif (c2, c1) in self.concept_sims:
            return self.concept_sims[(c2, c1)]

        else:
            need_root = c1._needs_root()
            subsumers = c1.lowest_common_hypernyms(c2, simulate_root=need_root)

            if not subsumers:
                # For relationships not in WordNet, fallback on just implicit semsim.
                return self._imp_semsim(c1, c2)

            subsumer = subsumers[0]
            depth = subsumer.max_depth() + 1
            len1 = c1.shortest_path_distance(subsumer, simulate_root=need_root)
            len2 = c2.shortest_path_distance(subsumer, simulate_root=need_root)

            if len1 is None or len2 is None:
                # See above
                return self._imp_semsim(c1, c2)

            len1 += depth
            len2 += depth

            imp_score = self._imp_semsim(c1, c2)

            sim = (2.*depth + imp_score)/(len1 + len2 + imp_score)
            self.concept_sims[(c1, c2)] = sim
            return sim


    def _imp_semsim(self, c1, c2):
        """
        "Implicit" semantic similarity between two concepts.

        The paper computes this by iteratively measuring string overlaps.
        This version instead takes the Sift4 distance `d` (a fast approximation
        of Levenshtein distance) between the two concepts' descriptions
        and returns `log(1 + 1/(d + 1))`.

        Whether this is an appropriate substitute for the paper's metric
        is still an open question.
        """
        distance = sift4(self._description(c1), self._description(c2))

        # Invert so that a smaller distance gives a larger similarity
        inverse = 1/(distance + 1)
        return math.log(inverse + 1)


    def _core_semantics(self, lex_chains, concept_weights):
        """
        Returns the n representative lexical chains for a document.
        """
        chain_scores = [self._score_chain(lex_chain, adj_submat, concept_weights) for lex_chain, adj_submat in lex_chains]
        scored_chains = zip(lex_chains, chain_scores)
        scored_chains = sorted(scored_chains, key=lambda x: x[1], reverse=True)

        thresh = (self.alpha/len(lex_chains)) * sum(chain_scores)
        return [chain for (chain, adj_mat), score in scored_chains if score >= thresh][:self.n_chains]


    def _extract_core_semantics(self, docs):
        """
        Extracts core semantics for a list of documents, returning them along with
        a list of all the concepts represented.
        """
        all_concepts = []
        doc_core_sems = []
        for doc in docs:
            core_sems = self._process_doc(doc)
            doc_core_sems.append(core_sems)
            all_concepts += [con for con, weight in core_sems]
        return doc_core_sems, list(set(all_concepts))


    def _lexical_chains(self, doc, term_concept_map):
        """
        Builds lexical chains, as an adjacency matrix,
        using a disambiguated term-concept map.
        """
        concepts = list({c for c in term_concept_map.values()})

        # Build an adjacency matrix for the graph
        # Using the encoding:
        # 1 = identity/synonymy, 2 = hypernymy/hyponymy, 3 = meronymy, 0 = no edge
        n_cons = len(concepts)
        adj_mat = np.zeros((n_cons, n_cons))

        for i, c in enumerate(concepts):
            # TO DO can only do i >= j since the graph is undirected
            for j, c_ in enumerate(concepts):
                edge = 0
                if c == c_:
                    edge = 1
                # TO DO when should simulate root be True?
                elif c_ in c._shortest_hypernym_paths(simulate_root=False).keys():
                    edge = 2
                elif c in c_._shortest_hypernym_paths(simulate_root=False).keys():
                    edge = 2
                elif c_ in c.member_meronyms() + c.part_meronyms() + c.substance_meronyms():
                    edge = 3
                elif c in c_.member_meronyms() + c_.part_meronyms() + c_.substance_meronyms():
                    edge = 3

                adj_mat[i,j] = edge

        # Group connected concepts by labels
        concept_labels = connected_components(adj_mat, directed=False)[1]
        lexical_chains = [([], []) for i in range(max(concept_labels) + 1)]
        for i, concept in enumerate(concepts):
            label = concept_labels[i]
            lexical_chains[label][0].append(concept)
            lexical_chains[label][1].append(i)

        # Return the lexical chains as (concept list, adjacency sub-matrix) tuples
        return [(chain, adj_mat[indices][:,indices]) for chain, indices in lexical_chains]


    def _score_chain(self, lexical_chain, adj_submat, concept_weights):
        """
        Computes the score for a lexical chain.
        """
        scores = []

        # Compute scores for concepts in the chain
        for i, c in enumerate(lexical_chain):
            score = concept_weights[c] * self.relation_weights[0]
            rel_scores = []
            for j, c_ in enumerate(lexical_chain):
                if adj_submat[i,j] == 2:
                    rel_scores.append(self.relation_weights[1] * concept_weights[c_])

                elif adj_submat[i,j] == 3:
                    rel_scores.append(self.relation_weights[2] * concept_weights[c_])

            scores.append(score + sum(rel_scores))

        # The chain's score is just the sum of its concepts' scores
        return sum(scores)


    def _weight_concepts(self, tokens, term_concept_map):
        """
        Calculates weights for concepts in a document.

        This is just the frequency of terms which map to a concept.
        """

        weights = {c: 0 for c in term_concept_map.values()}
        for t in tokens:
            # Skip terms that aren't one of the PoS we used
            if t not in term_concept_map:
                continue
            con = term_concept_map[t]
            weights[con] += 1

        # TO DO paper doesn't mention normalizing these weights...should we?
        return weights


    def _description(self, concept):
        """
        Returns a "description" of a concept,
        as defined in the paper.

        The paper describes the description as a string,
        so this is a slight modification where we instead represent
        the definition as a list of tokens.
        """
        if concept not in self.descriptions:
            lemmas = concept.lemma_names()
            gloss = self._gloss(concept)
            glosses = [self._gloss(rel) for rel in self._related(concept)]
            raw_desc = ' '.join(lemmas + [gloss] + glosses)
            desc = [w for w in raw_desc.split() if w not in stops]
            self.descriptions[concept] = desc
        return self.descriptions[concept]


    def _gloss(self, concept):
        """
        The concatenation of a concept's definition and its examples.
        """
        return  ' '.join([concept.definition()] + concept.examples())


    def _related(self, concept):
        """
        Returns related concepts for a concept.
        """
        return concept.hypernyms() + \
                concept.hyponyms() + \
                concept.member_meronyms() + \
                concept.substance_meronyms() + \
                concept.part_meronyms() + \
                concept.member_holonyms() + \
                concept.substance_holonyms() + \
                concept.part_holonyms() + \
                concept.attributes() + \
                concept.also_sees() + \
                concept.similar_tos()