python source code of articles

"""Classes and functions to store aggregated term article data."""

import os
import json

import nltk

from lisc.utils.io import check_ext
from lisc.utils.db import check_directory
from lisc.data.utils import combine_lists, convert_string, count_elements
from lisc.data.base_articles import BaseArticles

###################################################################################################
###################################################################################################

class ArticlesAll(BaseArticles):
    """An object to hold term data, aggregated across articles.

    Attributes
    ----------
    label : str
        Label for the term.
    term : Term
        Definition of the search term, with inclusion and exclusion words.
    n_articles : int
        Number of articles included in object.
    ids : list of int
        Article ids for all articles included in object.
    journals : collections.Counter
        Counts for each journal.
    authors : collections.Counter
        Counts for each author.
    first_authors : collections.Counter
        Counts for each first author.
    last_authors : collections.Counter
        Counts for each last author.
    words : nltk.probability.FreqDist
        Frequency distribution of all words.
    keywords : nltk.probability.FreqDist
        Frequency distribution of all keywords.
    years : collections.Counter
        Counts for each year of publication.
    dois : list of str
        DOIs of each article included in object.
    summary : dict
        A summary of the data associated with the current object.
    """

    def __init__(self, term_data, exclusions=None):
        """Initialize ArticlesAll object.

        Parameters
        ----------
        term_data : Articles
            Data for all articles from a given search term.
        exclusions : list of str, optional
            Words to exclude from the word collections.

        Examples
        --------
        Create an ``ArticlesAll`` object from an :class:`~.Articles` object:

        >>> from lisc.data import Articles
        >>> articles = Articles('frontal lobe')
        >>> articles_all = ArticlesAll(articles)
        """

        # Inherit from the BaseArticles object
        BaseArticles.__init__(self, term_data.term)

        # Copy over tracking of included IDs & DOIs
        self.ids = term_data.ids
        self.dois = term_data.dois

        # Get counts of authors, journals, years
        self.journals = count_elements([journal[0] for journal in term_data.journals])
        self.years = count_elements(term_data.years)
        self.authors = _count_authors(term_data.authors)
        self.first_authors, self.last_authors = _count_end_authors(term_data.authors)

        # Convert lists of all words to frequency distributions
        exclusions = exclusions if exclusions else [] + self.term.search + self.term.inclusions
        temp_words = [convert_string(words) for words in term_data.words]
        self.words = self.create_freq_dist(combine_lists(temp_words), exclusions)
        self.keywords = self.create_freq_dist(combine_lists(term_data.keywords), exclusions)

        # Initialize summary dictionary
        self.summary = dict()


    def check_frequencies(self, data_type='words', n_check=20):
        """Prints out the most common items in a frequency distribution.

        Parameters
        ----------
        data_type : {'words', 'keywords'}
            Which frequency distribution to check.
        n_check : int, optional, default: 20
            Number of most common items to print out.

        Examples
        --------
        Print the most frequent words, assuming an initialized ``ArticlesAll`` object with data:

        >>> articles_all.check_frequencies() # doctest:+SKIP
        """

        if data_type in ['words', 'keywords']:
            freqs = getattr(self, data_type)
        else:
            raise ValueError('Requested data not understood')

        # Reset number to check if there are fewer words available
        if n_check > len(freqs):
            n_check = len(freqs)

        # Get the requested number of most common words, and convert to str
        top = freqs.most_common()[0:n_check]
        top_str = ' , '.join([word[0] for word in top[:n_check]])

        # Print out the top words for the current term
        print("{:5} : ".format(self.label) + top_str)


    def create_summary(self):
        """Fill the summary dictionary of the current terms Words data.

        Examples
        --------
        Create a summary for a term, assuming an initialized ``ArticlesAll`` object with collected
        ``Words`` data:

        >>> articles_all.create_summary() # doctest:+SKIP
        """

        # Add data to summary dictionary.
        self.summary['label'] = self.label
        self.summary['n_articles'] = str(self.n_articles)
        self.summary['top_author_name'] = ' '.join(self.authors.most_common()[0][0])
        self.summary['top_author_count'] = str(self.authors.most_common()[0][1])
        self.summary['top_journal_name'] = self.journals.most_common()[0][0]
        self.summary['top_journal_count'] = str(self.journals.most_common()[0][1])
        self.summary['top_keywords'] = [freq[0] for freq in self.keywords.most_common()[0:5]]
        self.summary['first_publication'] = str(min(self.years.keys()))


    def print_summary(self):
        """Print out a summary of the collected words data.

        Examples
        --------
        Print a summary for a term, assuming an initialized ``ArticlesAll`` object with data:

        >>> articles_all.create_summary() # doctest:+SKIP
        >>> articles_all.print_summary() # doctest:+SKIP
        """

        # Print out summary information
        print(self.summary['label'], ':')
        print('  Number of articles: \t\t', self.summary['n_articles'])
        print('  First publication: \t\t', self.summary['first_publication'])
        print('  Most common author: \t\t', self.summary['top_author_name'])
        print('    number of publications: \t', self.summary['top_author_count'])
        print('  Most common journal: \t\t', self.summary['top_journal_name'])
        print('    number of publications: \t', self.summary['top_journal_count'], '\n')


    def save_summary(self, directory=None):
        """Save out a summary of the collected words data.

        Parameters
        ----------
        directory : str or SCDB or None, optional
            Folder or database object specifying the save location.

        Examples
        --------
        Save a summary for a term, assuming an initialized ``ArticlesAll`` object with data::

        >>> articles_all.create_summary() # doctest:+SKIP
        >>> articles_all.save_summary() # doctest:+SKIP
        """

        directory = check_directory(directory, 'summary')

        with open(os.path.join(directory, check_ext(self.label, '.json')), 'w') as outfile:
            json.dump(self.summary, outfile)


    @staticmethod
    def create_freq_dist(in_lst, exclude):
        """Create a frequency distribution.

        Parameters
        ----------
        in_lst : list of str
            Words to create the frequency distribution from.
        exclude : list of str
            Words to exclude from the frequency distribution.

        Returns
        -------
        freqs : nltk.FreqDist
            Frequency distribution of the words.

        Examples
        --------
        Compute the frequency distribution of a collection of words:

        >>> ArticlesAll.create_freq_dist(in_lst=['brain', 'brain', 'head', 'body'], exclude=['body'])
        FreqDist({'brain': 2, 'head': 1})

        If you want to visualize a frequency distribution, you can plot them as a wordcloud:

        >>> from lisc.plts.words import plot_wordcloud
        >>> freq_dist = nltk.FreqDist({'frontal': 26, 'brain': 26, 'lobe': 23, 'patients': 19})
        >>> plot_wordcloud(freq_dist, len(freq_dist))
        """

        freqs = nltk.FreqDist(in_lst)

        for excl in exclude:
            try:
                freqs.pop(excl.lower())
            except KeyError:
                pass

        return freqs


def _count_authors(authors):
    """Count all authors.

    Parameters
    ----------
    authors : list of list of tuple of (str, str, str, str)
        Authors, as (last name, first name, initials, affiliation).

    Returns
    -------
    author_counts : collections.Counter
        Number of publications per author.
    """

    # Reduce author fields to pair of tuples (last name, initials) & count # of pubs per author
    all_authors = [(author[0], author[2]) for art_authors in authors for author in art_authors]
    author_counts = count_elements(_fix_author_names(all_authors))

    return author_counts


def _count_end_authors(authors):
    """Count first and last authors only.

    Parameters
    ----------
    authors : list of list of tuple of (str, str, str, str)
        Authors, as (last name, first name, initials, affiliation).

    Returns
    -------
    first_counts, last_counts : collections.Counter
        Number of publications for each first and last author.
    """

    # Pull out the full name for the first & last author of each article
    #  Last author is only considered if there is more than 1 author
    firsts = [auth[0] for auth in authors]
    f_names = [(author[0], author[2]) for author in firsts]

    lasts = [auth[-1] for auth in authors if len(auth) > 1]
    l_names = [(author[0], author[2]) for author in lasts]

    f_counts = count_elements(_fix_author_names(f_names))
    l_counts = count_elements(_fix_author_names(l_names))

    return f_counts, l_counts


def _fix_author_names(names):
    """Fix author names.

    Parameters
    ----------
    names : list of tuple of (str, str)
        Author names, as (last name, initials).

    Returns
    -------
    names : list of tuple of (str, str)
        Author names, as (last name, initials).

    Notes
    -----
    Sometimes full author name ends up in the last name field.
    If first name is None, assume this happened:
    Split up the text in first name, and grab the first name initial.
    """

    # Drop names where the contents is all None
    names = [name for name in names if name != (None, None)]

    # Fix names if full name ended up in last name field
    names = [(name[0].split(' ')[-1], ''.join([temp[0] for temp in name[0].split(' ')[:-1]]))
             if name[1] is None else name for name in names]

    return names