from goose import Goose
import feedparser
from pprint import pprint
import os
import json
import io
from collections import defaultdict
import sqlite3
import string
import urllib


class NewsCorpusGenerator(object):

    def __init__(self, corpus_dir, datastore_type='file', db_name='corpus.db'):
        '''
        Initialize the corpus generator and its datastore.

        Args:
            corpus_dir (str): The directory to save the generated corpus.
            datastore_type (Optional[str]): Format to save generated corpus.
                                            Specify either 'file' or 'sqlite'.
            db_name (Optional[str]): Name of database if 'sqlite' is selected.
        '''
        self.g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
        self.corpus_dir = corpus_dir
        self.datastore_type = datastore_type
        self.db_name = db_name
        self.stats = defaultdict(int)

        self._create_corpus_dir(self.corpus_dir)

        self.db = None
        if self.datastore_type == 'sqlite':
            self.db = self.corpus_dir + '/' + self.db_name
            self._set_up_db(self.db)

    def _create_corpus_dir(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)

    def read_links_file(self, file_path):
        '''
        Read links and associated categories for specified articles from a
        text file, separated by a space.

        Args:
            file_path (str): The path to a text file with news article links
                             and categories.

        Returns:
            articles: Array of tuples that contain article link & category,
                      e.g. [('IPO', 'www.cs.columbia.edu')]
        '''
        articles = []

        with open(file_path) as f:
            for line in f:
                line = line.strip()

                # Ignore blank lines
                if len(line) != 0:
                    link, category = line.split(' ')
                    articles.append((category.rstrip(), link.strip()))

        return articles

    def generate_corpus(self, articles):
        # TODO parallelize extraction process
        print 'Extracting content from links...'

        for article in articles:
            category = article[0]
            link = article[1]

            ex_article = None
            try:
                ex_article = self.g.extract(url=link)
            except Exception:
                print 'Failed to extract article...'
                continue

            ex_title = ex_article.title
            ex_body = ex_article.cleaned_text

            # Skip articles whose body could not be extracted
            if ex_body == '':
                self.stats['empty_body_articles'] += 1
                continue

            self._save_article({'title': ex_title,
                                'body': ex_body,
                                'category': category})

    def _save_article(self, clean_article):
        print "Saving article %s..." % (clean_article['title'].encode("utf-8"))

        if self.datastore_type == 'file':
            self._save_flat_file(clean_article)
        elif self.datastore_type == 'sqlite':
            self._save_to_db(clean_article)
        else:
            raise Exception("Unsupported datastore type. "
                            "Please specify 'file' or 'sqlite'.")

    def _remove_punctuations(self, title):
        # TODO optimize for Python 3
        return "".join(char for char in title if char not in string.punctuation)

    def _save_flat_file(self, clean_article):
        directory = self.corpus_dir + '/' + clean_article['category']

        # Create the category directory if it does not exist
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Derive a safe file name from the article title
        file_name = directory + '/' + \
            self._remove_punctuations(clean_article['title']).replace(" ", "_") + '.json'

        with io.open(file_name, 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(clean_article, ensure_ascii=False)))

    def _encode_query(self, query):
        # TODO Python 3: urllib.parse.quote
        return urllib.quote(query)

    def google_news_search(self, query, category_label, num=50):
        '''
        Searches Google News.

        NOTE: The official Google News API is deprecated
        https://developers.google.com/news-search/?hl=en

        NOTE: Google limits the maximum number of documents per query to 100.
        Use multiple related queries to get a bigger corpus.

        Args:
            query (str): The search term.
            category_label (str): The category to assign to the articles.
                                  These categories are the labels in the
                                  generated corpus.
            num (Optional[int]): The number of results to return.

        Returns:
            articles: Array of tuples that contain article link & category,
                      e.g. [('IPO', 'www.cs.columbia.edu')]
        '''
        url = 'https://news.google.com/news?hl=en&q=' + self._encode_query(query) \
              + '&num=' + str(num) + '&output=rss'
        rss = feedparser.parse(url)
        entries = rss['entries']

        articles = []
        for entry in entries:
            link = entry['link']
            articles.append((category_label, link))

        return articles

    def _set_up_db(self, db):
        if os.path.exists(db):
            print 'Database exists; assume schema does, too.'
        else:
            print 'Creating schema...'
            conn = sqlite3.connect(db)
            cur = conn.cursor()
            cur.execute("CREATE TABLE articles "
                        "(Id INTEGER PRIMARY KEY, category, title, body)")
            # Prevent duplicate articles within a category
            cur.execute("CREATE UNIQUE INDEX uni_article "
                        "ON articles (category, title)")
            conn.commit()
            conn.close()

    def _save_to_db(self, clean_article):
        conn = sqlite3.connect(self.db)
        with conn:
            cur = conn.cursor()
            try:
                cur.execute("INSERT INTO articles (Id, category, title, body) "
                            "VALUES (?, ?, ?, ?)",
                            (None, clean_article['category'],
                             clean_article['title'], clean_article['body']))
            except sqlite3.IntegrityError:
                self.stats['not_insert_db'] += 1
                print 'Record already inserted with title %s' % \
                    (clean_article['title'].encode("utf-8"))

    def get_stats(self):
        return self.stats
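
# A minimal usage sketch in place of the original empty __main__ stub,
# assuming network access and a working Goose install. The query, category
# label, and output directory below are illustrative assumptions, not part
# of the original module.
if __name__ == '__main__':
    generator = NewsCorpusGenerator('tech_corpus', datastore_type='sqlite')

    # Collect article links from a Google News RSS search, then extract
    # and persist each article under the 'IPO' category label.
    articles = generator.google_news_search('tech ipo', 'IPO', num=25)
    generator.generate_corpus(articles)

    # Report extraction/insertion statistics accumulated along the way
    pprint(dict(generator.get_stats()))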