from goose import Goose
import feedparser
from pprint import pprint
import os
import json
import io
from collections import defaultdict
import sqlite3
import string
import urllib


class NewsCorpusGenerator(object):

    def __init__(self, corpus_dir, datastore_type='file', db_name='corpus.db'):
        '''
        Initialize the corpus generator and its datastore.

        Args:
            corpus_dir (str): The directory to save the generated corpus.
            datastore_type (Optional[str]): Format to save generated corpus.
                                            Specify either 'file' or 'sqlite'.
            db_name (Optional[str]): Name of database if 'sqlite' is selected.
        '''
        self.g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
        self.corpus_dir = corpus_dir
        self.datastore_type = datastore_type
        self.db_name = db_name
        self.stats = defaultdict(int)

        self._create_corpus_dir(self.corpus_dir)

        self.db = None
        if self.datastore_type == 'sqlite':
            self.db = self.corpus_dir + '/' + self.db_name
            self._set_up_db(self.db)

    def _create_corpus_dir(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)

    def read_links_file(self, file_path):
        '''
        Read links and associated categories for specified articles from a
        text file, separated by a space.

        Args:
            file_path (str): The path to a text file with news article links
                             and categories.

        Returns:
            articles: Array of tuples that contain article link & category,
                      e.g. [('IPO', 'www.cs.columbia.edu')]
        '''
        articles = []

        with open(file_path) as f:
            for line in f:
                line = line.strip()

                # Ignore blank lines
                if len(line) != 0:
                    link, category = line.split(' ')
                    articles.append((category.rstrip(), link.strip()))

        return articles

    def generate_corpus(self, articles):
        # TODO parallelize extraction process
        print 'Extracting content from links...'

        for article in articles:
            category = article[0]
            link = article[1]

            ex_article = None
            try:
                ex_article = self.g.extract(url=link)
            except Exception:
                print 'Failed to extract article...'
                continue

            ex_title = ex_article.title
            ex_body = ex_article.cleaned_text

            # Skip articles whose body could not be extracted
            if ex_body == '':
                self.stats['empty_body_articles'] += 1
                continue

            self._save_article({'title': ex_title,
                                'body': ex_body,
                                'category': category})

    def _save_article(self, clean_article):
        print "Saving article %s..." % (clean_article['title'].encode("utf-8"))

        if self.datastore_type == 'file':
            self._save_flat_file(clean_article)
        elif self.datastore_type == 'sqlite':
            self._save_to_db(clean_article)
        else:
            raise Exception("Unsupported datastore type. "
                            "Please specify 'file' or 'sqlite'.")

    def _remove_punctuations(self, title):
        # TODO optimize for Python 3
        return "".join(char for char in title if char not in string.punctuation)

    def _save_flat_file(self, clean_article):
        directory = self.corpus_dir + '/' + clean_article['category']

        # Create the category directory if it does not exist
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Derive a safe file name from the article title
        file_name = directory + '/' + \
            self._remove_punctuations(clean_article['title']).replace(" ", "_") + '.json'

        with io.open(file_name, 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(clean_article, ensure_ascii=False)))

    def _encode_query(self, query):
        # TODO Python 3: urllib.parse.quote
        return urllib.quote(query)

    def google_news_search(self, query, category_label, num=50):
        '''
        Searches Google News.

        NOTE: The official Google News API is deprecated
        https://developers.google.com/news-search/?hl=en

        NOTE: Google limits the maximum number of documents per query to 100.
        Use multiple related queries to get a bigger corpus.

        Args:
            query (str): The search term.
            category_label (str): The category to assign to the articles.
                                  These categories are the labels in the
                                  generated corpus.
            num (Optional[int]): The number of results to return.

        Returns:
            articles: Array of tuples that contain article link & category,
                      e.g. [('IPO', 'www.cs.columbia.edu')]
        '''
        url = 'https://news.google.com/news?hl=en&q=' + self._encode_query(query) \
              + '&num=' + str(num) + '&output=rss'
        rss = feedparser.parse(url)
        entries = rss['entries']

        articles = []
        for entry in entries:
            link = entry['link']
            articles.append((category_label, link))

        return articles

    def _set_up_db(self, db):
        if os.path.exists(db):
            print 'Database exists; assume schema does, too.'
        else:
            print 'Creating schema...'
            conn = sqlite3.connect(db)
            cur = conn.cursor()
            cur.execute("CREATE TABLE articles "
                        "(Id INTEGER PRIMARY KEY, category, title, body)")
            # Prevent duplicate articles within a category
            cur.execute("CREATE UNIQUE INDEX uni_article "
                        "ON articles (category, title)")
            conn.commit()
            conn.close()

    def _save_to_db(self, clean_article):
        conn = sqlite3.connect(self.db)
        with conn:
            cur = conn.cursor()
            try:
                cur.execute("INSERT INTO articles (Id, category, title, body) "
                            "VALUES (?, ?, ?, ?)",
                            (None, clean_article['category'],
                             clean_article['title'], clean_article['body']))
            except sqlite3.IntegrityError:
                self.stats['not_insert_db'] += 1
                print 'Record already inserted with title %s' % \
                    (clean_article['title'].encode("utf-8"))

    def get_stats(self):
        return self.stats
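
# A minimal usage sketch in place of the original empty __main__ stub,
# assuming network access and a working Goose install. The query, category
# label, and output directory below are illustrative assumptions, not part
# of the original module.
if __name__ == '__main__':
    generator = NewsCorpusGenerator('tech_corpus', datastore_type='sqlite')

    # Collect article links from a Google News RSS search, then extract
    # and persist each article under the 'IPO' category label.
    articles = generator.google_news_search('tech ipo', 'IPO', num=25)
    generator.generate_corpus(articles)

    # Report extraction/insertion statistics accumulated along the way
    pprint(dict(generator.get_stats()))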