import logging
import re

from nltk import tokenize
from nltk.stem import WordNetLemmatizer

import safe_IO
from dictionary import *
from safe_IO import *

logger = logging.getLogger('FAIDK.Article')


class Article(object):
    """One article being studied: its text, its words and the unknown ones.

    Constructing an instance reads the known-word list and the article,
    computes the words that are new, optionally runs the interactive
    learning session (``FLAG == '1'``) or finishes immediately
    (``FLAG == '2'``), and finally writes the optional study material
    (marked article, important sentences, vocabulary).
    """

    # Characters kept by real_word(); everything else is stripped.
    _WORD_CHARS = re.compile('[a-z,A-Z,\',‘]')

    # WordNetLemmatizer created on first use and reused by real_word().
    _lemmatizer = None

    def __init__(self, config, file, FLAG):
        """Load the article named *file* and process it according to *FLAG*.

        Args:
            config: mapping holding the paths, keys and mode switches read
                throughout this class (``MAIN_PATH``, ``ARTICLES_PATH``,
                ``MARK``, ``LEMMATIZATION_MODE``, ...).
            file: file name of the article inside the articles directory.
            FLAG: ``'1'`` to run the interactive session, ``'2'`` to store
                the results without asking; any other value does neither.
        """
        self.name = file
        self.config = config
        self.mark = self.config["MARK"]
        self.file_path = config['MAIN_PATH'] + config['ARTICLES_PATH'] + file
        self.fix_dic = load_lemmatization_list_to_dic(
            self.config['LEMMATIZATION_MODE'])
        self.known_words = self.read_known_words()
        self.article = read_article_from_file(self.file_path)
        self.words = self.split_the_article(self.article, self.file_path)
        self.new_words = self.read_new_words()
        # Countdown of words still to be asked; mutated by learn() and read
        # by user_exit() -- reserved for them, do not use it elsewhere.
        self.num = len(self.new_words)
        self.keys = self.load_keys()
        if FLAG == '1':
            self.learn()
        if FLAG == '2':
            self.finish()
        # NOTE(review): the three outputs are assumed to be guarded together,
        # since the sentence extraction relies on the marked article.
        if self.config['OUT_PUT_MARKED_ARTICLE']:
            self.out_put_markded_article()
            self.out_put_important_sentences()
            self.out_put_vocabulary()

    def _build_marked_article(self):
        """Compile ``self.pattern`` and compute ``self.marked_article``.

        Performs no I/O.  Every new word is escaped and wrapped in word
        boundaries; when there is nothing to mark, a pattern that can never
        match is used so the article is left untouched.
        """
        escaped = [re.escape(word) for word in self.new_words if word]
        if escaped:
            self.pattern = re.compile(
                r"(\b" + r"\b|\b".join(escaped) + r"\b)",
                flags=re.IGNORECASE)
        else:
            self.pattern = re.compile(r"(?!)")
        mark = self.mark
        marked = self.pattern.sub(
            lambda match: mark + match.group(0) + mark, self.article)
        # Double the line breaks so paragraphs survive Markdown rendering.
        self.marked_article = marked.replace("\n", "\n\n")

    def out_put_markded_article(self):
        """Write the article with every new word wrapped in ``self.mark``."""
        self._build_marked_article()
        write_marked_article_to_file(
            "./others/", self.name, self.marked_article)

    def out_put_important_sentences(self):
        """Write the sentences of the marked article that hold a new word."""
        if getattr(self, 'pattern', None) is None:
            # Allow calling this method without the marked-article step.
            self._build_marked_article()
        # Terminate every line so headings are split off as sentences.
        prepared = self.marked_article.replace("\n", ".\n")
        sentences = tokenize.sent_tokenize(prepared)
        important = [s for s in sentences if self.pattern.search(s)]
        write_important_sentances_to_file(
            "./others/", self.name, "\n\n".join(important))

    def out_put_vocabulary(self):
        """Look every new word up and write the collected entries."""
        total = len(self.new_words)
        entries = []
        for index, word in enumerate(self.new_words):
            logger.info("looking up %s (%d left)", word, total - index)
            entries.append(
                "#### " + word + "\n\n" + google(word) + "\n\n" + eudic(word))
        write_vocabulary_to_file(
            "./others/", self.name, "\n\n".join(entries))

    def load_keys(self):
        """Return the answer keys as ``[know, not-know, quit]``."""
        cfg = self.config
        keys = [cfg['KEY_FOR_KNOW'],
                cfg['KEY_FOR_NOT'],
                cfg['KEY_FOR_QUIT']]
        logger.debug(keys)
        return keys

    def read_old_words(self, path):
        """Return the text stored at *path*, or ``''`` when it is unreadable."""
        try:
            return read_article_from_file(path)
        except Exception:
            logger.info('missing ' + path)
            return ''

    def real_word(self, word, LEMMATIZATION_flag=True):
        """Strip *word* down to its letters and optionally lemmatize it.

        Args:
            word: raw token taken from the text.
            LEMMATIZATION_flag: when false, only the stripping is done.

        Returns:
            The cleaned token; with lemmatization enabled it is mapped
            through the lemma list and/or NLTK, depending on
            ``config['LEMMATIZATION_MODE']``.
        """
        cleaned = ''.join(self._WORD_CHARS.findall(word))
        mode = self.config['LEMMATIZATION_MODE']
        if LEMMATIZATION_flag and mode in ['list', 'both']:
            try:
                cleaned = self.fix_dic[cleaned]
            except Exception as e:
                # Best effort: a word missing from the list stays unchanged.
                logger.debug(e)
        if LEMMATIZATION_flag and mode in ['NLTK', 'both']:
            if self._lemmatizer is None:
                self._lemmatizer = WordNetLemmatizer()
            cleaned = self._lemmatizer.lemmatize(cleaned)
        logger.debug('%s-->%s', word, cleaned)
        return cleaned

    def read_known_words(self):
        """Load the set of words already known; empty if the file is missing."""
        path = self.config['MAIN_PATH'] + self.config['OLD_WORDS_PATH']
        try:
            with open(path) as f:
                all_the_words = f.read()
        except (OSError, UnicodeError):
            logger.info('\'' + path + '\' missing......')
            all_the_words = ""
        known_words = self.split_the_article(
            all_the_words, LEMMATIZATION_flag=False)
        logger.info(known_words)
        logger.info('There are ' + str(len(known_words)) +
                    ' words I have known')
        return known_words

    def split_the_article(self, Article, name=None, LEMMATIZATION_flag=True):
        """Split the text *Article* into a set of cleaned words.

        Args:
            Article: the text to split.
            name: optional label; when given, the word count is logged.
            LEMMATIZATION_flag: forwarded to :meth:`real_word`.

        Returns:
            The set of distinct cleaned words.  Empty tokens (consecutive
            separators, tokens without any letter) are left out.
        """
        sep = re.compile(
            '[ \r\n.,' + self.config['SPECIAL_PUNCTUATION'] + ' ]')
        logger.debug(sep)
        set_of_words = {
            self.real_word(token, LEMMATIZATION_flag=LEMMATIZATION_flag)
            for token in sep.split(Article)
        }
        # An empty string is not a word; keeping it would offer '' to the
        # user and make the marking pattern match everywhere.
        set_of_words.discard('')
        if name is not None:
            logger.info('there are {} words in {}'.format(
                len(set_of_words), name))
        logger.debug(set_of_words)
        return set_of_words

    def read_new_words(self):
        """Return the sorted list of article words that are not yet known."""
        new_words = self.words - self.known_words
        num = len(new_words)
        if num == 0:
            logger.info('No new word')
        elif num == 1:
            logger.info('only 1 new word')
        else:
            logger.info(str(num) + ' new words')
        logger.info(new_words)
        return sorted(new_words)

    def learn(self):
        """Ask about every new word, then store the result via finish().

        Quitting hands over to :meth:`user_exit`, which stores the words
        reviewed so far and saves the remaining ones for later.
        """
        logger.info('if you know the word {}, else print {}'.format(
            self.config['KEY_FOR_KNOW'], self.config['KEY_FOR_NOT']))
        for word in self.new_words:
            judge = my_input(word + '(' + str(self.num) + ' Left)', self.keys)
            if judge == self.config['KEY_FOR_QUIT']:
                self.user_exit()
                return
            if judge == self.config['KEY_FOR_KNOW']:
                self.known_words.add(word)
            self.num -= 1
        self.new_words = sorted(set(self.new_words) - self.known_words)
        if self.new_words:
            logger.info('new words ({}):'.format(len(self.new_words)))
            logger.info(self.new_words)
        self.finish()

    def user_exit(self):
        """Save the unreviewed words and finish with the reviewed ones.

        ``self.num`` words at the end of ``self.new_words`` have not been
        asked yet; they are written out under the name ``'l_' + name``.
        """
        words = list(self.new_words)
        # Explicit split index: the negative slices misbehave when num is 0.
        reviewed_count = max(len(words) - self.num, 0)
        left = words[reviewed_count:]
        write_each_words(
            self.config['ARTICLES_PATH'],
            'l_' + self.name,
            left)
        logger.debug('writing left words')
        logger.debug(left)
        logger.debug('get new words')
        self.new_words = sorted(
            set(words[:reviewed_count]) - self.known_words)
        logger.debug(self.new_words)
        self.finish()

    def finish(self):
        """Archive the article and persist the known and new word lists."""
        cfg = self.config
        main_path = cfg['MAIN_PATH']
        each_article_path = main_path + cfg['NEW_WORDS_EACH_ARTICLE_PATH']
        safe_IO.mv_file(self.file_path,
                        main_path + cfg['OLD_ARTICLES_PATH'])
        safe_IO.write_each_words(
            each_article_path,
            self.name,
            self.new_words)
        with open(main_path + cfg['OLD_WORDS_PATH'], 'w') as old:
            # Sorted so the file content does not depend on set ordering.
            old.write('\n'.join(sorted(self.known_words)))
        logger.debug('write new words to ' + main_path +
                     cfg['NEW_WORDS_PATH'])
        with open(main_path + cfg['NEW_WORDS_PATH'], 'a') as new:
            # Terminate every word: the file is appended to on each run and
            # an unterminated last word would fuse with the next batch.
            new.writelines(word + '\n' for word in self.new_words)