# -*- coding: utf-8 -*-
"""TF-IDF based reader that extracts an answer span from a paragraph."""
import os
import pickle
import sys

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.arlstem import ARLSTem
from nltk.tokenize import WordPunctTokenizer
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# NOTE: nltk.stem.isri.ISRIStemmer (2005) is another stemmer that could be
# used in place of ARLSTem.


class TfidfReader:
    """Select the span of a paragraph that best matches a question.

    Every run of consecutive words inside a sentence of the paragraph (up to
    ``MAX_CANDIDATE_WORDS`` words long) is an answer candidate.  Candidates
    are expanded with their stems, vectorized with TF-IDF over 1- to 4-grams,
    and ``read`` returns the candidate whose vector has the highest cosine
    similarity with the (equally processed) question.
    """

    # Any token containing one of these characters is dropped by stem_string.
    SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'

    # Longest candidate span, in whitespace-separated words.
    MAX_CANDIDATE_WORDS = 14

    def __init__(self, P):
        """Build the candidate list and TF-IDF matrix for paragraph ``P``.

        Args:
            P: The paragraph text to extract answer candidates from.

        Raises:
            ValueError: If ``P`` yields no answer candidates (for example an
                empty string), since nothing could be vectorized.
        """
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        # Loaded and exposed as an attribute, but not passed to the
        # vectorizer: stop-word filtering is currently disabled.
        self.stopwords = stopwords.words('arabic')

        if not self.docs:
            # Fail with a clear message instead of the vectorizer's generic
            # "empty vocabulary" ValueError.
            raise ValueError("paragraph yields no answer candidates")

        docs_stem = [self.stem_string(doc) for doc in self.docs]
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem)

    def stem_string(self, str):  # noqa: A002 - name kept for keyword callers
        """Return ``str`` rewritten as ``"token stem "`` pairs.

        The text is tokenized with ``self.tokenizer``; tokens containing any
        character from ``SYMBOLS`` are discarded.  Each remaining token is
        emitted followed by its stem, and every word is followed by a single
        space (so a non-empty result ends with a space).

        Args:
            str: The text to process.

        Returns:
            The processed string, or ``""`` when no token survives.
        """
        pieces = []
        for token in self.tokenizer.tokenize(str):
            if any(ch in self.SYMBOLS for ch in token):
                continue
            pieces.append(token + " " + self.stemmer.stem(token) + " ")
        return "".join(pieces)

    def concatenateString(self, paragraph, start, length):
        """Join ``length`` consecutive words of ``paragraph`` with spaces.

        Args:
            paragraph: Indexable sequence of words.
            start: Index of the first word.
            length: Number of words to join; values below 1 behave like 1.

        Returns:
            The words ``paragraph[start]`` .. ``paragraph[start+length-1]``
            separated by single spaces.

        Raises:
            IndexError: If the requested span runs past the sequence.
        """
        # Index explicitly (rather than slicing) so an out-of-range span
        # still raises instead of being silently truncated.
        span = [paragraph[start + offset] for offset in range(max(length, 1))]
        return " ".join(span)

    def get_answer_canditates(self, paragraph):
        """Enumerate every candidate answer span of ``paragraph``.

        The paragraph is split into sentences with ``nltk.sent_tokenize`` and
        each sentence into whitespace-separated words.  For every start word,
        all spans of 1 .. ``MAX_CANDIDATE_WORDS`` words that fit inside the
        sentence are produced, shortest first.

        Args:
            paragraph: The paragraph text.

        Returns:
            A list of candidate strings (possibly empty).
        """
        candidates = []
        for sentence in nltk.sent_tokenize(paragraph):
            words = sentence.split()
            word_count = len(words)
            for first in range(word_count):
                longest = min(self.MAX_CANDIDATE_WORDS, word_count - first)
                candidates.extend(
                    self.concatenateString(words, first, size)
                    for size in range(1, longest + 1)
                )
        return candidates

    def read(self, P, Q):
        """Return the candidate span most similar to question ``Q``.

        Args:
            P: Unused; the paragraph given to the constructor is the one
                searched.  Kept so existing callers keep working.
            Q: The question text.

        Returns:
            The candidate string with the highest cosine similarity to ``Q``.
            On ties (including when every similarity is zero) the earliest
            candidate is returned.
        """
        query = self.stem_string(Q)
        query_tfidf = self.vectorizer.transform([query])
        # Shape is (n_candidates, 1); the single column holds the scores.
        similarities = cosine_similarity(self.tfidf_matrix, query_tfidf)
        best_index = int(np.argmax(similarities[:, 0]))
        return self.docs[best_index]


def Test_TfidfReader():
    """Print the span chosen by TfidfReader for a small English example."""
    P = "My name is hussein and my father work at the university. My mother was a murderer, she killed my father"
    reader = TfidfReader(P)
    print(reader.read(P, "at the university. My mother was"))