Python fuzzywuzzy.fuzz.partial_ratio() Examples
The following are 30 code examples of fuzzywuzzy.fuzz.partial_ratio().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module fuzzywuzzy.fuzz, or try the search function.
Example #1
Source File: nlp_feature_extraction.py From kaggle-quora-dup with MIT License | 7 votes |
def extract_features(df):
    """Add token-based and fuzzy-matching similarity features to ``df``.

    Both question columns are NaN-filled and passed through ``preprocess``;
    the feature columns are then written onto ``df`` itself.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    for column in ("question1", "question2"):
        df[column] = df[column].fillna("").apply(preprocess)

    def pairwise(func):
        # Evaluate ``func`` on the two question strings of every row.
        return df.apply(lambda row: func(row["question1"], row["question2"]), axis=1)

    print("token features...")
    token_features = pairwise(get_token_features)
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Each per-row result is indexed positionally: slot i feeds column i.
    for slot, column in enumerate(token_columns):
        df[column] = [record[slot] for record in token_features]

    print("fuzzy features..")
    df["token_set_ratio"] = pairwise(fuzz.token_set_ratio)
    df["token_sort_ratio"] = pairwise(fuzz.token_sort_ratio)
    df["fuzz_ratio"] = pairwise(fuzz.QRatio)
    df["fuzz_partial_ratio"] = pairwise(fuzz.partial_ratio)
    df["longest_substr_ratio"] = pairwise(get_longest_substr_ratio)
    return df
Example #2
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0 | 7 votes |
def _create_fuzzy_wuzzy_features(self, df):
    """Add fuzzywuzzy and longest-common-substring features to ``df``.

    Every feature compares the ``spn_1`` and ``spn_2`` values of a row and is
    stored as a new column on ``df``. Nothing is returned.

    Args:
        df: DataFrame with ``spn_1`` and ``spn_2`` columns.
    """
    def longest_substr_ratio(a, b):
        # Length of the first substring reported by distance.lcsubstrings,
        # divided by (length of the shorter input + 1).
        common = list(distance.lcsubstrings(a, b))
        if not common:
            return 0
        return len(common[0]) / (min(len(a), len(b)) + 1)

    pair = ['spn_1', 'spn_2']
    scorers = [
        ('fuzzy_ratio', fuzz.ratio),
        ('fuzzy_set_ratio', fuzz.token_set_ratio),
        ('fuzzy_partial_ratio', fuzz.partial_ratio),
        ('fuzzy_token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzzy_qratio', fuzz.QRatio),
        ('fuzzy_WRatio', fuzz.WRatio),
        ('longest_substr_ratio', longest_substr_ratio),
    ]
    for column, scorer in scorers:
        df[column] = df[pair].apply(
            lambda row, scorer=scorer: scorer(row['spn_1'], row['spn_2']),
            axis=1,
        )
Example #3
Source File: py_op.py From HUAWEIOCR-2019 with MIT License | 6 votes |
def fuzz_list(node1_list, node2_list, score_baseline=66, proposal_num=10, string_map=None):
    """Collect, for each node in ``node1_list``, its best fuzzy matches in ``node2_list``.

    Args:
        node1_list: Nodes to find proposals for.
        node2_list: Candidate nodes. This list is mutated: candidates equal to
            the current node (directly, or after ``string_map``) are removed.
        score_baseline: A candidate is kept only if its partial-ratio score is
            strictly greater than this value.
        proposal_num: Maximum number of proposals kept per node.
        string_map: Optional callable applied to both nodes before scoring.

    Returns:
        ``(node_dict, node2_list)`` where ``node_dict`` maps each node to a
        list of ``[candidate, score]`` pairs sorted by descending score.
    """
    node_dict = {}
    for i, node1 in enumerate(node1_list):
        match_score_dict = {}
        # Iterate over a snapshot: removing from the list that is being
        # iterated makes the loop skip the element following each removal.
        for node2 in list(node2_list):
            if node1 == node2:
                node2_list.remove(node2)
                continue
            if string_map is not None:
                n1 = string_map(node1)
                n2 = string_map(node2)
                score = fuzz.partial_ratio(n1, n2)
                if n1 == n2:
                    node2_list.remove(node2)
            else:
                score = fuzz.partial_ratio(node1, node2)
            if score > score_baseline:
                match_score_dict[node2] = score
        node2_sort = sorted(match_score_dict, key=match_score_dict.get, reverse=True)
        node_dict[node1] = [[n, match_score_dict[n]] for n in node2_sort[:proposal_num]]
        # Progress output; formatted explicitly so Python 2 and 3 print the same text.
        print('%s %s' % (i, len(node1_list)))
    return node_dict, node2_list
Example #4
Source File: source.py From bridgy with MIT License | 6 votes |
def search(self, targets, partial=True, fuzzy=False):
    """Return the instances whose name or aliases match any of ``targets``.

    Matching is case-insensitive. An exact match is recorded with score 100,
    a substring match (when ``partial`` is true) with score 99, and, when
    ``fuzzy`` is true, a ``fuzz.partial_ratio`` score is recorded if it is
    above 85 or the target is a substring of the name.

    Args:
        targets: Iterable of host strings to look for.
        partial: Accept substring matches.
        fuzzy: Additionally accept fuzzy matches.

    Returns:
        A list of matched instances without duplicates.
    """
    allInstances = self.instances()
    matchedInstances = set()

    for host in targets:
        needle = host.lower()
        for instance in allInstances:
            names = [instance.name]
            if instance.aliases is not None:
                names += list(instance.aliases)
            for name in names:
                candidate = name.lower()
                if needle == candidate:
                    matchedInstances.add((100, instance))
                elif partial and needle in candidate:
                    matchedInstances.add((99, instance))
                if fuzzy:
                    score = fuzz.partial_ratio(needle, candidate)
                    if score > 85 or needle in candidate:
                        matchedInstances.add((score, instance))

    # The same instance can be matched under several scores; it must appear
    # on the returned list only once, at the position of its first entry in
    # the sorted (score, instance) sequence.
    # NOTE(review): the sort is ascending, so lower scores come first -
    # confirm this is the ordering callers expect.
    ordered = collections.OrderedDict(
        (instance, None) for _, instance in sorted(matchedInstances)
    )
    return list(ordered.keys())
Example #5
Source File: __init__.py From simhashpy with Apache License 2.0 | 6 votes |
def add_dup_simhash_caches(simhashcache, dup_obj_ids):
    """Score ``simhashcache`` against each candidate duplicate and log the result.

    For every id in ``dup_obj_ids`` the matching ``SimHashCache`` row is
    loaded, its text is compared with ``simhashcache.text`` using
    ``fuzz.partial_ratio`` and both texts plus the score are logged. Ids whose
    row cannot be loaded are reported and skipped.

    Args:
        simhashcache: Object exposing ``text`` and ``obj_id``.
        dup_obj_ids: Iterable of candidate duplicate ids; nothing is done when
            it is empty.
    """
    if not dup_obj_ids:
        return
    old_dup_obj_ids = set(dup_obj_ids)
    # Not used in the visible portion of this function.
    start_time = time.time()
    for i, dup_obj_id in enumerate(dup_obj_ids, 1):
        with Timer(msg='fuzzy-like:%d %s' % (i, dup_obj_id)):
            logging.info('--' * 100)
            try:
                dup_simhash = SimHashCache.objects.get(obj_id=dup_obj_id)
            except Exception as e:
                # "except ... as" and print(...) work on both Python 2.6+ and 3.
                print(e)
                continue
            sim_ratio = fuzz.partial_ratio(s1=simhashcache.text, s2=dup_simhash.text)
            logging.info(simhashcache.text)
            logging.info('--' * 20)
            logging.info(dup_simhash.text)
            logging.info("%d %s %s" % (sim_ratio, simhashcache.obj_id, dup_simhash.obj_id))
            # NOTE(review): this tests the loaded object against a set built
            # from ids; ``dup_obj_id`` may have been intended - confirm.
            if dup_simhash not in old_dup_obj_ids:
                if sim_ratio > 50:
                    old_dup_obj_ids.add(dup_obj_id)
            else:
                if sim_ratio <= 50:
                    old_dup_obj_ids.remove(dup_obj_id)
Example #6
Source File: process_train.py From CogQA with MIT License | 6 votes |
def find_near_matches(w, sentence):
    """Locate the words of ``sentence`` that best fuzzy-match ``w``.

    Each whitespace-separated word is scored with the mean of ``fuzz.ratio``
    and ``fuzz.partial_ratio`` against ``w``.

    Args:
        w: The string to look for.
        sentence: The text to scan.

    Returns:
        A list of ``(start, end)`` character offsets into ``sentence`` for
        every word sharing the highest score, or ``[]`` when that score is
        not above 85.
    """
    best_score = 0
    spans = []
    cursor = 0
    for word in sentence.split():
        # Move forward to where this word begins in the original sentence.
        cursor = sentence.index(word[0], cursor)
        end = cursor + len(word)
        score = (fuzz.ratio(w, word) + fuzz.partial_ratio(w, word)) / 2
        if score > best_score:
            best_score, spans = score, [(cursor, end)]
        elif score == best_score:
            spans.append((cursor, end))
        cursor = end
    return spans if best_score > 85 else []
Example #7
Source File: main.py From Tools with MIT License | 6 votes |
def inquiry(self):
    """Look up the entered sentence among ``self.paragraphs`` and display the hits.

    Shows a warning box when the input is empty. Otherwise every paragraph
    that is at least as long as the input and whose partial-ratio score
    reaches the threshold is listed in ``self.text``; a fallback message is
    shown when nothing qualifies.
    """
    sentence = self.line_edit.text()
    score_thresh = self.getScoreThresh()
    if not sentence:
        QMessageBox.warning(self, "Warning", '请先输入需要查询的鲁迅名言')
        return

    infos = []
    for paragraph in self.paragraphs:
        score = fuzz.partial_ratio(paragraph, sentence)
        # Skip low scores and paragraphs shorter than the query.
        if score < score_thresh or len(sentence) > len(paragraph):
            continue
        infos.append('[匹配度]: %d\n[内容]: %s\n' % (score, paragraph))
    if not infos:
        infos.append('未匹配到任何相似度大于%d的句子.\n' % score_thresh)
    # Drop the trailing newline of the last entry.
    self.text.setText('\n\n\n'.join(infos)[:-1])
Example #8
Source File: gnome-pass-search-provider.py From gnome-pass-search-provider with GNU General Public License v3.0 | 5 votes |
def get_result_set(self, terms):
    """Return up to five password-store entries that best match ``terms``.

    A leading ``otp`` term, or a leading ``:field`` term, selects a field; the
    ``:field`` term is removed from the search terms. The remaining terms are
    concatenated and fuzzy-matched against every ``.gpg`` file found under
    ``self.password_store`` (directories whose relative path starts with a
    dot are skipped).

    Args:
        terms: Non-empty sequence of search terms.

    Returns:
        Matching entry paths without the ``.gpg`` suffix, prefixed with
        ``otp `` or ``:<field> `` when a field was selected.
    """
    first = terms[0]
    if first == "otp":
        field = first
    elif first.startswith(":"):
        field, terms = first[1:], terms[1:]
    else:
        field = None
    name = "".join(terms)

    password_list = []
    for root, _dirs, files in walk(self.password_store):
        # Path of this directory relative to the store root.
        dir_path = root[len(self.password_store) + 1 :]
        if dir_path.startswith("."):
            continue
        password_list.extend(
            path_join(dir_path, filename)[:-4]
            for filename in files
            if filename[-4:] == ".gpg"
        )

    matches = process.extract(
        name, password_list, limit=5, scorer=fuzz.partial_ratio
    )
    results = [match[0] for match in matches]
    if field == "otp":
        return [f"otp {r}" for r in results]
    if field is not None:
        return [f":{field} {r}" for r in results]
    return results
Example #9
Source File: models.py From videocr with MIT License | 5 votes |
def is_similar_to(self, other: PredictedSubtitle) -> bool:
    """Return whether the partial-ratio score of both texts reaches ``self.sim_threshold``."""
    similarity = fuzz.partial_ratio(self.text, other.text)
    return similarity >= self.sim_threshold
Example #10
Source File: checker.py From XSStrike with GNU General Public License v3.0 | 5 votes |
def checker(url, params, headers, GET, delay, payload, positions, timeout, encoding):
    """Measure how faithfully ``payload`` is reflected in the response.

    The payload is wrapped in the ``st4r7s`` / ``3nd`` markers, optionally
    encoded, injected in place of ``xsschecker`` and sent. Each reflection is
    then compared with the sent string using ``fuzz.partial_ratio``.

    Returns:
        The list of non-zero efficiency scores, one per filled position.
    """
    checkString = 'st4r7s' + payload + '3nd'
    if encoding:
        checkString = encoding(unquote(checkString))
    injected = replaceValue(params, xsschecker, checkString, copy.deepcopy)
    response = requester(url, injected, headers, GET, delay, timeout).text.lower()

    reflectedPositions = [match.start() for match in re.finditer('st4r7s', response)]
    filledPositions = fillHoles(positions, reflectedPositions)

    # Walk over the reflections
    efficiencies = []
    for num, position in enumerate(filledPositions):
        allEfficiencies = []
        try:
            start = reflectedPositions[num]
            reflected = response[start:start + len(checkString)]
            efficiency = fuzz.partial_ratio(reflected, checkString.lower())
            allEfficiencies.append(efficiency)
        except IndexError:
            # Fewer reflections were found than there are filled positions.
            pass
        if not position:
            efficiencies.append(0)
            continue
        reflected = response[position:position + len(checkString)]
        if encoding:
            checkString = encoding(checkString.lower())
        efficiency = fuzz.partial_ratio(reflected, checkString)
        # A reflection that is the bare payload preceded by a backslash is scored 90.
        if reflected[:-2] == ('\\%s' % checkString.replace('st4r7s', '').replace('3nd', '')):
            efficiency = 90
        allEfficiencies.append(efficiency)
        efficiencies.append(max(allEfficiencies))
    return list(filter(None, efficiencies))
Example #11
Source File: uberquation.py From Apostrophe with GNU General Public License v3.0 | 5 votes |
def sort_func(self, row_a, row_b, data=None):
    """Compare two rows by how well their entry names match the current search.

    Returns ``False`` when the search entry is empty or when ``row_a`` scores
    strictly higher than ``row_b``; ``True`` otherwise.
    """
    if not self.b.get_object("searchentry1").get_text():
        return False
    score_a = fuzz.partial_ratio(self.current_search, row_a.entry_name.lower())
    score_b = fuzz.partial_ratio(self.current_search, row_b.entry_name.lower())
    return not score_a > score_b
Example #12
Source File: uberquation.py From Apostrophe with GNU General Public License v3.0 | 5 votes |
def filter_func(self, row, data=None):
    """Return ``True`` unless the row's entry name scores below 80 against the current search."""
    similarity = fuzz.partial_ratio(self.current_search, row.entry_name.lower())
    return not similarity < 80
Example #13
Source File: preprocess_dataset.py From tf_CFO with MIT License | 5 votes |
def reverseLinking(sent, text_candidate):
    """Tag the tokens of ``sent`` that mention one of the candidate texts.

    Candidates are tried longest first for a whole-word match in ``sent``.
    If none matches, the candidate chosen by ``process.extractOne`` with
    ``fuzz.partial_ratio`` is compared against the n-grams of the sentence
    and the top-ranked n-gram is tagged instead.

    Args:
        sent: The sentence to tag.
        text_candidate: Collection of candidate entity strings, or ``None``.

    Returns:
        ``(entity_text, label, exact_match)``. On the early-exit paths (no
        candidates, or fuzzy extraction failed) ``entity_text`` is
        ``'<UNK>'`` and ``label`` is the untouched list of ``"O"`` tags;
        otherwise ``entity_text`` is the space-joined tagged tokens and
        ``label`` is a space-joined string of ``"O"``/``"I"`` tags.
    """
    tokens = sent.split()
    label = ["O"] * len(tokens)
    text_attention_indices = None
    exact_match = False

    if text_candidate is None or len(text_candidate) == 0:
        return '<UNK>', label, exact_match

    # Longest candidates first, so the most specific mention wins.
    for text in sorted(text_candidate, key=len, reverse=True):
        pattern = r'(^|\s)(%s)($|\s)' % (re.escape(text))
        if re.search(pattern, sent):
            text_attention_indices = get_indices(tokens, text.split())
            break

    if text_attention_indices:
        exact_match = True
        for i in text_attention_indices:
            label[i] = 'I'
    else:
        try:
            v, score = process.extractOne(sent, text_candidate, scorer=fuzz.partial_ratio)
        except Exception:
            # Best effort: report and give up on this sentence. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print("Extraction Error with FuzzyWuzzy : {} || {}".format(sent, text_candidate))
            return '<UNK>', label, exact_match
        v = v.split()
        n_gram_candidate = get_ngram(tokens)
        n_gram_candidate = sorted(n_gram_candidate, key=lambda x: fuzz.ratio(x[0], v), reverse=True)
        top = n_gram_candidate[0]
        # top[1] and top[2] are used as the start and end token offsets.
        for i in range(top[1], top[2]):
            label[i] = 'I'

    entity_text = " ".join(t for l, t in zip(label, tokens) if l == 'I')
    label = " ".join(label)
    return entity_text, label, exact_match
Example #14
Source File: server.py From iWant with MIT License | 5 votes |
def _leader_lookup(self, data):
    """Answer a search query from the file lists reported by the peers.

    Every filename known from ``self.factory.data_from_peers`` whose
    partial-ratio score against the query is at least 55 contributes the
    entry stored under its hash. The collected entries are sent back as a
    ``SEARCH_RES`` message and the connection is then closed.

    Args:
        data: Mapping that contains the ``search_query`` string.
    """
    # TODO: there is absolutely no use of sending uuid of the message initiator
    text_search = data['search_query'].lower()
    filtered_response = []
    for peer_data in self.factory.data_from_peers.values():
        filenames = peer_data['filenames']
        for filename in filenames:
            if fuzz.partial_ratio(text_search, filename.lower()) < 55:
                continue
            file_hash = filenames[filename]
            try:
                filtered_response.append(peer_data['hashes'][file_hash])
            except Exception as e:
                # print(...) with one argument behaves the same on Python 2 and 3.
                print(e)
                print_log(
                    'BIGGEST MESS UP {0}'.format(filename), WARNING_LOG)

    update_msg = bake(SEARCH_RES, search_query_response=filtered_response)
    # this we are sending it back to the server
    self.sendLine(update_msg)
    # leader will loseConnection with the requesting server
    self.transport.loseConnection()
Example #15
Source File: ErPredictorES.py From EARL with GNU General Public License v3.0 | 5 votes |
def erPredict(self, chunks):
    """Classify each chunk as an ``entity`` or a ``relation``.

    The words of every chunk are joined into one surface string. That string
    is embedded, looked up in the ``dbentityindex11`` index, and the
    embedding plus the search score and three fuzzy-match ratios against the
    returned label are fed to ``self.ermodel``.

    Args:
        chunks: Sequence of chunks; each chunk is a sequence of words indexed
            as ``word[0]`` (text), ``word[2]`` and ``word[3]`` (used to
            compute the surface start and length).

    Returns:
        A list of dicts with keys ``chunk``, ``surfacestart``,
        ``surfacelength`` and ``class``.
    """
    combinedchunks = []
    for chunk in chunks:
        surfacestart = chunk[0][2]
        last_word = chunk[-1]
        surfacelength = last_word[2] + last_word[3] - surfacestart
        text = ' '.join([word[0] for word in chunk])
        combinedchunks.append((text, surfacestart, surfacelength))

    erpredictions = []
    for text, surfacestart, surfacelength in combinedchunks:
        x = None
        chunkwords = text.encode('ascii','ignore').translate(None, string.punctuation)
        embedding = self.embed(chunkwords)

        def label_features(label):
            # Three fuzzy ratios against ``label``, each divided by 100.0.
            return [
                fuzz.ratio(chunkwords, label)/100.0,
                fuzz.partial_ratio(chunkwords, label)/100.0,
                fuzz.token_sort_ratio(chunkwords, label)/100.0,
            ]

        query = {"query":{"multi_match":{"query":chunkwords,"fields":["wikidataLabel", "dbpediaLabel^1.5"]}},"size":1}
        hits = self.es.search(index="dbentityindex11", body=query)['hits']['hits']
        if len(hits) == 1:
            hit = hits[0]
            # When both labels are present the wikidata features win.
            for label_key in ('dbpediaLabel', 'wikidataLabel'):
                if label_key in hit['_source']:
                    x = embedding + [hit['_score']] + label_features(hit['_source'][label_key])
        else:
            x = embedding + [0.0,0.0,0.0,0.0]

        x = torch.FloatTensor(x)
        pred = self.ermodel(x)
        print(chunkwords,pred,pred[0])
        erpredictions.append({
            'chunk': chunkwords,
            'surfacestart': surfacestart,
            'surfacelength': surfacelength,
            'class': 'entity' if pred[0] >0.5 else 'relation',
        })
    return erpredictions
Example #16
Source File: converter.py From seasonalbot with MIT License | 5 votes |
async def convert(self, ctx: Context, name: str) -> str:
    """Convert the input snake name to the closest matching Snake object.

    The body awaits ``self.build_list()`` and ``disambiguate(...)``, so this
    must be a coroutine function.

    Args:
        ctx: Invocation context; its author is shown on the selection embed.
        name: The snake name typed by the user (matched case-insensitively).

    Returns:
        The special-case value for ``name`` if one exists; otherwise the
        scientific name mapped to the chosen entry, or the chosen entry
        itself when it has no mapping.
    """
    await self.build_list()
    name = name.lower()

    if name == 'python':
        return 'Python (programming language)'

    def get_potential(iterable: Iterable, *, threshold: int = 80) -> List[str]:
        # Collect the items whose ratio or partial ratio against ``name``
        # reaches ``threshold``; an exact match short-circuits to itself.
        potential = []

        for item in iterable:
            original, item = item, item.lower()

            if name == item:
                return [original]

            a, b = fuzz.ratio(name, item), fuzz.partial_ratio(name, item)
            if a >= threshold or b >= threshold:
                potential.append(original)

        return potential

    # Handle special cases
    if name in self.special_cases:
        return self.special_cases.get(name, name)

    names = {snake['name']: snake['scientific'] for snake in self.snakes}
    all_names = names.keys() | names.values()
    timeout = len(all_names) * (3 / 4)

    embed = discord.Embed(
        title='Found multiple choices. Please choose the correct one.', colour=0x59982F)
    embed.set_author(name=ctx.author.display_name, icon_url=ctx.author.avatar_url)

    name = await disambiguate(ctx, get_potential(all_names), timeout=timeout, embed=embed)
    return names.get(name, name)
Example #17
Source File: shuriken_xss.py From shuriken with MIT License | 5 votes |
def detect_xss(self, payload, browser_object, user_screenshot_name, injected_link):
    """Check the HTML source to determine if XSS payload was reflected.

    An exact (case-insensitive) reflection is reported as a vulnerability,
    optionally captured in a screenshot, and recorded in ``self.xss_links``.
    Otherwise, if fuzzy detection is enabled, the payload and the page are
    compared with ``fuzz.token_set_ratio`` and a score at or above the
    user-selected level is recorded in ``self.xss_partials``.

    Args:
        payload: The injected XSS payload.
        browser_object: Object exposing the page source as ``html``.
        user_screenshot_name: Screenshot name, or ``None`` to skip capture.
        injected_link: The link that carried the payload.
    """
    payload_lower = payload.lower()
    html_lower = browser_object.html.lower()

    # Set the level of detection asked for by the user, e.g. Only detect
    # matches with score higher than 50% fuzzy detection
    fuzzy_level = self.user_args.FUZZY_DETECTION

    if payload_lower in html_lower:
        print(Color.GREEN + "\n[+] XSS vulnerability found:" + Color.END)

        # If user set the --screen flag to target, capture screenshot of
        # payload
        if user_screenshot_name is not None:
            self.take_screenshot(user_screenshot_name,
                                 browser_object, self.screen_index)

        # Add link to list of all positive XSS hits
        self.xss_links.append(injected_link)
        print(Color.BLUE + injected_link + Color.END)
        return

    # Fuzzy detection evaluates partial reflection by tokenizing the HTML
    # source and the payload. The score is only needed (and only computed)
    # when the user enabled fuzzy detection.
    #
    # Other methods of scoring include fuzz.ratio(), fuzz.partial_ratio()
    # and fuzz.token_sort_ratio()
    partial_score = None
    if fuzzy_level:
        partial_score = fuzz.token_set_ratio(payload_lower, html_lower)

    # If user enabled fuzzy detection and partial score was larger than
    # fuzz level, add it to partials list and print results
    if fuzzy_level and partial_score >= fuzzy_level:
        print(Color.YELLOW + "\n[-] Partial XSS vulnerability found:" + Color.END)
        print(Color.BLUE + injected_link + Color.END)
        self.xss_partials.append(injected_link)
        print("Detection score: %s" % partial_score)
    else:
        print(Color.RED + "\n[+] No XSS detected at: \n" +
              Color.BLUE + injected_link + Color.END)
        if fuzzy_level:
            print("Detection score: %s" % partial_score)
Example #18
Source File: py_op.py From HUAWEIOCR-2019 with MIT License | 5 votes |
def myfuzzymatch(srcs, objs, grade=80):
    """Group the ``objs`` that fuzzy-match each entry of ``srcs``.

    Args:
        srcs: Iterable of source strings (used as dictionary keys).
        objs: Iterable of candidate strings.
        grade: A candidate is kept only if its partial-ratio score against
            the source is strictly greater than this value.

    Returns:
        An ``OrderedDict`` mapping each source that has at least one match to
        the list of its matching candidates, in encounter order.
    """
    matchDict = OrderedDict()
    for src in srcs:
        for obj in objs:
            if fuzz.partial_ratio(src, obj) > grade:
                # setdefault creates the list on first use; this replaces a
                # bare ``except:`` that would also have hidden unrelated errors.
                matchDict.setdefault(src, []).append(obj)
    return matchDict
Example #19
Source File: bot.py From app_rasa_chat_bot with MIT License | 5 votes |
def fuzzy_match_ents(ents, choices, limit=2, thresh=80):
    """Return the choices that fuzzy-match any of the given entities.

    Args:
        ents: Iterable of entity strings to look up.
        choices: Iterable of candidate strings; duplicates are ignored.
        limit: Maximum number of candidates considered per entity.
        thresh: Minimum partial-ratio score for a candidate to be kept.

    Returns:
        A list of matching choices, in entity order; a choice appears once
        for every entity it matches.
    """
    # De-duplicate once instead of rebuilding the set for every entity.
    unique_choices = set(choices)
    fuzz_matches_out = []
    for ent in ents:
        top_matches = process.extract(
            ent, unique_choices, limit=limit, scorer=fuzz.partial_ratio)
        fuzz_matches_out.extend(
            match for match, score in top_matches if score >= thresh)
    return fuzz_matches_out
Example #20
Source File: string_utils.py From ph0neutria with Apache License 2.0 | 5 votes |
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    When the first string is shorter, it is slid across the second and the
    best window score is kept; otherwise the two strings are compared
    directly. The result is then raised to the best of four fuzzywuzzy
    ratios, zeroed when below 75, and scaled by 0.85.

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: number) score multiplied by 0.85.
    """
    def blended(candidate, reference, l_ratio):
        # Mean of a distance-based penalty score and the ratio counted twice.
        penalty = 100 - Levenshtein.distance(candidate, reference) * 15
        return statistics.mean([penalty, l_ratio, l_ratio])

    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = first_string, second_string
        window_length = len(shorter)

        for position in range(len(longer) - window_length + 1):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100
            # Only reasonably close windows get the distance penalty applied.
            result = blended(window, shorter, l_ratio) if l_ratio > 60 else l_ratio
            if result > score:
                score = result
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = blended(first_string, second_string, l_ratio)

    fuzzy_scores = [
        fuzz.ratio(first_string, second_string),
        fuzz.partial_ratio(first_string, second_string),
        fuzz.token_sort_ratio(first_string, second_string),
        fuzz.token_set_ratio(first_string, second_string),
    ]
    score = max([score] + fuzzy_scores)

    if score < 75:
        score = 0

    return score * 0.85
Example #21
Source File: make_handcrafted_33_features.py From wsdm19cup with MIT License | 5 votes |
def extract_stat_features(self, df):
    """Compute token-based and fuzzy-matching features for the two title columns.

    Both title columns are NaN-filled and preprocessed, and the feature
    columns are written onto ``df`` itself.

    Args:
        df: DataFrame with ``title1_zh`` and ``title2_zh`` columns and an
            optional ``label`` column.

    Returns:
        The ``.values`` of ``df`` after dropping the two title columns and,
        if present, ``label``.
    """
    for column in ("title1_zh", "title2_zh"):
        df[column] = df[column].fillna("").apply(self.__preprocess__)

    def pairwise(func):
        # Evaluate ``func`` on the two titles of every row.
        return df.apply(lambda row: func(row["title1_zh"], row["title2_zh"]), axis=1)

    print("token features...")
    token_features = pairwise(self.__get_token_features__)
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Each per-row result is indexed positionally: slot i feeds column i.
    for slot, column in enumerate(token_columns):
        df[column] = [record[slot] for record in token_features]

    print("fuzzy features..")
    df["token_set_ratio"] = pairwise(fuzz.token_set_ratio)
    df["token_sort_ratio"] = pairwise(fuzz.token_sort_ratio)
    df["fuzz_ratio"] = pairwise(fuzz.QRatio)
    df["fuzz_partial_ratio"] = pairwise(fuzz.partial_ratio)
    df["longest_substr_ratio"] = pairwise(self.__get_longest_substr_ratio__)

    dropped = ["title1_zh", "title2_zh"]
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
Example #22
Source File: train_predict_trees_batch2.py From wsdm19cup with MIT License | 5 votes |
def extract_stat_features(self, df):
    """Compute token-based and fuzzy-matching features for the two title columns.

    Both title columns are NaN-filled and preprocessed, and the feature
    columns are written onto ``df`` itself.

    Args:
        df: DataFrame with ``title1_zh`` and ``title2_zh`` columns and an
            optional ``label`` column.

    Returns:
        The ``.values`` of ``df`` after dropping the two title columns and,
        if present, ``label``.
    """
    for column in ("title1_zh", "title2_zh"):
        df[column] = df[column].fillna("").apply(self.__preprocess__)

    def pairwise(func):
        # Evaluate ``func`` on the two titles of every row.
        return df.apply(lambda row: func(row["title1_zh"], row["title2_zh"]), axis=1)

    print("token features...")
    token_features = pairwise(self.__get_token_features__)
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Each per-row result is indexed positionally: slot i feeds column i.
    for slot, column in enumerate(token_columns):
        df[column] = [record[slot] for record in token_features]

    print("fuzzy features..")
    df["token_set_ratio"] = pairwise(fuzz.token_set_ratio)
    df["token_sort_ratio"] = pairwise(fuzz.token_sort_ratio)
    df["fuzz_ratio"] = pairwise(fuzz.QRatio)
    df["fuzz_partial_ratio"] = pairwise(fuzz.partial_ratio)
    df["longest_substr_ratio"] = pairwise(self.__get_longest_substr_ratio__)

    dropped = ["title1_zh", "title2_zh"]
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
Example #23
Source File: train_predict_trees_batch1.py From wsdm19cup with MIT License | 5 votes |
def extract_stat_features(self, df):
    """Compute token-based and fuzzy-matching features for the two title columns.

    Both title columns are NaN-filled and preprocessed, and the feature
    columns are written onto ``df`` itself.

    Args:
        df: DataFrame with ``title1_zh`` and ``title2_zh`` columns and an
            optional ``label`` column.

    Returns:
        The ``.values`` of ``df`` after dropping the two title columns and,
        if present, ``label``.
    """
    for column in ("title1_zh", "title2_zh"):
        df[column] = df[column].fillna("").apply(self.__preprocess__)

    def pairwise(func):
        # Evaluate ``func`` on the two titles of every row.
        return df.apply(lambda row: func(row["title1_zh"], row["title2_zh"]), axis=1)

    print("token features...")
    token_features = pairwise(self.__get_token_features__)
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Each per-row result is indexed positionally: slot i feeds column i.
    for slot, column in enumerate(token_columns):
        df[column] = [record[slot] for record in token_features]

    print("fuzzy features..")
    df["token_set_ratio"] = pairwise(fuzz.token_set_ratio)
    df["token_sort_ratio"] = pairwise(fuzz.token_sort_ratio)
    df["fuzz_ratio"] = pairwise(fuzz.QRatio)
    df["fuzz_partial_ratio"] = pairwise(fuzz.partial_ratio)
    df["longest_substr_ratio"] = pairwise(self.__get_longest_substr_ratio__)

    dropped = ["title1_zh", "title2_zh"]
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
Example #24
Source File: train_predict_trees_batch3.py From wsdm19cup with MIT License | 5 votes |
def extract_stat_features(self, df):
    """Compute token-based and fuzzy-matching features for the two title columns.

    Both title columns are NaN-filled and preprocessed, and the feature
    columns are written onto ``df`` itself.

    Args:
        df: DataFrame with ``title1_zh`` and ``title2_zh`` columns and an
            optional ``label`` column.

    Returns:
        The ``.values`` of ``df`` after dropping the two title columns and,
        if present, ``label``.
    """
    for column in ("title1_zh", "title2_zh"):
        df[column] = df[column].fillna("").apply(self.__preprocess__)

    def pairwise(func):
        # Evaluate ``func`` on the two titles of every row.
        return df.apply(lambda row: func(row["title1_zh"], row["title2_zh"]), axis=1)

    print("token features...")
    token_features = pairwise(self.__get_token_features__)
    token_columns = (
        "cwc_min",
        "cwc_max",
        "csc_min",
        "csc_max",
        "ctc_min",
        "ctc_max",
        "last_word_eq",
        "first_word_eq",
        "abs_len_diff",
        "mean_len",
    )
    # Each per-row result is indexed positionally: slot i feeds column i.
    for slot, column in enumerate(token_columns):
        df[column] = [record[slot] for record in token_features]

    print("fuzzy features..")
    df["token_set_ratio"] = pairwise(fuzz.token_set_ratio)
    df["token_sort_ratio"] = pairwise(fuzz.token_sort_ratio)
    df["fuzz_ratio"] = pairwise(fuzz.QRatio)
    df["fuzz_partial_ratio"] = pairwise(fuzz.partial_ratio)
    df["longest_substr_ratio"] = pairwise(self.__get_longest_substr_ratio__)

    dropped = ["title1_zh", "title2_zh"]
    if 'label' in df.columns.tolist():
        dropped.append("label")
    return df.drop(dropped, axis=1).values
Example #25
Source File: util.py From rules-bot with GNU Affero General Public License v3.0 | 5 votes |
def search(self, query):
    """Return the five entries of ``self.issues`` that best match ``query``.

    Entries are compared by their stripped, lower-cased text; for ``Issue``
    objects the ``title`` is used. Scores are discarded.
    """
    def processor(entry):
        text = entry.title if isinstance(entry, Issue) else entry
        return text.strip().lower()

    # This must not happen while updating the self.issues dict so acquire the lock
    with self.issues_lock:
        matches = process.extract(query,
                                  self.issues,
                                  scorer=fuzz.partial_ratio,
                                  processor=processor,
                                  limit=5)
        # We don't care about the score, so keep only the first element
        return [match[0] for match in matches]
Example #26
Source File: fuzz.py From yui with GNU Affero General Public License v3.0 | 5 votes |
def partial_ratio(str1: str, str2: str) -> int:
    """Get partial fuzzy ratio with korean text

    Both inputs are passed through ``normalize_korean_nfc_to_nfd`` before
    being scored with ``fuzz.partial_ratio``.
    """
    decomposed1 = normalize_korean_nfc_to_nfd(str1)
    decomposed2 = normalize_korean_nfc_to_nfd(str2)
    return fuzz.partial_ratio(decomposed1, decomposed2)
Example #27
Source File: pubmed_robot.py From robotreviewer with GNU General Public License v3.0 | 5 votes |
def pdf_annotate(self, data):
    """Attach the best-matching PubMed record to ``data`` based on its title.

    The title is vectorized and compared with ``self.vec_ti``; the entry with
    the largest overlap is fetched with ``self.query_pubmed``. The sum of the
    ratio and partial-ratio scores between both titles is stored as
    ``pubmed_match_quality``. A quality above 180 stores the record under
    ``data.data['pubmed']``, anything else under ``data.data['dubious']``.

    Args:
        data: Object supporting ``get('title')`` and exposing a ``data`` dict.

    Returns:
        The same ``data`` object (unchanged when it has no title).
    """
    title_text = data.get('title')
    if not title_text:
        log.error('Unable to run pubmed matching since we have no title')
        # unable to do pubmed unless we have a title, so just return the original data
        return data

    vec_q = self.vectorizer.transform([title_text])
    token_overlap = vec_q.dot(self.vec_ti.T)
    self.to = token_overlap
    best_ind = token_overlap.indices[token_overlap.data.argmax()]
    pmid = int(self.pmid_ind[best_ind])

    # checking both the overall similarity, and overlap similarity
    pubmed_data = self.query_pubmed(pmid)
    query_title = title_text.lower()
    pubmed_title = pubmed_data['title'].lower()
    match_pc = fuzz.ratio(query_title, pubmed_title)
    match_pc_overlap = fuzz.partial_ratio(query_title, pubmed_title)

    # seems like a reasonable heuristic but not checked
    # (given that sometimes our query is a partial title
    # retrieved by Grobid)
    pubmed_data['pubmed_match_quality'] = match_pc + match_pc_overlap

    if pubmed_data['pubmed_match_quality'] > 180:
        data.data['pubmed'] = pubmed_data  # until setattr is worked out
    else:
        # keep it just in case, but don't replace better quality match
        data.data['dubious'] = pubmed_data  # until setattr is worked out
    return data
Example #28
Source File: bolt.py From Bolt with GNU General Public License v3.0 | 5 votes |
def fuzzy(tokens):
    """Return the mean pairwise partial-ratio similarity of ``tokens``.

    Each token is scored against the results of ``process.extract`` over the
    whole list. The first score of exactly 100 is skipped (the token matching
    itself) and the rest are averaged; the return value is the mean of those
    per-token averages.
    """
    averages = []
    for token in tokens:
        matches = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = []
        self_match_skipped = False
        for match in matches:
            score = match[1]
            if not self_match_skipped and score == 100:
                self_match_skipped = True
            else:
                scores.append(score)
        averages.append(statistics.mean(scores))
    return statistics.mean(averages)
Example #29
Source File: deduplicate_sources.py From NBA_Tutorials with MIT License | 5 votes |
def check_names_fuzzy_match(row):
    """Set ``row["name_match"]`` to whether both player names score above 60, and return ``row``."""
    similarity = fuzz.partial_ratio(row["Player"], row["PLAYER_NAME"])
    row["name_match"] = similarity > 60
    return row
Example #30
Source File: cluster_for_short_text.py From single-pass-clustering-for-chinese-text with MIT License | 5 votes |
def write_cluster(cluster_topic, filepath):
    """Write every cluster as one tab-separated line to ``filepath``.

    Each line holds the cluster key, a tab, and the JSON list of the
    ``(text, sim)`` members whose text differs from the key.

    Args:
        cluster_topic: Mapping from cluster key to an iterable of
            ``(text, sim)`` pairs.
        filepath: Destination path; an existing file is overwritten.
    """
    # ensure_ascii=False emits non-ASCII text verbatim, so the file encoding
    # is pinned to UTF-8 instead of depending on the platform default.
    with open(filepath, "w", encoding="utf-8") as w:
        for key, value in cluster_topic.items():
            members = [(text, sim) for text, sim in value if text != key]
            w.write("{}\t{}\n".format(key, json.dumps(members, ensure_ascii=False)))


#def fuzz_sim(text1, text2, lower=True):
#    if lower:
#        text1, text2 = text1.lower(), text2.lower()
#    partial_ratio = fuzz.partial_ratio(text1, text2)/100
#    simple_ratio = fuzz.ratio(text1, text2)/100
#    return 0.8*partial_ratio + 0.2*simple_ratio