Python fuzzywuzzy.fuzz.partial_ratio() Examples

The following are 27 code examples of fuzzywuzzy.fuzz.partial_ratio(), collected from open-source projects. Each example is preceded by its source file, originating project, and license. You may also want to check out the other available functions and classes of the fuzzywuzzy.fuzz module.
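partial_ratio() scores the best-matching substring of the longer string against the shorter one, so a string that is fully contained in another scores 100 even when fuzz.ratio() does not. A minimal sketch (the exact scores may vary slightly between fuzzywuzzy releases):

from fuzzywuzzy import fuzz

# partial_ratio aligns the shorter string against the best-matching
# substring of the longer one
print(fuzz.ratio("this is a test", "this is a test!"))          # 97
print(fuzz.partial_ratio("this is a test", "this is a test!"))  # 100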
Example #1
Source File: nlp_feature_extraction.py    From kaggle-quora-dup with MIT License
def extract_features(df):
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    df["cwc_min"]       = list(map(lambda x: x[0], token_features))
    df["cwc_max"]       = list(map(lambda x: x[1], token_features))
    df["csc_min"]       = list(map(lambda x: x[2], token_features))
    df["csc_max"]       = list(map(lambda x: x[3], token_features))
    df["ctc_min"]       = list(map(lambda x: x[4], token_features))
    df["ctc_max"]       = list(map(lambda x: x[5], token_features))
    df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
    df["first_word_eq"] = list(map(lambda x: x[7], token_features))
    df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
    df["mean_len"]      = list(map(lambda x: x[9], token_features))

    print("fuzzy features..")
    df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"]  = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df 
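The row-wise apply pattern above works for any pair of text columns. A self-contained sketch of the same idea on toy data (the column names and rows here are illustrative, not from the original project):

import pandas as pd
from fuzzywuzzy import fuzz

df = pd.DataFrame({
    "question1": ["how do i learn python", "what is ai"],
    "question2": ["how can i learn python", "what is art"],
})
# one fuzzy score per row, as in extract_features above
df["fuzz_partial_ratio"] = df.apply(
    lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
print(df[["question1", "question2", "fuzz_partial_ratio"]])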
Example #2
Source File: feature_engineering.py    From CIKM-AnalytiCup-2018 with Apache License 2.0
def _create_fuzzy_wuzzy_features(self, df):
        df['fuzzy_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_set_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_set_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_partial_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.partial_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_token_sort_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.token_sort_ratio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_qratio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.QRatio(row['spn_1'], row['spn_2']), axis=1)
        df['fuzzy_WRatio'] = df[['spn_1', 'spn_2']].apply(lambda row: fuzz.WRatio(row['spn_1'], row['spn_2']), axis=1)
   
        def _get_longest_substr_ratio(a, b):
            strs = list(distance.lcsubstrings(a, b))
            if len(strs) == 0:
                return 0
            else:
                return len(strs[0]) / (min(len(a), len(b)) + 1)

        df['longest_substr_ratio'] = df[['spn_1', 'spn_2']].apply(lambda row: _get_longest_substr_ratio(row['spn_1'], row['spn_2']), axis=1) 
Example #3
Source File: py_op.py    From HUAWEIOCR-2019 with MIT License
def fuzz_list(node1_list, node2_list, score_baseline=66, proposal_num=10, string_map=None):
    node_dict = {}
    for i, node1 in enumerate(node1_list):
        match_score_dict = {}
        # iterate over a snapshot so removing exact matches from
        # node2_list does not skip elements
        for node2 in list(node2_list):
            if node1 != node2:
                if string_map is not None:
                    n1 = string_map(node1)
                    n2 = string_map(node2)
                    score = fuzz.partial_ratio(n1, n2)
                    if n1 == n2:
                        node2_list.remove(node2)
                else:
                    score = fuzz.partial_ratio(node1, node2)
                if score > score_baseline:
                    match_score_dict[node2] = score
            else:
                node2_list.remove(node2)
        node2_sort = sorted(match_score_dict.keys(), key=lambda k: match_score_dict[k], reverse=True)
        node_dict[node1] = [[n, match_score_dict[n]] for n in node2_sort[:proposal_num]]
        print(i, len(node1_list))
    return node_dict, node2_list
Example #4
Source File: source.py    From bridgy with MIT License
def search(self, targets, partial=True, fuzzy=False):
        allInstances = self.instances()
        matchedInstances = set()

        for host in targets:
            for instance in allInstances:
                names = [instance.name]
                if instance.aliases is not None:
                    names += list(instance.aliases)
                for name in names:
                    if host.lower() == name.lower():
                        matchedInstances.add((100, instance))
                    elif partial and host.lower() in name.lower():
                        matchedInstances.add((99, instance))

                    if fuzzy:
                        score = fuzz.partial_ratio(host.lower(), name.lower())
                        if score > 85 or host.lower() in name.lower():
                            matchedInstances.add((score, instance))

        # it is possible for the same instance to be matched, if so, it should only
        # appear on the return list once (still ordered by the most probable match)
        return list(collections.OrderedDict([(v, None) for k, v in sorted(list(matchedInstances))]).keys()) 
Example #5
Source File: __init__.py    From simhashpy with Apache License 2.0
def add_dup_simhash_caches(simhashcache, dup_obj_ids):
    if not dup_obj_ids:
        return
    old_dup_obj_ids = set(dup_obj_ids)
    start_time = time.time()
    for i, dup_obj_id in enumerate(dup_obj_ids, 1):
        with Timer(msg='fuzzy-like:%d %s' % (i, dup_obj_id)):
            logging.info('--' * 100)
            try:
                dup_simhash = SimHashCache.objects.get(obj_id=dup_obj_id)
            except Exception as e:
                print(e)
                continue
            sim_ratio = fuzz.partial_ratio(s1=simhashcache.text, s2=dup_simhash.text)
            logging.info(simhashcache.text)
            logging.info('--' * 20)
            logging.info(dup_simhash.text)
            logging.info("%d %s %s" % (sim_ratio, simhashcache.obj_id, dup_simhash.obj_id))

            # old_dup_obj_ids holds ids, so test the id, not the cache object
            if dup_obj_id not in old_dup_obj_ids:
                if sim_ratio > 50:
                    old_dup_obj_ids.add(dup_obj_id)
            else:
                if sim_ratio <= 50:
                    old_dup_obj_ids.remove(dup_obj_id) 
Example #6
Source File: process_train.py    From CogQA with MIT License
def find_near_matches(w, sentence):
    ret = []
    max_ratio = 0
    t = 0
    for word in sentence.split():
        while sentence[t] != word[0]:
            t += 1
        score = (fuzz.ratio(w, word) + fuzz.partial_ratio(w, word)) / 2
        if score > max_ratio:
            max_ratio = score
            ret = [(t, t + len(word))]
        elif score == max_ratio:
            ret.append((t, t + len(word)))
        else:
            pass
        t += len(word)
    return ret if max_ratio > 85 else [] 
Example #7
Source File: main.py    From Tools with MIT License
def inquiry(self):
		sentence = self.line_edit.text()
		matched = []
		score_thresh = self.getScoreThresh()
		if not sentence:
			# "Please enter the Lu Xun quote you want to look up first"
			QMessageBox.warning(self, "Warning", '请先输入需要查询的鲁迅名言')
		else:
			for p in self.paragraphs:
				score = fuzz.partial_ratio(p, sentence)
				if score >= score_thresh and len(sentence) <= len(p):
					matched.append([score, p])
			infos = []
			for match in matched:
				# "[Match score]: %d\n[Content]: %s\n"
				infos.append('[匹配度]: %d\n[内容]: %s\n' % (match[0], match[1]))
			if not infos:
				# "No sentence with a similarity above %d was matched."
				infos.append('未匹配到任何相似度大于%d的句子.\n' % score_thresh)
			self.text.setText('\n\n\n'.join(infos)[:-1]) 
Example #8
Source File: gnome-pass-search-provider.py    From gnome-pass-search-provider with GNU General Public License v3.0
def get_result_set(self, terms):
        if terms[0] == "otp":
            field = terms[0]
        elif terms[0].startswith(":"):
            field = terms[0][1:]
            terms = terms[1:]
        else:
            field = None

        name = "".join(terms)
        password_list = []
        for root, dirs, files in walk(self.password_store):
            dir_path = root[len(self.password_store) + 1 :]

            if dir_path.startswith("."):
                continue

            for filename in files:
                if filename[-4:] != ".gpg":
                    continue
                path = path_join(dir_path, filename)[:-4]
                password_list.append(path)

        results = [
            e[0]
            for e in process.extract(
                name, password_list, limit=5, scorer=fuzz.partial_ratio
            )
        ]
        if field == "otp":
            results = [f"otp {r}" for r in results]
        elif field is not None:
            results = [f":{field} {r}" for r in results]
        return results 
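process.extract() accepts any fuzz scorer through its scorer argument and returns (choice, score) tuples, which is why the list comprehension above unpacks e[0]. A quick illustrative sketch (the password paths are made up):

from fuzzywuzzy import fuzz, process

password_list = ["web/github", "web/gitlab", "mail/gmail", "bank/acme"]
# up to five (choice, score) tuples, best matches first
for choice, score in process.extract("git", password_list, limit=5,
                                     scorer=fuzz.partial_ratio):
    print(choice, score)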
Example #9
Source File: models.py    From videocr with MIT License
def is_similar_to(self, other: PredictedSubtitle) -> bool:
        return fuzz.partial_ratio(self.text, other.text) >= self.sim_threshold 
Example #10
Source File: checker.py    From XSStrike with GNU General Public License v3.0
def checker(url, params, headers, GET, delay, payload, positions, timeout, encoding):
    checkString = 'st4r7s' + payload + '3nd'
    if encoding:
        checkString = encoding(unquote(checkString))
    response = requester(url, replaceValue(
        params, xsschecker, checkString, copy.deepcopy), headers, GET, delay, timeout).text.lower()
    reflectedPositions = []
    for match in re.finditer('st4r7s', response):
        reflectedPositions.append(match.start())
    filledPositions = fillHoles(positions, reflectedPositions)
    #  Iterating over the reflections
    num = 0
    efficiencies = []
    for position in filledPositions:
        allEfficiencies = []
        try:
            reflected = response[reflectedPositions[num]
                :reflectedPositions[num]+len(checkString)]
            efficiency = fuzz.partial_ratio(reflected, checkString.lower())
            allEfficiencies.append(efficiency)
        except IndexError:
            pass
        if position:
            reflected = response[position:position+len(checkString)]
            if encoding:
                checkString = encoding(checkString.lower())
            efficiency = fuzz.partial_ratio(reflected, checkString)
            if reflected[:-2] == ('\\%s' % checkString.replace('st4r7s', '').replace('3nd', '')):
                efficiency = 90
            allEfficiencies.append(efficiency)
            efficiencies.append(max(allEfficiencies))
        else:
            efficiencies.append(0)
        num += 1
    return list(filter(None, efficiencies)) 
Example #11
Source File: uberquation.py    From Apostrophe with GNU General Public License v3.0
def sort_func(self, row_a, row_b, data=None):
        if self.b.get_object("searchentry1").get_text():
            if fuzz.partial_ratio(self.current_search, row_a.entry_name.lower()) > \
                fuzz.partial_ratio(self.current_search, row_b.entry_name.lower()):
                return False
            else:
                return True
        return False 
Example #12
Source File: uberquation.py    From Apostrophe with GNU General Public License v3.0
def filter_func(self, row, data=None):

        if fuzz.partial_ratio(self.current_search, row.entry_name.lower()) < 80:
            return False 
        # if not row.entry_name.startswith(self.b.get_object("searchentry1").get_text()):
        #     return False
        return True 
Example #13
Source File: preprocess_dataset.py    From tf_CFO with MIT License
def reverseLinking(sent, text_candidate):
    tokens = sent.split()
    label = ["O"] * len(tokens)
    text_attention_indices = None
    exact_match = False

    if text_candidate is None or len(text_candidate) == 0:
        return '<UNK>', label, exact_match

    # sorted by length
    for text in sorted(text_candidate, key=lambda x: len(x), reverse=True):
        pattern = r'(^|\s)(%s)($|\s)' % (re.escape(text))
        if re.search(pattern, sent):
            text_attention_indices = get_indices(tokens, text.split())
            break
    if text_attention_indices:
        exact_match = True
        for i in text_attention_indices:
            label[i] = 'I'
    else:
        try:
            v, score = process.extractOne(sent, text_candidate, scorer=fuzz.partial_ratio)
        except Exception:
            print("Extraction Error with FuzzyWuzzy : {} || {}".format(sent, text_candidate))
            return '<UNK>', label, exact_match
        v = v.split()
        n_gram_candidate = get_ngram(tokens)
        n_gram_candidate = sorted(n_gram_candidate, key=lambda x: fuzz.ratio(x[0], v), reverse=True)
        top = n_gram_candidate[0]
        for i in range(top[1], top[2]):
            label[i] = 'I'

    entity_text = []
    for l, t in zip(label, tokens):
        if l == 'I':
            entity_text.append(t)
    entity_text = " ".join(entity_text)
    label = " ".join(label)
    return entity_text, label, exact_match 
Example #14
Source File: server.py    From iWant with MIT License
def _leader_lookup(self, data):
        #  TODO: there is absolutely no use of sending uuid of the message initiator
        # uuid, text_search = data
        text_search = data['search_query']
        filtered_response = []
        for uuid in self.factory.data_from_peers.keys():
            for filename in self.factory.data_from_peers[uuid]['filenames']:
                if fuzz.partial_ratio(
                        text_search.lower(),
                        filename.lower()) >= 55:
                    file_hash = self.factory.data_from_peers[
                        uuid]['filenames'][filename]
                    try:
                        filtered_response.append(
                            self.factory.data_from_peers[uuid]['hashes'][file_hash])
                    except Exception as e:
                        print(e)
                        print_log(
                            'BIGGEST MESS UP {0}'.format(filename),
                            WARNING_LOG)
        if len(self.factory.data_from_peers.keys()) == 0:
            filtered_response = []

        # update_msg = Basemessage(key=SEARCH_RES, data=filtered_response)
        update_msg = bake(SEARCH_RES, search_query_response=filtered_response)
        self.sendLine(update_msg)  # this we are sending it back to the server
        #  leader will loseConnection with the requesting server
        self.transport.loseConnection() 
Example #15
Source File: ErPredictorES.py    From EARL with GNU General Public License v3.0
def erPredict(self, chunks):
        erpredictions = []
        combinedchunks = []
        for chunk in chunks:
            wordlist = []
            surfacestart = chunk[0][2]
            for word in chunk:
                wordlist.append(word[0])
                surfacelength = word[2]+word[3] - surfacestart
            wordlist = ' '.join(wordlist)
            combinedchunks.append((wordlist,surfacestart,surfacelength))
         
        for chunk in combinedchunks:
            x = None
            # Python 3 equivalents of the original Python 2
            # encode/translate calls: strip non-ASCII, then punctuation
            chunkk = chunk[0].encode('ascii', 'ignore').decode('ascii')
            chunkwords = chunkk.translate(str.maketrans('', '', string.punctuation))
            embedding = self.embed(chunkwords)
            esresult = self.es.search(index="dbentityindex11", body={"query":{"multi_match":{"query":chunkwords,"fields":["wikidataLabel", "dbpediaLabel^1.5"]}},"size":1})
            topresult = esresult['hits']['hits']
            if len(topresult) == 1:
                topresult = topresult[0]
                if 'dbpediaLabel' in topresult['_source']:
                    x = embedding + [topresult['_score']] + [fuzz.ratio(chunkwords, topresult['_source']['dbpediaLabel'])/100.0] + [fuzz.partial_ratio(chunkwords, topresult['_source']['dbpediaLabel'])/100.0] + [fuzz.token_sort_ratio(chunkwords, topresult['_source']['dbpediaLabel'])/100.0]
                if 'wikidataLabel' in topresult['_source']:
                    x = embedding + [topresult['_score']] + [fuzz.ratio(chunkwords, topresult['_source']['wikidataLabel'])/100.0] + [fuzz.partial_ratio(chunkwords, topresult['_source']['wikidataLabel'])/100.0] + [fuzz.token_sort_ratio(chunkwords, topresult['_source']['wikidataLabel'])/100.0]
            else:
                x = embedding + [0.0,0.0,0.0,0.0]
            # print(x, type(x))
            x = torch.FloatTensor(x)
            pred = self.ermodel(x)
            print(chunkwords, pred, pred[0])
            if pred[0] > 0.5:
                erpredictions.append({'chunk':chunkwords, 'surfacestart': chunk[1], 'surfacelength': chunk[2] , 'class':'entity'})
            else:
                erpredictions.append({'chunk':chunkwords, 'surfacestart': chunk[1], 'surfacelength': chunk[2] , 'class':'relation'})
        return erpredictions 
Example #16
Source File: converter.py    From seasonalbot with MIT License
def convert(self, ctx: Context, name: str) -> str:
        """Convert the input snake name to the closest matching Snake object."""
        await self.build_list()
        name = name.lower()

        if name == 'python':
            return 'Python (programming language)'

        def get_potential(iterable: Iterable, *, threshold: int = 80) -> List[str]:
            nonlocal name
            potential = []

            for item in iterable:
                original, item = item, item.lower()

                if name == item:
                    return [original]

                a, b = fuzz.ratio(name, item), fuzz.partial_ratio(name, item)
                if a >= threshold or b >= threshold:
                    potential.append(original)

            return potential

        # Handle special cases
        if name.lower() in self.special_cases:
            return self.special_cases.get(name.lower(), name.lower())

        names = {snake['name']: snake['scientific'] for snake in self.snakes}
        all_names = names.keys() | names.values()
        timeout = len(all_names) * (3 / 4)

        embed = discord.Embed(
            title='Found multiple choices. Please choose the correct one.', colour=0x59982F)
        embed.set_author(name=ctx.author.display_name, icon_url=ctx.author.avatar_url)

        name = await disambiguate(ctx, get_potential(all_names), timeout=timeout, embed=embed)
        return names.get(name, name) 
Example #17
Source File: shuriken_xss.py    From shuriken with MIT License
def detect_xss(self, payload, browser_object, user_screenshot_name,
                   injected_link):
        """Check the HTML source to determine if XSS payload was reflected."""
        # If fuzzy detection chosen, evaluate partial reflection of XSS
        # by tokenizing the HTML source and detecting parts of the payload
        # and source common to both.
        #
        # Other methods of scoring include fuzz.ratio(), fuzz.partial_ratio()
        # and fuzz.token_sort_ratio()
        partial_score = fuzz.token_set_ratio(
            payload.lower(), browser_object.html.lower())
        # Set the level of detection asked for by the user, e.g. Only detect
        # matches with score higher than 50% fuzzy detection
        fuzzy_level = self.user_args.FUZZY_DETECTION

        if payload.lower() in browser_object.html.lower():
            print(Color.GREEN + "\n[+] XSS vulnerability found:" + Color.END)

            # If user set the --screen flag to target, capture screenshot of
            # payload
            if user_screenshot_name is not None:
                self.take_screenshot(user_screenshot_name,
                                     browser_object, self.screen_index)

            # Add link to list of all positive XSS hits
            self.xss_links.append(injected_link)
            print(Color.BLUE + injected_link + Color.END)
        # If user enabled fuzzy detection and partial score was larger than
        # fuzz level, add it to partials list and print results
        elif fuzzy_level and (partial_score >= fuzzy_level):
            print(Color.YELLOW +
                  "\n[-] Partial XSS vulnerability found:" + Color.END)
            print(Color.BLUE + injected_link + Color.END)
            self.xss_partials.append(injected_link)
            print("Detection score: %s" % partial_score)
        else:
            print(Color.RED + "\n[+] No XSS detected at: \n" +
                  Color.BLUE + injected_link + Color.END)
            if fuzzy_level:
                print("Detection score: %s" % partial_score)
Example #18
Source File: py_op.py    From HUAWEIOCR-2019 with MIT License
def myfuzzymatch(srcs, objs, grade=80):
    matchDict = OrderedDict()
    for src in srcs:
        for obj in objs:
            value = fuzz.partial_ratio(src, obj)
            if value > grade:
                try:
                    matchDict[src].append(obj)
                except KeyError:
                    matchDict[src] = [obj]
    return matchDict
Example #19
Source File: bot.py    From app_rasa_chat_bot with MIT License
def fuzzy_match_ents(ents, choices, limit=2, thresh=80):
    fuzz_matches_out = []
    for ent in ents:
        top_matches = process.extract(
            ent,
            set(choices),
            limit=limit,
            scorer=fuzz.partial_ratio)
        for match, score in top_matches:
            if score >= thresh:
                fuzz_matches_out.append(match)
    return fuzz_matches_out 
Example #20
Source File: string_utils.py    From ph0neutria with Apache License 2.0
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: float) score.
    """
    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = (first_string, second_string)
        window_length = len(shorter)

        num_iterations = len(longer) - len(shorter) + 1

        for position in range(0, num_iterations):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            if l_ratio > 60:
                result = statistics.mean(
                    [100 - Levenshtein.distance(window, shorter) * 15, l_ratio, l_ratio])

            else:
                result = l_ratio

            if result > score:
                score = result

    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = statistics.mean(
            [100 - Levenshtein.distance(first_string, second_string) * 15, l_ratio, l_ratio])

    simple = fuzz.ratio(first_string, second_string)
    partial = fuzz.partial_ratio(first_string, second_string)
    sort = fuzz.token_sort_ratio(first_string, second_string)
    set_ratio = fuzz.token_set_ratio(first_string, second_string)

    score = max([score, simple, partial, sort, set_ratio])

    if score < 75:
        score = 0

    return score * 0.85 
Example #21
Source File: make_handcrafted_33_features.py    From wsdm19cup with MIT License
def extract_stat_features(self, df):
        df["title1_zh"] = df["title1_zh"].fillna("").apply(self.__preprocess__)
        df["title2_zh"] = df["title2_zh"].fillna("").apply(self.__preprocess__)

        print("token features...")
        token_features = df.apply(lambda x: self.__get_token_features__(x["title1_zh"], x["title2_zh"]), axis=1)
        df["cwc_min"]       = list(map(lambda x: x[0], token_features))
        df["cwc_max"]       = list(map(lambda x: x[1], token_features))
        df["csc_min"]       = list(map(lambda x: x[2], token_features))
        df["csc_max"]       = list(map(lambda x: x[3], token_features))
        df["ctc_min"]       = list(map(lambda x: x[4], token_features))
        df["ctc_max"]       = list(map(lambda x: x[5], token_features))
        df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
        df["first_word_eq"] = list(map(lambda x: x[7], token_features))
        df["abs_len_diff"]  = list(map(lambda x: x[8], token_features))
        df["mean_len"]      = list(map(lambda x: x[9], token_features))

        print("fuzzy features..")
        df["token_set_ratio"]       = df.apply(lambda x: fuzz.token_set_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["token_sort_ratio"]      = df.apply(lambda x: fuzz.token_sort_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["fuzz_ratio"]            = df.apply(lambda x: fuzz.QRatio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["fuzz_partial_ratio"]    = df.apply(lambda x: fuzz.partial_ratio(x["title1_zh"], x["title2_zh"]), axis=1)
        df["longest_substr_ratio"]  = df.apply(lambda x: self.__get_longest_substr_ratio__(x["title1_zh"], x["title2_zh"]), axis=1)
        
        if 'label' in df.columns.tolist():
            return df.drop(["title1_zh", "title2_zh", "label"], axis=1).values
        else:
            return df.drop(["title1_zh", "title2_zh"], axis=1).values 
The same extract_stat_features method also appears verbatim in train_predict_trees_batch1.py, train_predict_trees_batch2.py, and train_predict_trees_batch3.py from the wsdm19cup project (MIT License).
Example #22
Source File: util.py    From rules-bot with GNU Affero General Public License v3.0
def search(self, query):
        def processor(x):
            if isinstance(x, Issue):
                x = x.title
            return x.strip().lower()

        # We don't care about the score, so return first element
        # This must not happen while updating the self.issues dict so acquire the lock
        with self.issues_lock:
            return [result[0] for result in process.extract(query, self.issues, scorer=fuzz.partial_ratio,
                                                            processor=processor, limit=5)] 
Example #23
Source File: fuzz.py    From yui with GNU Affero General Public License v3.0
def partial_ratio(str1: str, str2: str) -> int:
    """Get partial fuzzy ratio with korean text"""

    return fuzz.partial_ratio(
        normalize_korean_nfc_to_nfd(str1), normalize_korean_nfc_to_nfd(str2),
    ) 
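Decomposing precomposed Hangul syllables (NFC) into their jamo (NFD) lets partial_ratio score matches on individual consonants and vowels rather than whole syllable blocks. normalize_korean_nfc_to_nfd is project-specific; a minimal stand-in using only the standard library (an assumption, not yui's actual implementation) might look like:

import unicodedata

from fuzzywuzzy import fuzz

def normalize_korean_nfc_to_nfd(value: str) -> str:
    # NFD splits each precomposed Hangul syllable into conjoining jamo,
    # so partially typed syllables can still match
    return unicodedata.normalize("NFD", value)

print(fuzz.partial_ratio(
    normalize_korean_nfc_to_nfd("안녕하세요"),
    normalize_korean_nfc_to_nfd("안녕"),
))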
Example #24
Source File: pubmed_robot.py    From robotreviewer with GNU General Public License v3.0
def pdf_annotate(self, data):

        title_text = data.get('title')
        if not title_text:
            log.error('Unable to run pubmed matching since we have no title')
            # unable to do pubmed unless we have a title, so just return the original data
            return data

        vec_q = self.vectorizer.transform([title_text])
        token_overlap = vec_q.dot(self.vec_ti.T)
        self.to = token_overlap
        best_ind = token_overlap.indices[token_overlap.data.argmax()]
        pmid = int(self.pmid_ind[best_ind])

        # checking both the overall similarity, and overlap similarity

        pubmed_data = self.query_pubmed(pmid)

        match_pc = fuzz.ratio(title_text.lower(), pubmed_data['title'].lower())
        match_pc_overlap = fuzz.partial_ratio(title_text.lower(), pubmed_data['title'].lower())

        # seems like a reasonable heuristic but not checked
        # (given that sometimes our query is a partial title
        # retrieved by Grobid)
        pubmed_data['pubmed_match_quality'] = sum([match_pc, match_pc_overlap])

        var_map = [('abstract', pubmed_data['abstract']),
                   ('pmid', pubmed_data['pmid']),
                   ('mesh', pubmed_data['mesh'])]


        if pubmed_data['pubmed_match_quality'] > 180:
            data.data['pubmed'] = pubmed_data # until setattr is worked out
        else:
            # keep it just in case, but don't replace a better quality match
            data.data['dubious'] = pubmed_data  # until setattr is worked out

        return data 
Example #25
Source File: bolt.py    From Bolt with GNU General Public License v3.0
def fuzzy(tokens):
    averages = []
    for token in tokens:
        sameTokenRemoved = False
        result = process.extract(token, tokens, scorer=fuzz.partial_ratio)
        scores = []
        for each in result:
            score = each[1]
            if score == 100 and not sameTokenRemoved:
                sameTokenRemoved = True
                continue
            scores.append(score)
        average = statistics.mean(scores)
        averages.append(average)
    return statistics.mean(averages) 
Example #26
Source File: deduplicate_sources.py    From NBA_Tutorials with MIT License
def check_names_fuzzy_match(row):
    row["name_match"] = fuzz.partial_ratio(row["Player"], row["PLAYER_NAME"]) > 60
    return row 
Example #27
Source File: cluster_for_short_text.py    From single-pass-clustering-for-chinese-text with MIT License
def write_cluster(cluster_topic, filepath):
    with open(filepath, "w") as w:
        for key, value in cluster_topic.items():
            cluster = [(text, sim) for text, sim in value if text != key]
            cluster = json.dumps(cluster, ensure_ascii=False)
            w.write("{}\t{}\n".format(key, cluster))

#def fuzz_sim(text1, text2, lower=True):
#    if lower:
#        text1, text2 = text1.lower(), text2.lower()
#    partial_ratio = fuzz.partial_ratio(text1, text2)/100
#    simple_ratio = fuzz.ratio(text1, text2)/100
#    return 0.8*partial_ratio + 0.2*simple_ratio