Python fuzzywuzzy.process.extract() Examples

The following are 25 code examples of fuzzywuzzy.process.extract(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module fuzzywuzzy.process, or try the search function.

Example #1

Source File: qa_to_oie.py From supervised-oie with MIT License

6 votes

def fuzzy_match_word(word, words, limit):
    """
    Return the indexes in ``words`` that match ``word`` exactly or fuzzily.

    Exact matches (``w == word``) are always included. On top of those, every
    position whose word is among the fuzzy matches scoring above 70 is added.
    ``limit`` controls the number of choices the fuzzy matcher may return.

    The result is a list built from a set, so it carries no duplicates and no
    guaranteed ordering.
    """
    exact_matches = {i for (i, w) in enumerate(words) if w == word}
    # Log exactly one of the two outcomes; previously "No exact match" was
    # emitted even right after "Found exact match".
    if exact_matches:
        logging.debug("Found exact match for {}".format(word))
    else:
        logging.debug("No exact match for: {}".format(word))

    # Fuzzy matching still runs in both cases: it allows some variance which
    # extractOne misses.
    # For example: "Armstrong World Industries Inc" in "Armstrong World Industries Inc. agreed in principle to sell its carpet operations to Shaw Industries Inc ."
    scored = process.extract(word, words, processor=semi_process, limit=limit)
    best_matches = [w for (w, s) in scored if s > 70]
    logging.debug("Best matches = {}".format(best_matches))
    return list(
        exact_matches.union(i for (i, w) in enumerate(words) if w in best_matches)
    )


# Flatten a list of lists

Example #2

Source File: main.py From squeeze-alexa with GNU General Public License v3.0

6 votes

def _genres_from_slots(self, slots: Iterable[str], genres: Iterable[str]):
        """Return the set of genres fuzzy-matched by any of the slot values.

        Each slot is matched against ``genres`` independently and the
        per-slot results are unioned together.
        """
        def genres_from(slot_value):
            # Falsy slots contribute nothing.
            if not slot_value:
                return set()
            guesses = process.extract(slot_value, genres)[:MAX_GUESSES_PER_SLOT]
            print_d("Raw genre results: {data}", data=guesses)
            # A sufficiently confident guess wins outright and excludes the
            # other genres for this slot.
            for name, confidence in guesses:
                if confidence > MinConfidences.SINGLE_GENRE:
                    return {name}
            # Otherwise keep every non-empty guess that clears the lower bar.
            return {
                name
                for name, confidence in guesses
                if name and int(confidence) >= MinConfidences.MULTI_GENRE
            }

        return set().union(*(genres_from(slot) for slot in slots))

Example #3

Source File: chatbot_fuzzy.py From nlp_xiaojiang with MIT License

6 votes

def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
    '''Rank entries of ``collection`` against ``user_input`` by edit distance.

    Relatively slow, but unlike plain matching it copes with entries whose
    characters differ from the input. Returns ``None`` when no entry shares
    at least one character with ``user_input``; otherwise returns the result
    of ``process.extract`` (at most 20 matches) scored with
    ``fuzz.token_set_ratio``.
    '''
    # Keep only the entries that contain at least one character of the input.
    sharing = [
        entry for entry in collection
        if any(char in entry for char in user_input)
    ]
    if not sharing:
        return None
    unique_entries = list(set(sharing))

    # Pair every entry with its shared-character count, best first, and keep
    # at most the 500 strongest candidates.
    same_char_list = [
        (entry, count_same_char(user_input, entry)) for entry in unique_entries
    ]
    same_char_list.sort(key=lambda pair: pair[1], reverse=True)
    same_char_list = same_char_list[:500]

    return process.extract(
        user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20
    )

Example #4

Source File: internal.py From cheat.sh with MIT License

6 votes

def _get_page(self, topic, request_options=None):
        """Build the "unknown topic" page with up to three suggestions.

        Suggestions come from the known topics of the same kind as ``topic``:
        topics starting with ':' are only compared with other ':' topics, and
        vice versa. ``request_options`` is accepted but not used here.
        """
        all_topics = self.get_topics_list()
        wants_colon = topic.startswith(':')
        topics_list = [
            name for name in all_topics if name.startswith(':') == wants_colon
        ]

        # The two call shapes differ only in how the top three are selected.
        if _USING_FUZZYWUZZY:
            ranked = process.extract(topic, topics_list, scorer=fuzz.ratio)
            possible_topics = ranked[:3]
        else:
            possible_topics = process.extract(topic, topics_list, limit=3, scorer=fuzz.ratio)

        bullet_lines = [("    * %s %s" % match) for match in possible_topics]
        possible_topics_text = "\n".join(bullet_lines)
        return """
Unknown topic.
Do you mean one of these topics maybe?

%s
    """ % possible_topics_text

Example #5

Source File: tag_ner.py From TaskBot with GNU General Public License v3.0

5 votes

def extract(self, context):
        """Return the best ``Tag`` for ``context["query"]``, or ``None``.

        Keywords are fuzzy-matched against the query; matches scoring below
        ``self.threshold`` are dropped. The survivors are ranked by score plus
        a small bonus for longer keywords (a tenth of the keyword length).
        """
        scored = process.extract(context["query"], self.keywords)
        print(scored)
        kept = [pair for pair in scored if pair[1] >= self.threshold]
        kept.sort(key=lambda pair: pair[1] + len(pair[0])/10, reverse=True)
        # Every surviving keyword is converted, as before, not just the first.
        tags = [Tag(TAGMAP[pair[0]]) for pair in kept]

        return tags[0] if tags else None

Example #6

Source File: smart_bubble.py From Persimmon with MIT License

5 votes

def search(self, string: str):
        """Fill ``self.rv.data`` with the rows matching ``string``.

        With an empty query every cached entry is listed. Otherwise the cache
        is fuzzy-searched and only entries scoring above 50 are kept.
        """
        def make_row(name, class_):
            # One row of the results view; the key order matches both branches.
            return {'cls_name': name, 'cls_': class_, 'bub': self,
                    'backdrop': self.backdrop, 'pin': self.pin,
                    'block_pos': self.pos}

        if not string:
            self.rv.data = [make_row(name, class_)
                            for class_, name in self.cache.items()]
            return

        hits = process.extract(string, self.cache, limit=len(self.cache))
        # Each hit is indexed as (name, score, class): [0], [1] and [2].
        self.rv.data = [make_row(hit[0], hit[2]) for hit in hits if hit[1] > 50]

Example #7

Source File: fuzzy_search.py From VideoHub with MIT License

5 votes

def fuzzy(search_key, videos, video_titles):
    """
    - Returns a list of closest matching video IDs.

    The ten best-matching titles are looked up in ``videos`` (ID -> title);
    every ID whose title equals a matched title is returned, grouped by title
    in match order.
    """
    matched_titles = [
        match[0] for match in process.extract(search_key, video_titles, limit=10)
    ]
    return [
        video_id
        for title in matched_titles
        for video_id in videos
        if title == videos[video_id]
    ]

Example #8

Source File: gnome-pass-search-provider.py From gnome-pass-search-provider with GNU General Public License v3.0

5 votes

def get_result_set(self, terms):
        """Return up to five password entries matching the search terms.

        A leading ``otp`` term selects the "otp" field (and stays part of the
        search text); a leading ``:name`` term selects field ``name`` and is
        removed from the search text. Results are prefixed with the selected
        field, if any.
        """
        leading = terms[0]
        if leading == "otp":
            field = leading
        elif leading.startswith(":"):
            field, terms = leading[1:], terms[1:]
        else:
            field = None

        name = "".join(terms)

        # Collect every "*.gpg" file below the store as a path relative to it,
        # without the extension; directories whose relative path starts with
        # "." are skipped.
        prefix_length = len(self.password_store) + 1
        password_list = []
        for root, _dirs, filenames in walk(self.password_store):
            relative_dir = root[prefix_length:]
            if relative_dir.startswith("."):
                continue
            password_list.extend(
                path_join(relative_dir, filename)[:-4]
                for filename in filenames
                if filename[-4:] == ".gpg"
            )

        ranked = process.extract(
            name, password_list, limit=5, scorer=fuzz.partial_ratio
        )
        results = [entry[0] for entry in ranked]

        if field == "otp":
            return [f"otp {r}" for r in results]
        if field is not None:
            return [f":{field} {r}" for r in results]
        return results

Example #9

Source File: provider.py From feeluown-core with MIT License

5 votes

def search(self, keyword, **kwargs):
        """Fuzzy-search ``self.songs`` and wrap the hits in an ``LSearchModel``.

        Each song is represented by "<title> <artists_name><identifier>";
        ``limit`` (keyword argument, default 10) caps the number of candidates
        considered, and only candidates scoring above 80 are returned.
        """
        limit = kwargs.get('limit', 10)
        songs_by_repr = {
            song.title + ' ' + song.artists_name + str(song.identifier): song
            for song in self.songs
        }
        matches = process.extract(keyword, songs_by_repr.keys(), limit=limit)
        # A score above 80 means the keyword is almost contained in the key.
        result_songs = [songs_by_repr[text] for text, score in matches if score > 80]
        return LSearchModel(q=keyword, songs=result_songs)

Example #10

Source File: bot.py From app_rasa_chat_bot with MIT License

5 votes

def fuzzy_match_ents(ents, choices, limit=2, thresh=80):
    """Return the choices that fuzzily match any of the given entities.

    For every entity the ``limit`` best partial-ratio matches among the
    de-duplicated ``choices`` are taken, and those scoring at least
    ``thresh`` are appended to the result in entity order.
    """
    return [
        match
        for ent in ents
        for match, score in process.extract(
            ent,
            set(choices),
            limit=limit,
            scorer=fuzz.partial_ratio)
        if score >= thresh
    ]

Example #11

Source File: ida_fuzzy.py From IDAFuzzy with MIT License

5 votes

def OnFormChange(self, fid):
        """Handle a change notification for the form control ``fid``.

        Always returns 1. A change of the chooser records the selected row; a
        change of the text input restarts the background fuzzy search with the
        new text (or just refreshes the chooser when the text is empty).
        """
        # -1 is "initialize" and -2 is "terminate": nothing to do for either.
        if fid == -1 or fid == -2:
            return 1

        if fid == self.cEChooser.id:
            self.selected_id = self.GetControlValue(self.cEChooser)[0]
            return 1

        if fid != self.iStr1.id:
            return 1

        self.s = self.GetControlValue(self.iStr1)
        self.EChooser.items = []
        if self.s == '':
            self.RefreshField(self.cEChooser)
            return 1

        # When typing fast, the previously started FuzzySearch may still be
        # running here, so shut it down before starting a new one.
        self.fst.stop()
        self.fst.quit()
        self.fst.terminate_event.set()
        self.fst.wait()
        # self.fst.terminate() was considered but judged a little dangerous.

        # stop() and quit() take time (and may be non-blocking), so some
        # start() calls would be ignored on a reused thread; re-creating the
        # thread avoids that.
        self.fst = FuzzySearchThread()
        self.fst.refresh_list.connect(self.refresh_list)
        self.fst.finished.connect(self.finished)
        self.fst.setup(self.s)
        self.fst.start()

        # A direct process.extract(s, names, limit=10) is not used here:
        # f.iStr1.value won't change until Form.Execute() returns.
        return 1

Example #12

Source File: ida_fuzzy.py From IDAFuzzy with MIT License

5 votes

def run(self):
        """Run one fuzzy search for ``self.s`` and publish the matched names.

        The matches are emitted through ``refresh_list`` as separate
        arguments, padded with empty strings. ``stop()`` and the ``finished``
        signal always follow, whether or not the search was interrupted.
        """
        scorer = functools.partial(hooked_scorer, terminate_event=self.terminate_event)
        try:
            # f.iStr1.value won't change until Form.Execute() returns.
            matches = process.extract(self.s, names, limit=LISTLEN, scorer=scorer)
            extracts = [match[0] for match in matches]
            # NOTE(review): pads up to 10 entries while the search limit is
            # LISTLEN -- confirm the two are meant to be the same number.
            extracts.extend([""] * (10 - len(matches)))
            self.refresh_list.emit(*extracts)  # calls the main thread's UI function
        except TerminateException:
            # Presumably raised via hooked_scorer once terminate_event is set;
            # an interrupted search simply publishes nothing.
            pass
        self.stop()
        self.finished.emit()


# --------------------------------------------------------------------------

Example #13

Source File: utilities.py From estimagic with BSD 3-Clause "New" or "Revised" License

5 votes

def propose_algorithms(requested_algo, algos, number=3):
    """Propose a number of algorithms based on similarity to the requested algorithm.

    Candidates are named ``"<package>_<algorithm>"`` and ranked by fuzzy
    similarity to ``requested_algo``.

    Args:
        requested_algo (str): From the user requested algorithm.
        algos (dict(str, list(str))): Dictionary where keys are the package and values
            are lists of algorithms.
        number (int) : Number of proposals.

    Returns:
        proposals (list(str)): List of proposed algorithms.

    Example:
        >>> algos = {"scipy": ["L-BFGS-B", "TNC"], "nlopt": ["lbfgsb"]}
        >>> propose_algorithms("scipy_L-BFGS-B", algos, number=1)
        ['scipy_L-BFGS-B']
        >>> propose_algorithms("L-BFGS-B", algos, number=2)
        ['scipy_L-BFGS-B', 'nlopt_lbfgsb']

    """
    candidates = []
    for origin in algos:
        for algo_name in algos[origin]:
            candidates.append("_".join((origin, algo_name)))

    ranked = fw_process.extract(requested_algo, candidates, limit=number)
    return [entry[0] for entry in ranked]

Example #14

Source File: bolt.py From Bolt with GNU General Public License v3.0

5 votes

def fuzzy(tokens):
    """Return the mean pairwise partial-ratio similarity among ``tokens``.

    For each token the scores of its matches within ``tokens`` are averaged,
    ignoring the first perfect (100) score, which is taken to be the token
    matching itself. The result is the mean of those per-token averages.
    """
    def mean_score_without_self(token):
        self_skipped = False
        scores = []
        for match in process.extract(token, tokens, scorer=fuzz.partial_ratio):
            score = match[1]
            if score == 100 and not self_skipped:
                self_skipped = True
            else:
                scores.append(score)
        return statistics.mean(scores)

    return statistics.mean([mean_score_without_self(token) for token in tokens])

Example #15

Source File: pass-filter.py From alfred-pass with GNU General Public License v3.0

5 votes

def search_passwords_fuzzy(query):
    '''Return the password entries that best match ``query``.

    All known passwords are ranked with fuzzywuzzy and only the matched
    entries (without their scores) are returned.
    '''
    ranked = process.extract(query, list_passwords())
    matches = []
    for hit in ranked:
        matches.append(hit[0])
    return matches

Example #16

Source File: ADUserCreds.py From armory with GNU General Public License v3.0

5 votes

def search_term(self, txt, pw_count):
        """Sum the counts of all passwords that fuzzily match the given terms.

        ``txt`` is a single term or an iterable of terms. ``pw_count`` maps
        each password to a mapping with a ``"count"`` entry. For every term,
        each password scoring above 75 contributes its count; a password
        matched by several terms is counted once per term.
        """
        # isinstance also covers str subclasses, which would otherwise be
        # iterated character by character.
        if isinstance(txt, str):
            txt = [txt]

        passwords = pw_count.keys()
        total_matches = 0
        for term in txt:
            ranked = process.extract(term, passwords, limit=None)
            total_matches += sum(
                pw_count[hit[0]]["count"] for hit in ranked if hit[1] > 75
            )

        return total_matches

Example #17

Source File: pyinrail.py From pyinrail with MIT License

5 votes

def search_train(self, query):
        """
        Search trains by name or number.

        Returns the values of ``self.trains`` that best match ``query``,
        without their scores.
        """
        ranked = process.extract(query, self.trains.values())
        return [hit[0] for hit in ranked]

Example #18

Source File: pyinrail.py From pyinrail with MIT License

5 votes

def search_station(self, query):
        """
        Search stations by name or code.

        Returns the entries of ``self.stations`` that best match ``query``,
        without their scores.
        """
        ranked = process.extract(query, self.stations)
        return [hit[0] for hit in ranked]

Example #19

Source File: util.py From rules-bot with GNU Affero General Public License v3.0

5 votes

def search(self, query):
        """Return the five entries of ``self.issues`` that best match ``query``.

        Candidates are compared by their stripped, lower-cased text; for
        ``Issue`` objects that text is the issue title.
        """
        def normalise(candidate):
            text = candidate.title if isinstance(candidate, Issue) else candidate
            return text.strip().lower()

        # This must not run while the self.issues dict is being updated, so
        # hold the lock for the whole search.
        with self.issues_lock:
            ranked = process.extract(query, self.issues, scorer=fuzz.partial_ratio,
                                     processor=normalise, limit=5)
            # The score is of no interest; keep only the first element.
            return [hit[0] for hit in ranked]

Example #20

Source File: twistmoe.py From anime-downloader with The Unlicense

5 votes

def search(self, query):
        """Return the twist.moe anime entries that best match ``query``.

        The full anime list is fetched from the site's API, wrapped in
        ``SearchResult`` objects and ranked with fuzzywuzzy.
        """
        api_url = 'https://twist.moe/api/anime'
        # NOTE(review): the access token is hard-coded in the request headers.
        headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.46 Safari/537.36',
        'x-access-token': '1rj2vRtegS8Y60B3w3qNZm5T2Q0TN2NR'
        }
        req = helpers.get(api_url, headers=headers)
        if 'being redirected' in req.text:
            # Retry once with the cookie extracted from the redirect page.
            logger.debug('Tring to extract cookie')
            cookie = get_cookie(req)
            logger.debug('Got cookie: ' + cookie)
            headers['cookie'] = cookie
            # XXX: helpers.get is cached, which may be a problem for this
            # retry. Investigate.
            req = helpers.get(api_url, headers=headers)

        candidates = [
            SearchResult(
                title=anime['title'],
                url='https://twist.moe/a/' + anime['slug']['slug'] + '/',
            )
            for anime in req.json()
        ]
        return [hit[0] for hit in process.extract(query, candidates)]

Example #21

Source File: tag_ner.py From TaskBot with GNU General Public License v3.0

5 votes

def transform(self, context):
        """Delegate to ``self.extract`` and return its result unchanged."""
        return self.extract(context)

Example #22

Source File: searcher.py From todxpy with GNU General Public License v2.0

5 votes

def find_index_tag(tag, tlist):
    """
    Returns a list with first element as tag and rest indexes of todos with that tag

    The leading tag is the best match found in the first todo that has any
    tags at all; a todo's index is added when its best match scores above 70.
    The list is empty when no todo has tags.
    """
    index_list = []
    for position, todo in enumerate(tlist):
        ranked = process.extract(tag, todo.tags)
        if not ranked:
            continue
        best = ranked[0]
        if not index_list:
            index_list.append(best[0])
        if best[1] > 70:
            index_list.append(position)
    return index_list

Example #23

Source File: sudoku_guessing.py From songoku with MIT License

4 votes

def solve_approximate(self, approximate=False):
        """Solve the current sudoku, reusing earlier solutions where possible.

        If it finds a sudoku similar to one it has already done, uses its
        solution.

        :param approximate: ``False`` to take the most similar solved grid
            outright, or a percentage (90 = 90%) that a solved grid's
            similarity must exceed for the grid with the most known numbers
            to be preferred.
        :returns: ``(solution, existing_numbers)``, or ``(False, False)`` when
            the grid cannot be solved and there is no solved grid to fall
            back on.
        """
        string = self.as_string()
        if string in self.already_solved:
            return self.already_solved[string], self.already_solved_numbers[string]

        # Grids that already failed once are not handed to the solver again.
        known_unsolvable = string in self.already_solved_false
        solved = False if known_unsolvable else sudoku_solving.solve(string)

        # Only correct solutions are cached, together with the numbers that
        # already exist in the array (so we don't write over them if we
        # can't see them).
        if solved is not False:
            self.already_solved_numbers[string] = self.get_existing_numbers()
            self.already_solved[string] = solved
            return solved, self.already_solved_numbers[string]

        # Remember the failure so we don't have to try to solve it every
        # frame. Record it once only: re-appending on every call made the
        # list grow without bound.
        if not known_unsolvable:
            self.already_solved_false.append(string)

        # An unsolvable grid that is very similar to a solved one is assumed
        # to be the same sudoku with a few misread numbers.
        if not self.already_solved:
            return False, False
        guesses = process.extract(string, self.already_solved.keys())
        if not guesses:
            return False, False

        def numbers_known(guess):
            return len(self.already_solved_numbers[guess[0]])

        if approximate is not False:
            # Prefer the grid with the most known numbers, provided its
            # similarity clears the requested percentage.
            by_length = sorted(guesses, key=lambda g: (numbers_known(g), g[1]), reverse=True)
            for item in by_length:
                if item[1] > approximate:
                    return self.already_solved[item[0]], self.already_solved_numbers[item[0]]

        # Otherwise (or when nothing cleared the bar) take the most similar
        # grid, breaking ties by the number of known numbers.
        best = max(guesses, key=lambda g: (g[1], numbers_known(g)))[0]
        return self.already_solved[best], self.already_solved_numbers[best]

Example #24

Source File: chatbot_fuzzy.py From nlp_xiaojiang with MIT License

4 votes

def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
    '''Rank pre-filtered entries of ``qa_list`` against ``user_input``.

    Edit-distance matching is relatively slow, but unlike plain matching it
    copes with entries whose characters differ from the input. To limit the
    work, candidates are first shortlisted by how many characters of
    ``user_input`` occur in the corresponding ``collection`` entry.

    :param fuzz: object providing the ``token_set_ratio`` scorer.
    :param user_input: the query string.
    :param qa_list: entries to return; indexed in parallel with ``collection``.
    :param collection: entries used for the shared-character pre-filter.
    :param topn: maximum number of matches to return.
    :returns: the result of ``process.extract`` on the shortlist.
    '''
    # Characters are counted with multiplicity, so this stays a list.
    input_chars = list(user_input)

    overlaps = []        # (index into collection, shared-character count)
    running_maxima = []  # each new record count, in increasing order
    best_so_far = 0
    for index, candidate in enumerate(collection):
        shared = len([char for char in input_chars if char in candidate])
        if shared > 0:
            overlaps.append((index, shared))
        if shared > best_so_far:
            running_maxima.append(shared)
            best_so_far = shared

    # Walk the record counts from highest to lowest, pulling the entries
    # whose count equals each record out of qa_list, capped at 5000.
    shortlisted = []
    for level in reversed(running_maxima):
        shortlisted.extend(
            qa_list[index] for index, shared in overlaps if shared == level
        )
        if len(shortlisted) >= 5000:
            shortlisted = shortlisted[:5000]
            break

    # Other scorers that could be used here: fuzz.WRatio, fuzz.QRatio,
    # fuzz.token_sort_ratio, fuzz.partial_token_set_ratio,
    # fuzz.partial_token_sort_ratio, fuzz.UWRatio, fuzz.UQRatio.
    return process.extract(user_input, shortlisted, scorer=fuzz.token_set_ratio, limit=topn)

Example #25

Source File: functions.py From avrae with GNU General Public License v3.0

4 votes

def search(list_to_search: list, value, key, cutoff=5, return_key=False, strict=False):
    """Fuzzy searches a list for an object
    result can be either an object or list of objects
    :param list_to_search: The list to search.
    :param value: The value to search for.
    :param key: A function returning the text of an object to compare against.
    :param cutoff: The scorer cutoff value for fuzzy searching.
    :param return_key: Whether to return the key of the object that matched or the object itself.
    :param strict: If True, will only search for exact matches.
    :returns: A two-tuple (result, found_single); the flag is True only when
        exactly one result was found, in which case result is that single
        object (or key) instead of a list."""
    # there is nothing to search
    if len(list_to_search) == 0:
        return [], False

    needle = value.lower()

    # case-insensitive full matches win outright; strict mode stops here too
    exact = [item for item in list_to_search if needle == key(item).lower()]
    if exact or strict:
        results = exact
    else:
        partial = [item for item in list_to_search if needle in key(item).lower()]
        if len(partial) == 1:
            # a single substring match is unambiguous
            results = partial
        else:
            # none or several substring matches: mix in the fuzzy matches
            lowered_names = [key(item).lower() for item in list_to_search]
            item_by_name = {key(item).lower(): item for item in list_to_search}
            fuzzy_hits = [
                hit
                for hit in process.extract(needle, lowered_names, scorer=fuzz.ratio)
                if hit[1] >= cutoff
            ]
            score_total = sum(hit[1] for hit in fuzzy_hits)

            # fuzzy matches are weighted by their share of the total score,
            # substring matches by how much of the key the value covers
            weighted = [(item_by_name[hit[0]], hit[1] / score_total) for hit in fuzzy_hits]
            weighted += [(item, len(value) / len(key(item))) for item in partial]
            weighted.sort(key=lambda pair: pair[1], reverse=True)

            # highest weight first, each object listed once
            results = []
            for item, _weight in weighted:
                if item not in results:
                    results.append(item)

    if not results:
        return [], False
    if len(results) > 1:
        return ([key(item) for item in results] if return_key else results), False
    only = results[0]
    return (key(only) if return_key else only), True