Python regex.search() Examples

The following are 30 code examples of regex.search(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module regex, or try the search function.
Example #1
Source File: client.py    From ibis with Apache License 2.0 6 votes vote down vote up
def _ibis_sqlite_regex_extract(string, pattern, index):
    """Extract match of regular expression `pattern` from `string` at `index`.

    Parameters
    ----------
    string : str
    pattern : str
    index : int

    Returns
    -------
    result : str or None
    """
    result = re.search(pattern, string)
    if result is not None and 0 <= index <= (result.lastindex or -1):
        return result.group(index)
    return None 
Example #2
Source File: regex_lib_helper.py    From JusticeAI with MIT License 6 votes vote down vote up
def cluster_regex_mapper(folder_name, min_match_percentage, nb_of_files=-1):
    """
    This function searches through a given folder_name in order to find all regex-cluster pair and store them in a dict
    :param folder_name: cluster folder to search in (fact or demand)
    :param min_match_percentage: min percentage of sentence in a cluster file that needs to match a regex
    :param nb_of_files: number of files to search through in the folder (-1 means all)
    :return: dict of regex-cluster file match
    """
    nb_of_files_processed = 0
    path = Path.cluster_directory + folder_name + '/'
    cluster_regex_dict = {}
    for file_name in os.listdir(path):
        # '-1.txt' holds the sentences that could not be clustered; skip it.
        if file_name == '-1.txt':
            continue
        # Bug fix: `>` let nb_of_files + 1 files through; `>=` honors the limit.
        if nb_of_files != -1 and nb_of_files_processed >= nb_of_files:
            break
        nb_of_files_processed += 1
        for regex in RegexLib.regex_facts:
            if cluster_file_finder(regex[regex_name_index], min_match_percentage, path + file_name):
                # Bug fix: the original appended to an existing list and then
                # immediately overwrote it with [file_name], so every regex
                # kept only its last matching file. setdefault accumulates.
                cluster_regex_dict.setdefault(regex[regex_name_index], []).append(file_name)
    return cluster_regex_dict
Example #3
Source File: regex_lib_helper.py    From JusticeAI with MIT License 6 votes vote down vote up
def cluster_file_finder(regex_name, min_match_percentage, file_path):
    """
    Given a file path and a regex name, this function validates that at least min_match_percentage (ex: 50%)
    of the sentence matches the regex
    :param regex_name: name of the regex to match with
    :param min_match_percentage: min percentage of matches required
    :param file_path: cluster file path (where the sentences are)
    :return: True if minimum percentage of sentences do matches the given regex
    """
    regexes = get_regexes(regex_name)
    total_nb_lines_in_file = 0
    total_lines_matched = 0
    # Context manager guarantees the file is closed even if a regex raises.
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            # A blank line marks the end of the sentence section of the file.
            if line == '\n':
                break
            total_nb_lines_in_file += 1
            # Presumably the regexes expect a leading '[N] ' citation prefix
            # as found in the raw rulings — TODO confirm against RegexLib.
            line = '[1] ' + line
            for reg in regexes:
                if reg.search(line):
                    total_lines_matched += 1
    # The > 0 check also guards the division against an empty file.
    if total_lines_matched > 0 and total_lines_matched / total_nb_lines_in_file > min_match_percentage:
        return True
    return False
Example #4
Source File: regex_lib_helper.py    From JusticeAI with MIT License 6 votes vote down vote up
def sentence_finder(regex_name, nb_of_files):
    """
    finds sentences that matches the regex_name
    :param regex_name: name of the regex ex: landlord_money_cover_rent
    :param nb_of_files: number of files to search through
    :return: list of sentences that matched this regex
    """
    regexes = get_regexes(regex_name)
    count = 0
    sentences_matched = []
    for file_name in os.listdir(Path.raw_data_directory):
        # Bug fix: the original `count > nb_of_files` processed one file too many.
        if count >= nb_of_files:
            break
        count += 1
        # Context manager guarantees the file is closed even on error.
        with open(Path.raw_data_directory + file_name, "r", encoding="ISO-8859-1") as file:
            for line in file:
                # NOTE(review): a line matching several regexes is appended
                # once per match — preserved from the original behavior.
                for reg in regexes:
                    if reg.search(line):
                        sentences_matched.append(line)
    return sentences_matched
Example #5
Source File: extractors.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def extract_urls(self, is_binary: bool = False):
        """Extract urls including http, file, ssh and ftp

        Args:
            is_binary (bool, optional): The state is in binary format. It will then first
                extract the strings from it before matching.

        Returns:
            Chepy: The Chepy object.
        """
        pattern = b"(file|ftps?|http[s]?|ssh)://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
        # Choose the candidate tokens to scan, then keep only URL-like ones.
        if is_binary:  # pragma: no cover
            candidates = self.extract_strings().o
        else:
            candidates = self._convert_to_bytes().split()
        self.state = [token for token in candidates if re.search(pattern, token)]
        return self
Example #6
Source File: utils.py    From dragonfly with GNU Lesser General Public License v3.0 6 votes vote down vote up
def _phrase_to_regex(phrase):
    """Build a `regex`-module pattern matching *phrase* in on-screen text.

    Whitespace between words matches any run of non-alphanumeric characters
    ([^\w--_] is the regex-module VERSION1 set difference: \w minus '_').
    Leading/trailing spaces match a single whitespace character, which makes
    it easy to select e.g. a non-breaking space in a browser.
    TODO Support newlines (often separate text nodes in the accessibility tree).
    """
    words = phrase.split()
    pattern = r"[^\w--_]+".join(regex.escape(word) for word in words)
    if phrase == " ":
        # A lone space stands for any single whitespace character.
        pattern = r"\s"
    else:
        prefix = r"\s" if phrase.startswith(" ") else ""
        suffix = r"\s" if phrase.endswith(" ") else ""
        pattern = prefix + pattern + suffix
    # Anchor at alphanumeric-sequence boundaries only when the phrase itself
    # begins/ends with an alphanumeric character.
    if regex.search(r"^[\w--_]", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = r"(?<![\w--_])" + pattern
    if regex.search(r"[\w--_]$", phrase, regex.VERSION1 | regex.UNICODE):
        pattern = pattern + r"(?![\w--_])"
    return pattern
Example #7
Source File: util.py    From ConvLab with MIT License 6 votes vote down vote up
def prepath_to_spec(prepath):
    '''
    Given a prepath, read the correct spec recover the meta_spec that will return the same prepath for eval lab modes
    example: output/a2c_cartpole_2018_06_13_220436/a2c_cartpole_t0_s0
    '''
    predir, _, prename, _, experiment_ts, ckpt = prepath_split(prepath)
    # Fix: raw string; '\d' in a plain string is an invalid escape sequence
    # (DeprecationWarning, SyntaxError in future Python versions).
    sidx_res = re.search(r'_s\d+', prename)
    if sidx_res:  # strip the session suffix (e.g. '_s0') if any
        prename = prename.replace(sidx_res[0], '')
    spec_path = f'{predir}/{prename}_spec.json'
    # read the spec of prepath
    spec = read(spec_path)
    # recover meta_spec from the prepath components
    trial_index, session_index = prepath_to_idxs(prepath)
    meta_spec = spec['meta']
    meta_spec['experiment_ts'] = experiment_ts
    meta_spec['ckpt'] = ckpt
    meta_spec['experiment'] = 0
    meta_spec['trial'] = trial_index
    meta_spec['session'] = session_index
    # Sanity check: the recovered spec must regenerate the same prepath.
    check_prepath = get_prepath(spec, unit='session')
    assert check_prepath in prepath, f'{check_prepath}, {prepath}'
    return spec
Example #8
Source File: pygrok.py    From pygrok with MIT License 6 votes vote down vote up
def _load_search_pattern(self):
        self.type_mapper = {}
        py_regex_pattern = self.pattern
        while True:
            # Finding all types specified in the groks
            m = re.findall(r'%{(\w+):(\w+):(\w+)}', py_regex_pattern)
            for n in m:
                self.type_mapper[n[1]] = n[2]
            #replace %{pattern_name:custom_name} (or %{pattern_name:custom_name:type}
            # with regex and regex group name

            py_regex_pattern = re.sub(r'%{(\w+):(\w+)(?::\w+)?}',
                lambda m: "(?P<" + m.group(2) + ">" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            #replace %{pattern_name} with regex
            py_regex_pattern = re.sub(r'%{(\w+)}',
                lambda m: "(" + self.predefined_patterns[m.group(1)].regex_str + ")",
                py_regex_pattern)

            if re.search('%{\w+(:\w+)?}', py_regex_pattern) is None:
                break

        self.regex_obj = re.compile(py_regex_pattern) 
Example #9
Source File: validate.py    From tools with GNU General Public License v2.0 6 votes vote down vote up
def validate_single_subject(id, tree):
    """
    No predicate should have more than one subject.
    An xcomp dependent normally has no subject, but in some languages the
    requirement may be weaker: it could have an overt subject if it is
    correferential with a particular argument of the matrix verb. Hence we do
    not check zero subjects of xcomp dependents at present.
    Furthermore, in some situations we must allow two subjects (but not three or more).
    If a clause acts as a nonverbal predicate of another clause, and if there is
    no copula, then we must attach two subjects to the predicate of the inner
    clause: one is the predicate of the inner clause, the other is the predicate
    of the outer clause. This could in theory be recursive but in practice it isn't.
    See also issue 34 (https://github.com/UniversalDependencies/tools/issues/34).
    """
    def is_subject(node):
        # Strip any language-specific subtype before testing the relation.
        return re.search(r"subj", lspec2ud(tree['nodes'][node][DEPREL])) is not None

    subjects = sorted(child for child in tree['children'][id] if is_subject(child))
    if len(subjects) > 2:
        # We test for more than 2, but the message still says "more than one"
        # so that we do not have to explain the two-subject exception.
        testlevel = 3
        testclass = 'Syntax'
        testid = 'too-many-subjects'
        testmessage = "Node has more than one subject: %s" % str(subjects)
        warn(testmessage, testclass, testlevel=testlevel, testid=testid, nodeid=id, nodelineno=tree['linenos'][id])
Example #10
Source File: pygrok.py    From pygrok with MIT License 6 votes vote down vote up
def match(self, text):
        """Match ``text`` against the compiled grok pattern.

        Returns a dict mapping the variable names specified in the pattern
        (``%{pattern:variable name}``) to their matched values, converting a
        value to ``int``/``float`` when a type was declared for it. Returns
        ``None`` when the text does not match. Custom patterns can be passed
        in by custom_patterns (pattern name, pattern regular expression pair)
        or custom_patterns_dir.
        """
        if self.fullmatch:
            match_obj = self.regex_obj.fullmatch(text)
        else:
            match_obj = self.regex_obj.search(text)

        # Fix: identity comparison with None (was ``== None``); also removed
        # the dead ``match_obj = None`` pre-initialization.
        if match_obj is None:
            return None
        matches = match_obj.groupdict()
        for key, value in matches.items():
            try:
                if self.type_mapper[key] == 'int':
                    matches[key] = int(value)
                elif self.type_mapper[key] == 'float':
                    matches[key] = float(value)
            except (TypeError, KeyError):
                # TypeError: optional group did not participate (value None);
                # KeyError: no type declared for this variable.
                pass
        return matches
Example #11
Source File: us_stats.py    From chicksexer with MIT License 6 votes vote down vote up
def compute_gender_probas(dir_path, start_year):
    """Compute, per name, the probability of the positive gender class from
    the yearly 'yobXXXX.txt' count files found under *dir_path*, considering
    only years >= *start_year*."""
    year_prefix = 'yob'
    name2gender2count = defaultdict(lambda: defaultdict(int))
    for file_path in glob(os.path.join(dir_path, '*.txt')):
        # File names look like .../yob1990.txt; pull out the 4-digit year.
        year = int(regex.search(r'/{}(\d\d\d\d)'.format(year_prefix), file_path).groups()[0])
        if year < start_year:
            continue

        with open(file_path, encoding='utf8') as file_:
            for name, gender, count in csv.reader(file_):
                name2gender2count[name][_CLASS_MAP[gender]] += int(count)

    name2proba = dict()
    for name, gender2count in name2gender2count.items():
        positive = gender2count[POSITIVE_CLASS]
        negative = gender2count[NEGATIVE_CLASS]
        name2proba[name] = float(positive) / (positive + negative)
    return name2proba
Example #12
Source File: hosters.py    From bioconda-utils with MIT License 6 votes vote down vote up
async def get_versions(self, req, orig_version):
        """List release files on the FTP servers in ``self.releases_urls``.

        Args:
            req: downloader exposing an awaitable ``get_ftp_listing(url)``.
            orig_version: unused here; kept for interface compatibility.

        Returns:
            List of dicts: the named groups captured by ``link_pattern`` for
            each matching file, plus ``fn``, ``link`` and ``releases_url``.
        """
        # Fix: the body awaits, so the def must be ``async`` — without it the
        # function is a SyntaxError ('await' outside async function).
        exclude = set(self.exclude)
        vals = {key: val
                for key, val in self.vals.items()
                if key not in exclude}
        link_pattern = replace_named_capture_group(self.link_pattern_compiled, vals)
        link_re = re.compile(link_pattern)
        result = []
        for url in self.releases_urls:
            files = await req.get_ftp_listing(url)
            for fname in files:
                match = link_re.search(fname)
                if match:
                    data = match.groupdict()
                    data['fn'] = fname
                    data['link'] = "ftp://" + vals['host'] + fname
                    data['releases_url'] = url
                    result.append(data)
        return result
Example #13
Source File: hosters.py    From bioconda-utils with MIT License 6 votes vote down vote up
def try_make_hoster(cls: Type["Hoster"], url: str,
                        config: Dict[str, str]) -> Optional["Hoster"]:
        """Creates hoster if **url** is matched by its **url_pattern**"""
        if config:
            try:
                # Build a one-off subclass whose *_pattern class attributes
                # are overridden from the configuration.
                overrides = {key + "_pattern": val for key, val in config.items()}
                klass: Type["Hoster"] = type("Customized" + cls.__name__, (cls,), overrides)
            except KeyError:
                logger.debug("Overrides invalid for %s - skipping", cls.__name__)
                return None
        else:
            klass = cls
        match = klass.url_re.search(url)
        if match is None:
            return None
        return klass(url, match)
Example #14
Source File: _encoder.py    From chicksexer with MIT License 6 votes vote down vote up
def encode(self, names):
        """
        Encode list of names into list of list of character IDs using the character encoder.

        :param names: list of names
        :return: list (each name) of list (each word) of character IDs
        """
        encoded_names = list()
        for raw_name in names:
            cleaned = self._clean_characters(raw_name)
            encoded_words = list()

            for token in cleaned.split(self._separator):
                # Wrap each word with the start/end marker characters.
                token = '{}{}{}'.format(self._start_char, token, self._end_char)
                try:
                    encoded_words.append(self._label_encoder.transform(list(token)).tolist())
                except ValueError as exception:
                    # The label encoder names the unseen labels in its message;
                    # surface them through a domain-specific exception.
                    unseen_chars = regex.search(
                        r'y contains new labels: (.*)$', exception.args[0]).groups()[0]
                    raise UnseenCharacterException('Unseen characters: {}'.format(unseen_chars))

            encoded_names.append(encoded_words)

        return encoded_names
Example #15
Source File: regex_test_helper.py    From JusticeAI with MIT License 6 votes vote down vote up
def regex_finder(sentence):
    """
    This function is used to see if a regex is already written for a given sentence
    :param sentence: is used to find a regex the matches it
    :return: list of regex names that matches this sentence
    """
    regex_match_list = []

    # Facts first, then outcomes, mirroring the library's ordering.
    for fact_entry in RegexLib.regex_facts:
        regex_match_list.extend(
            fact_entry[regex_name_index]
            for pattern in fact_entry[regex_index]
            if re.search(pattern, sentence)
        )
    for outcome_entry in RegexLib.regex_outcomes:
        regex_match_list.extend(
            outcome_entry[regex_name_index]
            for pattern in outcome_entry[regex_index]
            if re.search(pattern, sentence)
        )

    return regex_match_list
Example #16
Source File: extractors.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def extract_mac_address(self, is_binary: bool = False):
        """Extract MAC addresses

        Args:
            is_binary (bool, optional): The state is in binary format. It will then first
                extract the strings from it before matching.

        Returns:
            Chepy: The Chepy object.
        """
        # Six colon-separated hex octets, anchored to the whole token.
        pattern = b"^([0-9a-fA-F][0-9a-fA-F]:){5}([0-9a-fA-F][0-9a-fA-F])$"
        if is_binary:  # pragma: no cover
            candidates = self.extract_strings().o
        else:
            candidates = self._convert_to_bytes().split()
        self.state = [token for token in candidates if re.search(pattern, token)]
        return self
Example #17
Source File: freshness_date_parser.py    From dateparser with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _parse_date(self, date_string, prefer_dates_from):
        """Parse a freshness expression into (date, period), or (None, None)
        when the string is not entirely made of recognized units."""
        if not self._are_all_words_units(date_string):
            return None, None

        kwargs = self.get_kwargs(date_string)
        if not kwargs:
            return None, None

        # Period is 'day' unless days are absent; then the first of
        # weeks/months/years that is present (in that order) wins.
        period = 'day'
        if 'days' not in kwargs:
            for unit in ('weeks', 'months', 'years'):
                if unit in kwargs:
                    period = unit[:-1]
                    break

        delta = relativedelta(**kwargs)
        # Future if the string says "in ...", or the caller prefers future
        # dates and the string does not say "... ago". Parentheses make the
        # original operator precedence — A or (B and not C) — explicit.
        is_future = bool(re.search(r'\bin\b', date_string)) or (
            bool(re.search(r'\bfuture\b', prefer_dates_from))
            and not re.search(r'\bago\b', date_string)
        )
        date = self.now + delta if is_future else self.now - delta
        return date, period
Example #18
Source File: extractors.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def extract_ips(self, invalid: bool = False, is_binary: bool = False):
        """Extract ipv4 and ipv6 addresses

        Args:
            invalid (bool, optional): Include :: addresses. Defaults to False.
                NOTE(review): this flag is never referenced in the body below —
                confirm whether it is dead or handled elsewhere.
            is_binary (bool, optional): The state is in binary format. It will then first
                extract the strings from it before matching.

        Returns:
            Chepy: The Chepy object.
        """
        # Combined matcher: dotted-quad IPv4, full and ::-compressed IPv6
        # (including IPv4-mapped forms), with an optional %zone suffix; each
        # alternative is anchored so a token must be an address in its entirety.
        pattern = b"((^\s*((([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))\s*$)|(^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$))"
        if is_binary:  # pragma: no cover
            # Binary state: extract printable strings first, then filter.
            matched = list(
                filter(lambda x: re.search(pattern, x), self.extract_strings().o)
            )
        else:
            # Text state: split into whitespace-separated byte tokens and filter.
            matched = list(
                filter(
                    lambda x: re.search(pattern, x), self._convert_to_bytes().split()
                )
            )
        self.state = matched
        return self
Example #19
Source File: utils.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def find_replace(self, pattern: str, repl: str, ignore_case=True):
        """Replace every match of ``pattern`` in the current state with ``repl``.

        Args:
            pattern (str): Required. Regex pattern to search for
            repl (str): Required. Replacement text for each match
            ignore_case (bool, optional): Case insensitive matching. Defaults to True.

        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy("some some data").find_replace(r"some\s", "data").o
            "datadatadata"
        """
        flags = 0
        if ignore_case:
            flags = re.IGNORECASE
        self.state = re.sub(pattern, repl, self._convert_to_str(), flags=flags)
        return self
Example #20
Source File: quotations.py    From talon with Apache License 2.0 6 votes vote down vote up
def _mark_quoted_email_splitlines(markers, lines):
    """
    When there are headers indented with '>' characters, this method will
    attempt to identify if the header is a splitline header. If it is, then we
    mark it with 's' instead of leaving it as 'm' and return the new markers.
    """
    # Work on a mutable copy so individual markers can be rewritten.
    updated = list(markers)
    for index, line in enumerate(lines):
        # Only 'm' (quoted) lines are candidates for re-marking.
        if updated[index] != 'm':
            continue
        if any(re.search(pattern, line) for pattern in SPLITTER_PATTERNS):
            updated[index] = 's'
    return "".join(updated)
Example #21
Source File: utils.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def filter_dict_key(self, by: str):
        """Filter dictionary by key

        Args:
            by (str): Required. Key to filter by.

        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy({'some': 'dict', 'another': 'val'}).filter_dict_key('ano')
            {'another': 'val'}
        """
        assert isinstance(self.state, dict), StateNotDict()
        # Keys are stringified so non-string keys can still be pattern-matched.
        filtered = {}
        for key, val in self.state.items():
            if re.search(by, str(key)):
                filtered[key] = val
        self.state = filtered
        return self
Example #22
Source File: hyper_param_search_uq.py    From deep-quant with MIT License 6 votes vote down vote up
def get_search_configs():
    """
    Defines the configurations for hyper parameter search

    Registers the command-line/config flags used by the uncertainty-quant
    hyper-parameter search driver and returns the populated config object.
    """
    # Search setup: template spec and which optimizer to run.
    configurations.DEFINE_string("template",None,"Template file for hyper-param search")
    configurations.DEFINE_string("search_algorithm","genetic","Algorithm for hyper-param optimization. Select from 'genetic', 'grid_search'")
    # Genetic-algorithm parameters.
    configurations.DEFINE_integer("generations",100,"Number of generations for genetic algorithm")
    configurations.DEFINE_integer("pop_size",20,"Population size for genetic algorithm")
    configurations.DEFINE_integer("num_survivors",10,"Number of survivors for genetic algorithm")
    # Execution resources.
    configurations.DEFINE_integer("num_threads",4,"NUmber of parallel threads (Number of parallel executions)")
    configurations.DEFINE_integer("num_gpu",1,"Number of GPU on the machine, Use 0 if there are None")
    configurations.DEFINE_integer("sleep_time",1,"Sleep time")
    # NOTE: default mutate_rate here is 0.2 (the non-UQ variant uses 0.02).
    configurations.DEFINE_float("mutate_rate",0.2,"Mutation rate for genetic algorithm")
    configurations.DEFINE_string("init_pop",None,"Specify starting population. Path to the pickle file")

    c = configurations.ConfigValues()

    return c
Example #23
Source File: __init__.py    From dateparser with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def localize_timezone(date_time, tz_string):
    """Attach the timezone named by *tz_string* to a naive datetime.

    Already-aware datetimes are returned unchanged. Unknown names fall back
    to the static offset table before the original error is re-raised.
    """
    if date_time.tzinfo:
        return date_time

    try:
        tz = timezone(tz_string)
    except UnknownTimeZoneError as error:
        tz = None
        # Scan the known abbreviation/offset patterns (leading space matches
        # the patterns' expected word boundary).
        for name, info in _tz_offsets:
            if info['regex'].search(' %s' % tz_string):
                tz = StaticTzInfo(name, info['offset'])
                break
        if tz is None:
            raise error

    return tz.localize(date_time)
Example #24
Source File: hyper_param_search.py    From deep-quant with MIT License 6 votes vote down vote up
def get_search_configs():
    """
    Defines the configurations for hyper parameter search

    Registers the flags used by the hyper-parameter search driver and
    returns the populated config object.
    """
    # Search setup: template spec and which optimizer to run.
    configurations.DEFINE_string("template",None,"Template file for hyper-param search")
    configurations.DEFINE_string("search_algorithm","genetic","Algorithm for hyper-param optimization. Select from 'genetic', 'grid_search'")
    # Genetic-algorithm parameters.
    configurations.DEFINE_integer("generations",100,"Number of generations for genetic algorithm")
    configurations.DEFINE_integer("pop_size",20,"Population size for genetic algorithm")
    configurations.DEFINE_integer("num_survivors",10,"Number of survivors for genetic algorithm")
    # Execution resources.
    configurations.DEFINE_integer("num_threads",4,"NUmber of parallel threads (Number of parallel executions)")
    configurations.DEFINE_integer("num_gpu",1,"Number of GPU on the machine, Use 0 if there are None")
    configurations.DEFINE_integer("sleep_time",1,"Sleep time")
    # NOTE: default mutate_rate here is 0.02 (the UQ variant uses 0.2).
    configurations.DEFINE_float("mutate_rate",0.02,"Mutation rate for genetic algorithm")
    configurations.DEFINE_string("init_pop",None,"Specify starting population. Path to the pickle file")

    c = configurations.ConfigValues()

    return c
Example #25
Source File: hyper_param_search.py    From deep-quant with MIT License 6 votes vote down vote up
def generate_results(pop, gen):
    """Collect the best validation error for each member of a generation.

    Reads each member's output file, keeps the lines reporting an MSE, and
    records the smallest error found. Members with no parsable error — or a
    NaN best error — contribute float('inf').

    :param pop: population (only its length is used)
    :param gen: generation index used to locate output files
    :return: list of best errors, one per population member
    """
    import math  # local import: keeps this block self-contained

    result = list()
    for i in range(len(pop)):
        filename = output_filename(gen, i)
        print("Reading file " + filename)
        # Context manager guarantees the file is closed.
        with open(filename) as f:
            content = [x.strip() for x in f.readlines()]
        # keep only the lines that report an error
        content = [s for s in content if re.search('MSE', s)]
        errors = [float(s.split()[_VALID_ERR_IDX]) for s in content]
        best = min(errors) if errors else float('inf')
        # Bug fix: the original compared a float against the string 'nan',
        # which is never true; use math.isnan to detect NaN errors.
        if math.isnan(best):
            best = float('inf')
        result.append(best)

    print("-" * 80)
    print(result)
    assert len(pop) == len(result)
    return result
Example #26
Source File: utils.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def count_occurances(self, regex: str, case_sensitive: bool = False):
        """Counts occurances of the regex.

        Counts the number of times the provided pattern occurs in the
        current state string and stores the count in the state.

        Args:
            regex (str): Required. Regex string to search for
            case_sensitive (bool, optional): If True, match case exactly;
                if False (the default) the search is case insensitive.
                (Doc fix: the original described this flag backwards.)

        Returns:
            Chepy: The Chepy object.

        Examples:
            >>> Chepy("AABCDADJAKDJHKSDAJSDdaskjdhaskdjhasdkja").count_occurances("ja").output
            2
        """
        flags = 0 if case_sensitive else re.IGNORECASE
        self.state = len(re.findall(regex, self._convert_to_str(), flags=flags))
        return self
Example #27
Source File: order_languages.py    From dateparser with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def _get_language_locale_dict():
    """Map each CLDR language shortname to its list of regional locale names."""
    cldr_dates_full_dir = "../raw_data/cldr_dates_full/main/"
    locale_names = os.listdir(cldr_dates_full_dir)
    # Language entries are the ones without a '-REGION' suffix.
    language_names = [name for name in locale_names
                      if not re.search(r'-[A-Z0-9]+$', name)]
    # 'root' is CLDR's fallback locale, not a language.
    language_names.remove('root')
    language_locale_dict = {}
    for language in language_names:
        language_locale_dict[language] = [
            locale for locale in locale_names
            if re.match(language + '-[A-Z0-9]+$', locale)
        ]

    for language in avoid_languages:
        del language_locale_dict[language]
    return language_locale_dict
Example #28
Source File: functional_load.py    From CorpusTools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def fits_environment(w1, w2, index, environment_filter):
    """
    Return True iff for both w1 and w2 (tiers), the environment
    of its i'th element passes the environment_filter.

    """
    if not environment_filter:
        return True

    def with_blank(word, position):
        # Render the word as space-separated segments with '_' at `position`.
        segments = [str(seg) for seg in word]
        segments[position] = '_'
        return ' '.join(segments)

    target1 = with_blank(w1, index)
    target2 = with_blank(w2, index)
    for env in environment_filter:
        env_pattern = make_environment_re(env)
        # Both words must satisfy every environment pattern.
        if not (re.search(env_pattern, target1) and re.search(env_pattern, target2)):
            return False
    return True
Example #29
Source File: rules.py    From epitran with MIT License 6 votes vote down vote up
def _read_rule(self, i, line):
        """Parse one rule-file line into a rewrite function, or record a symbol.

        Two line forms are recognized after NFD normalization:
          ``::symbol:: = value``   -> stored in ``self.symbols``
          ``a -> b / X _ Y``       -> compiled into a rewrite function
        Returns the rewrite function (or None for symbol/blank lines) and
        raises DatafileError when a line cannot be parsed or compiled.
        """
        line = line.strip()
        if line:
            # Normalize so rule text and input text share one decomposition form.
            line = unicodedata.normalize('NFD', line)
            s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
            if s:
                # Symbol definition line: remember it for later substitution.
                self.symbols[s.group('symbol')] = s.group('value')
            else:
                # Expand previously defined ::symbols:: before parsing the rule.
                line = self._sub_symbols(line)
                r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
                try:
                    a, b, X, Y = r.groups()
                except AttributeError:
                    # r is None: the line matched neither recognized form.
                    raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
                # '#' appears to mark word edges (mapped to ^/$ anchors) and
                # '0' the empty string — TODO confirm against the file format docs.
                X, Y = X.replace('#', '^'), Y.replace('#', '$')
                a, b = a.replace('0', ''), b.replace('0', '')
                try:
                    # Rules containing ?P<sw1>/?P<sw2> groups denote metathesis.
                    if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                        return self._fields_to_function_metathesis(a, X, Y)
                    else:
                        return self._fields_to_function(a, b, X, Y)
                except Exception as e:
                    raise DatafileError('Line {}: "{}" cannot be compiled as regex: ̪{}'.format(i + 1, line, e))
Example #30
Source File: __main__.py    From chepy with GNU General Public License v3.0 6 votes vote down vote up
def validate(self, document):
        """Validate an interactive input line, raising ValidationError when
        the final token is not a recognized Chepy method.

        Lines beginning with '!', '#' or '?' bypass validation. For multi-token
        lines, the last token is accepted when it is quoted, starts with '--',
        or is one of the names returned by get_options(); validation is also
        skipped when the previous token is a '--' flag (presumably awaiting
        its value — TODO confirm).
        """
        text = document.text.split()
        # Prefixed lines ('!', '#', '?') are accepted as-is.
        if re.search(r"^(!|#|\?)", document.text):
            pass
        elif len(text) > 1:
            # Previous token being a flag means the last token is its argument.
            if not text[-2].startswith("--"):
                if (
                    not re.search(r"\"|'", text[-1])
                    and not text[-1].startswith("--")
                    and text[-1] not in list(get_options().keys())
                ):
                    raise ValidationError(
                        cursor_position=1,
                        message="{text} is not a valid Chepy method".format(
                            text=text[-1]
                        ),
                    )