Python re.UNICODE Examples

The following are 30 code examples showing how to use the re.UNICODE flag. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module re, or try the search function.

Example 1
Project: recipes-py   Author: luci   File: magic_check_fn.py    License: Apache License 2.0 8 votes vote down vote up
def render_re(regex):
  """Return a repr()-like string for a compiled regular expression.

  The result has the form ``re.compile(<pattern repr>)``. When any of the
  flags listed below is set on ``regex``, their names are appended as a
  second argument, joined with ``|``.
  """
  # Flags are tested in this order, which is also the output order.
  known_flags = (
    (re.IGNORECASE, 'IGNORECASE'),
    (re.LOCALE, 'LOCALE'),
    (re.UNICODE, 'UNICODE'),
    (re.MULTILINE, 'MULTILINE'),
    (re.DOTALL, 'DOTALL'),
    (re.VERBOSE, 'VERBOSE'),
  )
  names = [label for bit, label in known_flags if regex.flags & bit]
  if not names:
    return 're.compile(%r)' % regex.pattern
  return 're.compile(%r, %s)' % (regex.pattern, '|'.join(names))
Example 2
Project: grimoirelab-sortinghat   Author: chaoss   File: gitdm.py    License: GNU General Public License v3.0 6 votes vote down vote up
def __parse_domain_to_employer_line(self, raw_domain, raw_org):
        """Extract the organization and domain from a domain-to-employer line.

        Both raw values are matched against the parser's regular
        expressions, stripped and encoded. Returns an
        ``(organization, domain)`` tuple; raises InvalidFormatError when
        either value does not match its pattern.
        """
        domain_match = re.match(self.DOMAIN_REGEX, raw_domain, re.UNICODE)
        if domain_match is None:
            cause = "invalid domain format: '%s'" % raw_domain
            raise InvalidFormatError(cause=cause)
        domain = domain_match.group('domain').strip()

        org_match = re.match(self.ORGANIZATION_REGEX, raw_org, re.UNICODE)
        if org_match is None:
            cause = "invalid organization format: '%s'" % raw_org
            raise InvalidFormatError(cause=cause)
        organization = org_match.group('organization').strip()

        # The organization is encoded before the domain.
        encoded_org = self.__encode(organization)
        encoded_domain = self.__encode(domain)
        return encoded_org, encoded_domain
Example 3
Project: razzy-spinner   Author: rafasashi   File: marshal.py    License: GNU General Public License v3.0 6 votes vote down vote up
def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).

        Every line must have the form ``text:tag``; the text is everything
        up to the last colon. Each pair is stored in ``self._model``.

        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)

        # open() replaces the Python 2-only file() builtin, and the
        # with-block closes the file even if a line fails to parse.
        with open(filename, "r") as handler:
            for line in handler:
                # A line that does not match leaves m as None, so the
                # call below raises AttributeError.
                m = pattern.match(line)
                text, tag = m.groups()
                self._model[text] = tag
Example 4
Project: razzy-spinner   Author: rafasashi   File: marshal.py    License: GNU General Public License v3.0 6 votes vote down vote up
def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).

        The first two lines must contain ``length <int>`` and
        ``minlength <int>``; every following line must have the form
        ``text:tag`` and is stored in ``self._model``.

        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        # open() replaces the Python 2-only file() builtin, and the
        # with-block closes the file even if reading fails.
        with open(filename, "r") as handler:
            lines = handler.readlines()

        # will fail if "length " and "minlength " are not present
        self._length = int(lines[0].split("length ")[1])
        self._minlength = int(lines[1].split("minlength ")[1])

        pattern = re.compile(r'^(.+):(.+?)$', re.UNICODE)
        for line in lines[2:]:
            # A line that does not match leaves m as None, so the call
            # below raises AttributeError.
            m = pattern.match(line)
            text, tag = m.groups()
            self._model[text] = tag
Example 5
Project: recruit   Author: Frank-qlu   File: regex.py    License: Apache License 2.0 6 votes vote down vote up
def str_flags_to_int(str_flags):
    """Translate regex flag letters into the combined ``re`` flag value.

    The letters i, l, m, s, u and x are recognised; anything else found
    in ``str_flags`` is ignored. Returns 0 when no known letter is present.
    """
    letter_flags = (
        ("i", re.IGNORECASE),
        ("l", re.LOCALE),
        ("m", re.MULTILINE),
        ("s", re.DOTALL),
        ("u", re.UNICODE),
        ("x", re.VERBOSE),
    )

    flags = 0
    for letter, flag in letter_flags:
        if letter in str_flags:
            flags |= flag
    return flags
Example 6
Project: jbox   Author: jpush   File: base.py    License: MIT License 6 votes vote down vote up
def __init__(self, **kwargs):
        """Construct a TINYTEXT.

        All keyword arguments are forwarded unchanged to the parent class
        constructor.

        :param charset: Optional, a column-level character set for this string
          value.  Takes precedence over the 'ascii' or 'unicode' short-hand.

        :param collation: Optional, a column-level collation for this string
          value.  Takes precedence over the 'binary' short-hand.

        :param ascii: Defaults to False: short-hand for the ``latin1``
          character set, generates ASCII in schema.

        :param unicode: Defaults to False: short-hand for the ``ucs2``
          character set, generates UNICODE in schema.

        :param national: Optional. If true, use the server's configured
          national character set.

        :param binary: Defaults to False: short-hand, pick the binary
          collation type that matches the column's character set.  Generates
          BINARY in schema.  This does not affect the type of data stored,
          only the collation of character data.

        """
        super(TINYTEXT, self).__init__(**kwargs) 
Example 7
Project: jbox   Author: jpush   File: base.py    License: MIT License 6 votes vote down vote up
def __init__(self, **kwargs):
        """Construct a MEDIUMTEXT.

        All keyword arguments are forwarded unchanged to the parent class
        constructor.

        :param charset: Optional, a column-level character set for this string
          value.  Takes precedence over the 'ascii' or 'unicode' short-hand.

        :param collation: Optional, a column-level collation for this string
          value.  Takes precedence over the 'binary' short-hand.

        :param ascii: Defaults to False: short-hand for the ``latin1``
          character set, generates ASCII in schema.

        :param unicode: Defaults to False: short-hand for the ``ucs2``
          character set, generates UNICODE in schema.

        :param national: Optional. If true, use the server's configured
          national character set.

        :param binary: Defaults to False: short-hand, pick the binary
          collation type that matches the column's character set.  Generates
          BINARY in schema.  This does not affect the type of data stored,
          only the collation of character data.

        """
        super(MEDIUMTEXT, self).__init__(**kwargs) 
Example 8
Project: jbox   Author: jpush   File: base.py    License: MIT License 6 votes vote down vote up
def __init__(self, **kwargs):
        """Construct a LONGTEXT.

        All keyword arguments are forwarded unchanged to the parent class
        constructor.

        :param charset: Optional, a column-level character set for this string
          value.  Takes precedence over the 'ascii' or 'unicode' short-hand.

        :param collation: Optional, a column-level collation for this string
          value.  Takes precedence over the 'binary' short-hand.

        :param ascii: Defaults to False: short-hand for the ``latin1``
          character set, generates ASCII in schema.

        :param unicode: Defaults to False: short-hand for the ``ucs2``
          character set, generates UNICODE in schema.

        :param national: Optional. If true, use the server's configured
          national character set.

        :param binary: Defaults to False: short-hand, pick the binary
          collation type that matches the column's character set.  Generates
          BINARY in schema.  This does not affect the type of data stored,
          only the collation of character data.

        """
        super(LONGTEXT, self).__init__(**kwargs) 
Example 9
Project: bugatsinho.github.io   Author: bugatsinho   File: _stdlib.py    License: GNU General Public License v3.0 6 votes vote down vote up
def expandvars(path):
    """
    Args:
        path (pathlike): A path to expand
    Returns:
        `fsnative`

    Works like :func:`python:os.path.expandvars`, but supports unicode
    under Windows + Python 2 and always returns a `fsnative`.
    """

    fs_path = path2fsn(path)

    def lookup(match):
        # Unknown variables are left in the text exactly as written.
        return environ.get(match.group(1), match.group(0))

    # $NAME first, then %NAME% (Windows only), and finally ${NAME}.
    expanded = re.sub(r"\$(\w+)", lookup, fs_path, flags=re.UNICODE)
    if os.name == "nt":
        expanded = re.sub(r"%([^%]+)%", lookup, expanded)
    return re.sub(r"\$\{([^\}]+)\}", lookup, expanded)
Example 10
Project: pyhcl   Author: virtuald   File: lexer.py    License: Mozilla Public License 2.0 6 votes vote down vote up
def __init__(self, export_comments=None):
        """Build the lexer, optionally selecting which comments to export.

        ``export_comments`` may be None, 'LINE', 'MULTILINE' or 'ALL'; any
        other value raises ValueError. ``self.can_export_comments`` is only
        assigned here when a value other than None is given.
        """
        if export_comments is not None:
            choices = (
                ('LINE', ('COMMENT',)),
                ('MULTILINE', ('MULTICOMMENT',)),
                ('ALL', ('COMMENT', 'MULTICOMMENT')),
            )
            for keyword, token_names in choices:
                if export_comments == keyword:
                    # Each instance gets its own list.
                    self.can_export_comments = list(token_names)
                    break
            else:
                raise ValueError(
                    'Only `LINE`, `MULTILINE` and `ALL` value are allowed for '
                    '`export_comments`. given: `%s`.' % export_comments
                )

        lexer_flags = re.UNICODE | re.MULTILINE
        self.lex = lex.lex(
            module=self,
            debug=False,
            reflags=lexer_flags,
            errorlog=lex.NullLogger(),
        )
Example 11
Project: GTDWeb   Author: lanbing510   File: urlresolvers.py    License: GNU General Public License v2.0 6 votes vote down vote up
def regex(self):
        """
        Return the compiled regular expression for the currently activated
        language code.

        Compiled patterns are cached per language code in
        ``self._regex_dict``. ImproperlyConfigured is raised when the
        pattern cannot be compiled.
        """
        language_code = get_language()
        if language_code in self._regex_dict:
            return self._regex_dict[language_code]

        if isinstance(self._regex, six.string_types):
            source = self._regex
        else:
            source = force_text(self._regex)

        try:
            compiled = re.compile(source, re.UNICODE)
        except re.error as e:
            raise ImproperlyConfigured(
                '"%s" is not a valid regular expression: %s' %
                (source, six.text_type(e)))

        self._regex_dict[language_code] = compiled
        return self._regex_dict[language_code]
Example 12
Project: lambda-packs   Author: ryfeus   File: inlinepatterns.py    License: MIT License 6 votes vote down vote up
def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional; stored as ``self.markdown`` when it
          is truthy

        """
        flags = re.DOTALL | re.UNICODE
        self.pattern = pattern
        # The pattern is wrapped so the text before and after it is
        # captured as well.
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, flags)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance
Example 13
Project: ironpython2   Author: IronLanguages   File: test_re.py    License: Apache License 2.0 6 votes vote down vote up
def test_bug_6561(self):
        # '\d' should match characters in Unicode category 'Nd'
        # (Number, Decimal Digit), but not those in 'Nl' (Number,
        # Letter) or 'No' (Number, Other).
        decimal_code_points = (
            0x0037, # '\N{DIGIT SEVEN}', category 'Nd'
            0x0e58, # '\N{THAI DIGIT SIX}', category 'Nd'
            0xff10, # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
        )
        for code_point in decimal_code_points:
            char = unichr(code_point)
            self.assertEqual(re.match('^\d$', char, re.UNICODE).group(0), char)

        other_code_points = (
            0x2165, # '\N{ROMAN NUMERAL SIX}', category 'Nl'
            0x3039, # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
            0x2082, # '\N{SUBSCRIPT TWO}', category 'No'
            0x32b4, # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
        )
        for code_point in other_code_points:
            self.assertIsNone(re.match('^\d$', unichr(code_point), re.UNICODE))
Example 14
Project: fishroom   Author: tuna   File: base.py    License: GNU General Public License v3.0 5 votes vote down vote up
def match_nickname_content(self, content: str) -> Tuple[str, str]:
        """Split a ``[nick] text`` message into its nickname and text.

        Returns ``(nick, content)``, or ``(None, None)`` when ``content``
        does not start with a bracketed nickname followed by a space.
        """
        found = re.match(
            r'^\[(?P<nick>.+?)\] (?P<content>.*)',
            content, flags=re.UNICODE
        )
        if found is None:
            return (None, None)
        return (found.group('nick'), found.group('content'))
Example 15
Project: fishroom   Author: tuna   File: telegram.py    License: GNU General Public License v3.0 5 votes vote down vote up
def try_set_nick(self, msg):
        """Handle a ``/nick <nickname>`` command carried by ``msg``.

        Returns True when the message was a ``nick`` command (whether or
        not the nickname was accepted), and None otherwise, including when
        no command can be extracted from ``msg.content``.
        """
        # handle command
        user_id = msg.user.id
        target = "%d" % msg.chat_id
        try:
            tmp = msg.content.split()
            cmd = tmp[0][1:].lower()
            args = tmp[1:]
        except Exception:
            # Best effort: content that cannot be parsed is not a command.
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return

        if cmd != "nick":
            return

        if len(args) != 1:
            self.send_msg(
                target,
                "Invalid Command, use '/nick nickname' "
                "to change nickname."
            )
            return True

        nick = args[0]
        # The nickname must start with a word character.
        if not re.match(r'^\w', nick, flags=re.UNICODE):
            self.send_msg(target, "Use a human's nick name, please.")
            return True

        self.nick_store.set_nickname(user_id, nick)
        content = "Changed nickname to '%s'" % nick
        # The first argument of logger.debug() is the format string, so
        # target and content are passed as its arguments.
        logger.debug("%s: %s", target, content)
        self.send_msg(target, content)
        return True
Example 16
Project: grimoirelab-sortinghat   Author: chaoss   File: mailmap.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __parse_stream(self, stream):
        """Generic method to parse mailmap streams.

        Yields, for every line that is not blank or a comment, the list of
        aliases found on it, each one as returned by
        ``email.utils.parseaddr``. Raises InvalidFormatError, quoting the
        1-based line number, when a non-empty part of a line has no ``<``.
        """
        for nline, raw_line in enumerate(stream.split('\n'), 1):
            # Ignore blank lines and comments
            if re.match(self.LINES_TO_IGNORE_REGEX, raw_line, re.UNICODE):
                continue

            chunks = raw_line.strip('\n').strip(' ').split('>')

            if not chunks:
                cause = "line %s: invalid format" % str(nline)
                raise InvalidFormatError(cause=cause)

            aliases = []

            for chunk in chunks:
                chunk = chunk.replace(',', ' ').strip('\n').strip(' ')

                if not chunk:
                    continue

                if '<' not in chunk:
                    cause = "line %s: invalid format" % str(nline)
                    raise InvalidFormatError(cause=cause)

                # Put back the '>' consumed by the split before parsing.
                aliases.append(email.utils.parseaddr(chunk + '>'))

            yield aliases
Example 17
Project: grimoirelab-sortinghat   Author: chaoss   File: grimoirelab.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __validate_email(self, email):
        """Return ``email`` unchanged if it looks like an email address.

        Raises InvalidFormatError when it does not match the parser's
        email address pattern.
        """
        if re.match(self.EMAIL_ADDRESS_REGEX, email, re.UNICODE):
            return email

        error = "Invalid email address: " + str(email)
        msg = self.GRIMOIRELAB_INVALID_FORMAT % {'error': error}
        raise InvalidFormatError(cause=msg)
Example 18
Project: grimoirelab-sortinghat   Author: chaoss   File: gitdm.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __parse_stream(self, stream, parse_line):
        """Generic method to parse gitdm streams.

        Yields the result of ``parse_line`` for every line that is not
        blank or a comment. Raises InvalidFormatError when the stream is
        empty or None, when a line does not match the valid line pattern,
        or when ``parse_line`` rejects it; per-line errors quote the
        1-based line number.
        """
        if not stream:
            raise InvalidFormatError(cause='stream cannot be empty or None')

        for nline, line in enumerate(stream.split('\n'), 1):
            # Ignore blank lines and comments
            if re.match(self.LINES_TO_IGNORE_REGEX, line, re.UNICODE):
                continue

            valid = re.match(self.VALID_LINE_REGEX, line, re.UNICODE)
            if valid is None:
                cause = "line %s: invalid format" % str(nline)
                raise InvalidFormatError(cause=cause)

            try:
                yield parse_line(valid.group(1), valid.group(2))
            except InvalidFormatError as e:
                # Re-raise with the line number added to the cause.
                cause = "line %s: %s" % (str(nline), e)
                raise InvalidFormatError(cause=cause)
Example 19
Project: grimoirelab-sortinghat   Author: chaoss   File: gitdm.py    License: GNU General Public License v3.0 5 votes vote down vote up
def __parse_email_to_employer_line(self, raw_email, raw_enrollment):
        """Parse email to employer lines.

        Returns an ``(email, organization, date)`` tuple with the email and
        organization encoded. The date is MAX_PERIOD_DATE when the
        enrollment carries no date. The email is only checked against the
        email pattern when ``self.email_validation`` is set; otherwise the
        raw value is used as is.

        Raises InvalidFormatError when the email (if validated), the
        enrollment or the enrollment date is not valid.
        """
        e = re.match(self.EMAIL_ADDRESS_REGEX, raw_email, re.UNICODE)
        if not e and self.email_validation:
            cause = "invalid email format: '%s'" % raw_email
            raise InvalidFormatError(cause=cause)

        if self.email_validation:
            email = e.group('email').strip()
        else:
            email = raw_email

        r = re.match(self.ENROLLMENT_REGEX, raw_enrollment, re.UNICODE)
        if not r:
            cause = "invalid enrollment format: '%s'" % raw_enrollment
            raise InvalidFormatError(cause=cause)

        org = r.group('organization').strip()
        date = r.group('date')

        if date:
            try:
                dt = dateutil.parser.parse(date)
            except Exception:
                # Report the bad date; without this raise, dt would be
                # unbound at the return statement below.
                cause = "invalid date: '%s'" % date
                raise InvalidFormatError(cause=cause)
        else:
            dt = MAX_PERIOD_DATE

        email = self.__encode(email)
        org = self.__encode(org)

        return email, org, dt
Example 20
def test_lines_to_ignore(self):
        """Check whether it parses blank or comment lines"""

        parser = re.compile(GitdmParser.LINES_TO_IGNORE_REGEX, re.UNICODE)

        # Parse some valid blank lines
        blank_lines = (
            "",
            "\n\n\n",
            "      \t    \r\n ",
            "\t\t  \n  \t\n",
        )
        for text in blank_lines:
            self.assertIsNotNone(parser.match(text))

        # Do not parse invalid blank lines
        not_blank_lines = (
            "\ndomain organization\n\n",
            " domain \t organization  \r\n ",
            "\t   domain organization\t  \n  \n",
        )
        for text in not_blank_lines:
            self.assertIsNone(parser.match(text))

        # Parse some valid comments
        comments = (
            "#    \t\n\r",
            "#|tcomment #1\r\n",
        )
        for text in comments:
            self.assertIsNotNone(parser.match(text))
Example 21
def test_email(self):
        """Check email address pattern"""

        parser = re.compile(GitdmParser.EMAIL_ADDRESS_REGEX, re.UNICODE)

        # Parse some valid email addresses
        valid_addresses = (
            "johndoe@example.com",
            "jonh.doe@exampel.com",
            "?¡~,123@example.com",
        )
        for address in valid_addresses:
            self.assertIsNotNone(parser.match(address))

        # Do not parse invalid email addresses
        invalid_addresses = (
            "jonh@doe@example.com",
            "   johndoe@example.com",
            "johndoe@example.com  ",
            "johndoe@example.com\t",
            "johndoe@.com",
        )
        for address in invalid_addresses:
            self.assertIsNone(parser.match(address))
Example 22
def test_domain(self):
        """Check domain pattern"""

        parser = re.compile(GitdmParser.DOMAIN_REGEX, re.UNICODE)

        # Domains must start with alpha numeric or underscore
        # characters.

        # These must work
        for domain in ("__example.org", "9example.org"):
            self.assertIsNotNone(parser.match(domain))

        # While these won't work
        rejected = (
            "'_example.org",
            "/example.org",
            "exa\tmple.org",
            " example.org",
        )
        for domain in rejected:
            self.assertIsNone(parser.match(domain))
Example 23
Project: jawfish   Author: war-and-code   File: test_re.py    License: MIT License 5 votes vote down vote up
def test_special_escapes(self):
        """Check \\b, \\B, \\A, \\Z and the character class escapes."""
        anchored = r"^\Aabc\Z$"
        classes = r"\d\D\w\W\s\S"

        def check_boundaries(*flags):
            # Word-boundary and non-boundary searches both capture "bx".
            self.assertEqual(re.search(r"\b(b.)\b",
                                       "abcd abc bcd bx", *flags).group(1), "bx")
            self.assertEqual(re.search(r"\B(b.)\B",
                                       "abc bcd bc abxd", *flags).group(1), "bx")

        def check_anchors():
            self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
            self.assertEqual(re.search(anchored, "abc", re.M).group(0), "abc")
            self.assertEqual(re.search(anchored, "\nabc\n", re.M), None)

        check_boundaries()
        check_boundaries(re.LOCALE)
        check_boundaries(re.UNICODE)
        check_anchors()
        # The unflagged boundary and anchor checks are run a second time.
        check_boundaries()
        check_anchors()

        for flags in ((), (re.LOCALE,), (re.UNICODE,)):
            self.assertEqual(re.search(classes,
                                       "1aa! a", *flags).group(0), "1aa! a")
Example 24
Project: jawfish   Author: war-and-code   File: test_re.py    License: MIT License 5 votes vote down vote up
def test_bigcharset(self):
        """Match a non-ASCII character set without flags and with re.UNICODE."""
        charset = "([\u2222\u2223])"
        char = "\u2222"
        self.assertEqual(re.match(charset, char).group(1), char)
        self.assertEqual(re.match(charset, char, re.UNICODE).group(1), char)
Example 25
Project: jawfish   Author: war-and-code   File: test_re.py    License: MIT License 5 votes vote down vote up
def test_getlower(self):
        """Check _sre.getlower under each flag and a case-insensitive match."""
        import _sre
        for flags in (0, re.LOCALE, re.UNICODE):
            self.assertEqual(_sre.getlower(ord('A'), flags), ord('a'))

        # The same case-insensitive match is asserted twice.
        for _ in range(2):
            self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
Example 26
Project: jawfish   Author: war-and-code   File: test_re.py    License: MIT License 5 votes vote down vote up
def test_ascii_and_unicode_flag(self):
        """Check how re.ASCII and re.UNICODE apply to str and bytes patterns."""
        # String patterns
        for flags in (0, re.UNICODE):
            pat = re.compile('\xc0', flags | re.IGNORECASE)
            self.assertNotEqual(pat.match('\xe0'), None)
            pat = re.compile(r'\w', flags)
            self.assertNotEqual(pat.match('\xe0'), None)
        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
        self.assertEqual(pat.match('\xe0'), None)
        pat = re.compile('(?a)\xc0', re.IGNORECASE)
        self.assertEqual(pat.match('\xe0'), None)
        pat = re.compile(r'\w', re.ASCII)
        self.assertEqual(pat.match('\xe0'), None)
        pat = re.compile(r'(?a)\w')
        self.assertEqual(pat.match('\xe0'), None)
        # Bytes patterns
        for flags in (0, re.ASCII):
            # Pass the loop's flags through so the re.ASCII case is
            # actually exercised.
            pat = re.compile(b'\xc0', flags | re.IGNORECASE)
            self.assertEqual(pat.match(b'\xe0'), None)
            pat = re.compile(br'\w', flags)
            self.assertEqual(pat.match(b'\xe0'), None)
        # Incompatibilities
        self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
        self.assertRaises(ValueError, re.compile, br'(?u)\w')
        self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
        self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
        self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
        self.assertRaises(ValueError, re.compile, r'(?au)\w')
Example 27
Project: jawfish   Author: war-and-code   File: tokenize.py    License: MIT License 5 votes vote down vote up
def _compile(expr):
    """Compile ``expr`` into a pattern object with the re.UNICODE flag."""
    unicode_pattern = re.compile(expr, flags=re.UNICODE)
    return unicode_pattern
Example 28
Project: razzy-spinner   Author: rafasashi   File: regexp.py    License: GNU General Public License v3.0 5 votes vote down vote up
def _compile(regexp):
    """Compile ``regexp`` wrapped in a single capturing group.

    Group identifiers are removed from the parsed expression first. The
    result is compiled with the UNICODE, MULTILINE and DOTALL flags.
    """
    tree = _remove_group_identifiers(sre_parse.parse(regexp))

    # Add grouping parentheses around the regexp; this will allow
    # us to access the material that was split on.
    # Need to set the Pattern to expect a single group
    state = sre_parse.Pattern()
    state.groups += 1
    wrapper = sre_parse.SubPattern(state)
    wrapper.append((sre_constants.SUBPATTERN, (1, tree)))

    flags = re.UNICODE | re.MULTILINE | re.DOTALL
    return sre_compile.compile(wrapper, flags)
Example 29
Project: razzy-spinner   Author: rafasashi   File: marshal.py    License: GNU General Public License v3.0 5 votes vote down vote up
def unmarshal (self, filename):
        """
        Unmarshals (loads from a plain text file) the tagger model. For
        safety, this operation is intended to be performed only on
        newly created taggers (i.e., without any previous model).

        The first line must contain ``n <int>``; every following line must
        have the form ``[context]:text:tag``. Each entry is stored in
        ``self._model`` under the key ``(context tags tuple, text)``.

        @param filename: Name of the file from which the model will
                         be read.
        @type filename: C{string}
        """
        # open() replaces the Python 2-only file() builtin, and the
        # with-block closes the file even if reading fails.
        with open(filename, "r") as handler:
            lines = handler.readlines()

        # will fail if "n " is not present
        self._n = int(lines[0].split("n ")[1])

        pattern = re.compile(r'^\[(.+)\]:(.+):(.+?)$', re.UNICODE)

        # As the separator-char ":" can be used as a tag or as a text,
        # 'context_pattern' is built based on the context's size (self._n),
        # for example:
        #   self._n = 2 -> r'^(.+?)$', like 'tag1'
        #   self._n = 3 -> r'^(.+?):(.+?)$', like 'tag1:tag2'
        #   self._n = 4 -> r'^(.+?):(.+?):(.+?)$', like 'tag1:tag2:tag3'
        context_pattern_str = r'^(.+?)%s$' % ( r':(.+?)' * (self._n-2) )
        context_pattern = re.compile(context_pattern_str, re.UNICODE)

        for line in lines[1:]:
            # A line (or context) that does not match leaves the match as
            # None, so the groups() call raises AttributeError.
            m = pattern.match(line)
            context, text, tag = m.groups()

            c_m = context_pattern.match(context)
            key = (c_m.groups(), text)
            self._model[key] = tag
Example 30
Project: razzy-spinner   Author: rafasashi   File: regexp.py    License: GNU General Public License v3.0 5 votes vote down vote up
def regexp_tokenize(text, pattern, gaps=False, discard_empty=True,
                    flags=re.UNICODE | re.MULTILINE | re.DOTALL):
    """
    Tokenize *text* with a :class:`.RegexpTokenizer` built from the
    remaining arguments and return the result.  See that class for
    descriptions of the arguments.
    """
    return RegexpTokenizer(pattern, gaps, discard_empty, flags).tokenize(text)