python source code of __init_

# -*- coding: utf-8 -*-

"""
The `find_job_titles` library finds mentions of job titles in strings.

To do so, it compiles a search data structure (an Aho-Corasick automaton) from
    a precompiled reference list of >70k job titles.

It returns the longest matching job titles (nested matches are collapsed into
    the longest one, partially overlapping matches are all kept),
    together with the start and end position in the given string.

TODO:
* also compare to https://github.com/scrapinghub/webstruct/blob/master/webstruct/utils.py#L155
"""

# Package metadata (author, contact address and release version).
__author__ = 'Johannes Ahlmann'
__email__ = 'johannes@fluquid.com'
__version__ = '0.7.1'

import gzip
from pkg_resources import resource_stream
import logging
from collections import namedtuple

from acora import AcoraBuilder
import ahocorasick


# Result record for a single hit: `start` / `end` positions in the searched
# string and the matched title `match`.
# NOTE(review): `add_start` sets `end = start + len(match)`, whereas
# `FinderPyaho.find_raw` keeps the automaton's end index and derives
# `start = end - len(match) + 1`; the two finders therefore differ by one
# in `end` -- confirm which convention callers expect.
Match = namedtuple('Match', ['start', 'end', 'match'])


def load_titles():
    """
    Yield the job titles bundled with the library, one per line.

    Reads the gzipped resource `data/titles_combined.txt.gz` of the
    `find_job_titles` package lazily.

    :returns: generator of titles, UTF-8 decoded and stripped of
              surrounding whitespace
    """
    stream = resource_stream('find_job_titles', 'data/titles_combined.txt.gz')
    with stream as raw, gzip.GzipFile(fileobj=raw, mode='r') as archive:
        for raw_line in archive:
            # Note: bytes are decoded by hand (instead of opening in "rt"
            #       mode) to stay py2 compatible
            # TODO: using pyahocorasick this should now be 'str' again ;(
            yield raw_line.decode('utf-8').strip()


def longest_match(matches):
    """
    find respective longest matches from all overlapping aho corasick matches

    Consecutive matches are merged only when one of them fully contains the
    other; the longer of the two is kept (the earlier one wins on a tie).
    Matches that merely overlap partially, or do not overlap at all, are
    yielded separately in input order.

    :param matches: any iterable (list, generator, iterator, ...) of objects
                    with numeric `start` and `end` attributes
    :returns: generator of the longest matches
    """
    # `iter()` lets callers pass plain lists/tuples as well as iterators;
    # calling it on an iterator simply returns that iterator.
    matches = iter(matches)

    # An empty input (or a leading `None`) produces no results.
    longest = next(matches, None)
    if longest is None:
        return

    for elt in matches:
        # if (a contains b) or (b contains a)
        if (elt.start >= longest.start and elt.end <= longest.end) or \
           (longest.start >= elt.start and longest.end <= elt.end):
            longest = max(longest, elt, key=lambda x: x.end - x.start)
        else:
            # No containment: the current candidate is final, start a new one.
            yield longest
            longest = elt
    yield longest


def add_start(matches):
    """
    turn acora-style `(match, start)` pairs into `Match` records

    The `end` of each record is computed as `start + len(match)`.

    :param matches: iterable of `(match, start)` tuples
    :returns: generator of `Match(start, end, match)`
    """
    def to_match(keyword, offset):
        return Match(start=offset, end=offset + len(keyword), match=keyword)

    return (to_match(keyword, offset) for keyword, offset in matches)


class BaseFinder(object):
    """
    Shared query interface for the finder implementations.

    Subclasses are expected to provide `find_raw(string)`, which is the only
    source of matches used by the methods below.
    """

    def findall(self, string, use_longest=True):
        """
        eager variant of `finditer`: collect all results into a `list`
        :param string: string to search target patterns in
        :param use_longest: if True only return longest matches,
                            else return all overlapping matches
        :returns: list of matches of type `Match`
        """
        found = self.finditer(string, use_longest=use_longest)
        return list(found)

    def finditer(self, string, use_longest=True):
        """
        lazily produce all (longest) matches of target patterns in `string`
        :param string: string to search target patterns in
        :param use_longest: if True only return longest matches,
                            else return all overlapping matches
        :returns: generator of matches of type `Match`
        """
        raw_matches = self.find_raw(string)
        if not use_longest:
            # caller asked for every overlapping hit, unfiltered
            return raw_matches
        return longest_match(raw_matches)


class FinderAcora(BaseFinder):
    """
    Finder class based on "acora" library.

    Note: Building data structure seems to be significantly slower than with
          pyahocorasick
    """

    def __init__(self, use_unicode=True, ignore_case=False, titles=None, extra_titles=None):
        """
        :param use_unicode: whether to use `titles` as unicode or bytestrings
        :param ignore_case: if True ignore case in all matches
        :param titles: if given, overrides default `load_titles()` values
        :param extra_titles: if given, add to titles; an iterable of titles
                             (a single string is treated as one title)
        """
        titles = titles if titles else load_titles()
        titles = (titles
                  if use_unicode
                  else (s.encode('ascii') for s in titles))
        builder = AcoraBuilder()
        logging.info('building job title searcher')
        builder.update(titles)
        if extra_titles:
            # `builder.add()` takes keywords as separate positional arguments,
            # so handing it the whole collection would register the container
            # itself as one keyword. `update()` consumes an iterable instead.
            if isinstance(extra_titles, (bytes, type(u''))):
                # keep accepting a lone title string as a single keyword
                extra_titles = [extra_titles]
            # NOTE(review): unlike `titles`, `extra_titles` is not encoded
            # when `use_unicode` is False; presumably callers must pass
            # bytestrings in that case -- confirm.
            builder.update(extra_titles)

        self.ac = builder.build(ignore_case=ignore_case)
        logging.info('building done')

    def find_raw(self, string):
        """
        generator of raw, overlapping matches of all lengths from automaton

        :param string: string to search target patterns in
        :returns: generator of `Match` built by `add_start` from the
                  automaton's `finditer` results
        """
        return add_start(self.ac.finditer(string))


class FinderPyaho(BaseFinder):
    """
    Job title finder backed by the "pyahocorasick" library.

    TODO:
    - use pickle and unpickle support for `self.autom`
    """

    def __init__(self, ignore_case=True, titles=None, extra_titles=None):
        """
        Build the automaton from the given (or bundled) titles.

        :param ignore_case if True, lower case job titles are also added
        :param titles: if given, overrides default `load_titles()` values
        :param extra_titles: if given, add to titles
        """
        if not titles:
            titles = load_titles()
        logging.info('building job title searcher')
        automaton = ahocorasick.Automaton()

        def register(word):
            # each word is stored with itself as the payload returned on a hit
            automaton.add_word(word, word)
            if ignore_case:
                lowered = word.lower()
                automaton.add_word(lowered, lowered)

        for title in titles:
            register(title)

        if extra_titles:
            for title in extra_titles:
                register(title)

        automaton.make_automaton()
        self.autom = automaton
        logging.info('building done')

    def find_raw(self, string):
        """
        yield every raw, overlapping match of any length found by the automaton

        `end` is the index reported by the automaton; `start` is derived from
        it as `end - len(match) + 1`.
        """
        for last_idx, title in self.autom.iter(string):
            first_idx = last_idx - len(title) + 1
            yield Match(start=first_idx, end=last_idx, match=title)


# Default finder implementation exposed under the generic name `Finder`.
Finder = FinderPyaho