python source code of utils

#!/usr/bin/env python
from __future__ import absolute_import
from __future__ import print_function
import re
from collections import defaultdict
from itertools import compress
import unicodedata

space_punct_re = re.compile(r"[\s\(\)\[\]\.\-\/\,]+")

NUMBERS = {
    'zero': 0,
    'half': 1.0 / 2.0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
}
ORDERS = {
    'hundred': 100,
    'thousand': 1000,
    'million': 1000000,
    'billion': 1000000000,
    'trillion': 1000000000000,
    'gillion': 1000000000,
}


def parse_number(num, default=None):
    try:
        return int(num)
    except ValueError:
        try:
            return float(num)
        except ValueError:
            return default


def parse_spelled_number(num_str):
    """Parse spelled out whole numbers."""
    tokens = []
    for t in num_str.strip().split(' '):
        if len(t) > 0:
            tokens.extend(t.split('-'))
    punctuation = re.compile(r'[\,\(\)]')
    affix = re.compile(r'(\d+)(st|nd|rd|th)')
    cleaned_tokens = []
    for t in tokens:
        if t == 'and':
            continue
        t = punctuation.sub('', t)
        t = affix.sub(r'\1', t)
        cleaned_tokens.append(t.lower())
    if len(cleaned_tokens) == 0:
        return None
    totals = [0]
    numeric_tokens_found = False
    prev_digits = False
    for t in cleaned_tokens:
        number = parse_number(t)
        if number is not None:
            if prev_digits:
                totals[-1] = totals[-1] * 1000 + number
            else:
                totals[-1] += number
        elif t in NUMBERS:
            # Ex: twenty one
            totals[-1] += NUMBERS[t]
        elif t in ORDERS:
            # if order is greater than previous order it should be combined.
            # Ex: five hundrend three thousand
            if len(totals) > 1 and ORDERS[t] > totals[-2]:
                totals[-2] = sum(totals[-2:]) * ORDERS[t]
                totals[-1] = 0
            else:
                totals[-1] *= ORDERS[t]
                totals.append(0)
        else:
            # Sometimes spacy number entities will include words like
            # about, or more than. This allows the initial tokens to be
            # skipped if they can't be parsed as numbers.
            if numeric_tokens_found:
                return None
            else:
                continue
        prev_digits = number is not None
        numeric_tokens_found = True
    if numeric_tokens_found:
        return sum(totals)


def parse_count_text(count_text, verbose=False):
    # verboseprint = print if verbose else lambda *a, **k: None
    if count_text[0] == '0' and len(count_text) > 1:
        return
    try:
        count = int(count_text)
    except ValueError:
        pass  # Try to parse it as a float
    try:
        count = parse_spelled_number(count_text)
    except ValueError:
        pass
    if count is None or int(count) != count:
        return
    else:
        return(count)


def batched(iterable, batch_size=100):
    """
    Sequentially yield segments of the iterable in lists of the given size.
    """
    batch = []
    for idx, item in enumerate(iterable):
        batch.append(item)
        batch_idx = idx % batch_size
        if batch_idx == batch_size - 1:
            yield batch
            batch = []
    yield batch


def flatten(l, unique=False, simplify=False):
    """
    Flatten an arbitrarily deep list or set to a depth-one list.

    simplify -- If the function would return a length-one list, instead
    returns the contents of that list (default False).

    unique -- Removes duplicate values (default False).
    """
    out = []
    for item in l:
        if isinstance(item, (list, tuple)):
            out.extend(flatten(item))
        else:
            out.append(item)
    if unique is True:
        out = list(set(out))
    if simplify is True:
        if len(out) == 0:
            return None
        if len(out) == 1:
            return(out[0])
        else:
            pass
    return out


def merge_dicts(dicts, unique=False, simplify=None):
    """
    Merge a list of dictionaries, returning a single dictionary with combined
    values from all dictionaries in the list.

    The arguments simplify and unique may be supplied as booleans or lists of
    key names. If booleans, they apply to all keys in the dictionary. If lists
    of key names, they are taken to be True for those keys, and False for all
    others.

    unique -- Removes duplicate values (default False).

    simplify -- If the function would return a length-one list, instead
    returns the contents of that list (default None). If simplify is None, the
    function will attempt to be smart, behaving as follows. If there is more
    than one item for a key, that key's value in the merged dict will be a
    list. If only one unique value is to be returned, it will be returned as a
    list if the input dict's value is stored as a list, otherwise it will be
    simplified.
    """
    if not isinstance(dicts, list):
        raise ValueError("first argument must be a list of dicts")

    merged_dicts = defaultdict(list)

    for d in dicts:
        for key, value in d.items():
            merged_dicts[key].append(value)

    for key, value in merged_dicts.items():
        u_arg = unique if isinstance(unique, bool) else (key in unique)

        if simplify is None:
            has_key = [key in d.keys() for d in dicts]
            values = [d[key] for d in compress(dicts, has_key)]
            s_arg = any([not isinstance(v, list) for v in values])
        elif isinstance(simplify, bool):
            s_arg = simplify
        else:
            (key in simplify)

        merged_dicts[key] = flatten(value, simplify=s_arg, unique=u_arg)

    return(dict(merged_dicts))


def verboseprint(verbose=False, *args, **kwargs):
    print(*args, **kwargs) if verbose else lambda *args, **kwargs: None


def median(li):
    if len(li) == 0:
        return None
    mid_idx = int((len(li) - 1) / 2)
    li = sorted(li)
    if len(li) % 2 == 1:
        return li[mid_idx]
    else:
        return (li[mid_idx] + li[mid_idx + 1]) / 2


def normalize_text(text):
    """
    Attempt to convert text to a simplified representation so it can
    be compared ignoring diacritical marks, differences in character codes
    for similar symbols, differences in whitespace, and similar issues.
    If the simplified text ends up being too short to form an
    useful representation of the text, the original text will be returned.
    That will happen when non-latin text is used because non-latin charaters are
    usually removed from the simplified text.
    """
    text = text.replace('\u2019', "'")
    result = unicodedata.normalize('NFKD', text)\
        .encode('ascii', 'ignore').decode()
    result = space_punct_re.sub(' ', result).strip()
    if len(result) < 3:
        result = space_punct_re.sub(' ', text).strip()
    return result


def normalize_disease_name(name):
    if len(name) <= 6:
        # Shorter syn_strings are likely to be acronyms so only exact
        # matches are allowed.
        return name
    else:
        name = re.sub(r"['\"]", "", name)
        return re.sub(r"[\s\-\/]+", " ", name).strip()