# Python source code of celex

"""Tools for working with Celex."""
import re
import csv
import pandas as pd

from ..base import reader, segment_phonology
from itertools import chain
from functools import partial


# Matches a run of one or more "ː" characters.
# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably intended for collapsing repeated length marks -- confirm usage
# elsewhere before removing.
remove_double = re.compile(r"ː+")


# Column index of each field, keyed by (language code, lemmas flag).
# Orthography is always column 1 and frequency column 2; phonology and
# syllables are both read from the same column, which is the only value
# that differs between the six layouts.
PROJECT2FIELD = {
    layout: {'orthography': 1,
             'phonology': column,
             'frequency': 2,
             'syllables': column}
    for layout, column in ((("eng", False), 8),
                           (("nld", False), 6),
                           (("deu", False), 5),
                           (("eng", True), 7),
                           (("nld", True), 5),
                           (("deu", True), 4))
}

# Line layout per (language code, lemmas flag), stored as
# (word_length, struct_length): the number of leading fields shared by every
# row built from a line, and the size of each repeating group of trailing
# fields. A struct_length of 0 means the line has no repeating groups.
lengths = {
    ("nld", True): (11, 0),
    ("eng", True): (4, 4),
    ("deu", True): (11, 0),
    ("nld", False): (7, 0),
    ("eng", False): (5, 4),
    ("deu", False): (7, 0),
}

# Mapping from CELEX phoneme codes to IPA strings.
# NOTE(review): of the three two-character "~" codes only "O~" maps to a
# vowel carrying a combining tilde; "A~" and "&~" map to plain vowels.
# Confirm this asymmetry is intended.
# NOTE(review): "x" and "G" both map to "x".
CELEX_2IPA = {
    # Two-character codes.
    "O~": "ɒ̃",
    "A~": "ɒ",
    "&~": "æ",
    # Single-character codes.
    "p": "p",
    "b": "b",
    "t": "t",
    "d": "d",
    "k": "k",
    "g": "ɡ",
    "N": "ŋ",
    "m": "m",
    "n": "n",
    "l": "l",
    "r": "r",
    "f": "f",
    "v": "v",
    "T": "θ",
    "D": "ð",
    "s": "s",
    "z": "z",
    "S": "ʃ",
    "Z": "ʒ",
    "j": "j",
    "x": "x",
    "h": "h",
    "w": "w",
    "I": "ɪ",
    "E": "ɛ",
    "&": "æ",
    "V": "ʌ",
    "Q": "ɒ",
    "U": "ʊ",
    "@": "ə",
    "O": "ɔ",
    "3": "ɜ",
    "A": "ɑ",
    "a": "a",
    "e": "e",
    "i": "i",
    "o": "o",
    "u": "u",
    "G": "x",
    "y": "y",
    ":": "ː",
}

# Tokenizer for CELEX phoneme codes. Regex alternation takes the first
# alternative that matches, so longer codes are listed first to keep "O~"
# from being read as "O". Sorting by length makes this independent of the
# dict's insertion order, and escaping keeps any future key that contains a
# regex metacharacter from altering the pattern.
celex_regex = re.compile(
    "|".join(re.escape(code)
             for code in sorted(CELEX_2IPA, key=len, reverse=True)))
# Matches a comma or the literal two-character sequence "r*"; syll_func and
# phon_func delete every match from each bracket-delimited piece.
replace = re.compile(r"(,|r\*)")
# Matches a run of one or more "[" / "]" characters; used as the split
# delimiter that separates the bracketed pieces of a transcription.
braces = re.compile(r"[\[\]]+")
# Matches a bracketed segment nested inside an open bracket, e.g. the "[k]"
# in "[b&[k]@r]": group 1 is the text from the outer "[" up to the inner
# "[", group 2 is the nested content, and group 3 is the first character
# after the inner "]" (which must not be "[").
double_braces = re.compile(r"(\[[^\]]+?)\[(.+?)\]([^\[])")


def syll_func(string):
    """Turn a bracketed CELEX syllable string into a tuple of syllables.

    A segment nested inside a bracketed piece is first copied to the end of
    the piece before it and to the start of the piece after it (for example
    "[b&[k]@r]" becomes "[b&k][k@r]"). The string is then split on brackets,
    commas and "r*" markers are removed, and each piece is converted to IPA
    and passed through ``segment_phonology``.

    Returns a tuple with one ``segment_phonology`` result per non-empty
    bracketed piece.
    """
    expanded = double_braces.sub(r"\g<1>\g<2>][\g<2>\g<3>", string)
    cleaned = []
    for piece in braces.split(expanded):
        # Splitting on brackets leaves empty strings at the edges; skip them.
        if piece:
            cleaned.append(replace.sub("", piece))
    return tuple(map(segment_phonology, celex_to_ipa(cleaned)))


def phon_func(string):
    """Turn a bracketed CELEX phonology string into one flat tuple.

    The string is split on brackets, commas and "r*" markers are removed,
    and each non-empty piece is converted to IPA and passed through
    ``segment_phonology``. Unlike ``syll_func``, the per-piece results are
    concatenated, so the returned tuple carries no piece boundaries.
    """
    cleaned = []
    for piece in braces.split(string):
        # Splitting on brackets leaves empty strings at the edges; skip them.
        if piece:
            cleaned.append(replace.sub("", piece))
    segmented = list(map(segment_phonology, celex_to_ipa(cleaned)))
    return tuple(chain(*segmented))


def celex_to_ipa(syllables):
    """Lazily convert CELEX-coded strings to IPA, one output per input.

    Each input string is tokenized with ``celex_regex`` and every token is
    replaced by its ``CELEX_2IPA`` value. Characters that match no CELEX
    code are not tokenized and therefore do not appear in the output.

    Yields one IPA string per element of ``syllables``.
    """
    for coded in syllables:
        tokens = celex_regex.findall(coded)
        yield "".join(CELEX_2IPA[token] for token in tokens)


def _celex_opener(path, word_length, struct_length=0, **kwargs):
    """Open a CELEX file for reading."""
    csv_file = csv.reader(open(path), **kwargs)
    data = []
    for line in csv_file:
        rem = len(line) - word_length
        if rem != 0:
            if not struct_length:
                raise ValueError(line)
            if rem % struct_length:
                raise ValueError(line)

        inform = line[:word_length]
        if struct_length == 0:
            data.append(dict(enumerate(inform)))
            continue
        for x in range(word_length, len(line), struct_length):
            data.append(dict(enumerate(inform + line[x:x+struct_length])))

    return pd.DataFrame(data)


def _celex(path,
           fields,
           lemmas,
           language):
    """Read a CELEX file for ``language`` through the shared ``reader``.

    The ``(language, lemmas)`` pair selects both the line layout from
    ``lengths`` and the field-to-column mapping from ``PROJECT2FIELD``; a
    pair missing from either table raises ``KeyError``. The file is parsed
    as backslash-delimited text without quoting, and the "phonology" and
    "syllables" fields are post-processed with ``phon_func`` and
    ``syll_func`` respectively.

    Returns whatever ``reader`` returns.
    """
    layout = (language, lemmas)
    word_cols, group_cols = lengths[layout]
    return reader(path,
                  fields,
                  PROJECT2FIELD[layout],
                  language,
                  delimiter="\\",
                  quoting=csv.QUOTE_NONE,
                  opener=partial(_celex_opener,
                                 word_length=word_cols,
                                 struct_length=group_cols),
                  preprocessors={"phonology": phon_func,
                                 "syllables": syll_func})


def celex_english(path,
                  fields=("orthography", "syllables", "frequency"),
                  lemmas=False):
    """Read a CELEX file using the "eng" column layouts.

    ``fields`` names the fields to extract. ``lemmas`` selects which of the
    two "eng" layouts is used -- presumably True for the lemma file and
    False for the wordform file; confirm against the CELEX distribution.
    """
    return _celex(path=path, fields=fields, lemmas=lemmas, language="eng")


def celex_dutch(path,
                fields=("orthography", "syllables", "frequency"),
                lemmas=False):
    """Read a CELEX file using the "nld" column layouts.

    ``fields`` names the fields to extract. ``lemmas`` selects which of the
    two "nld" layouts is used -- presumably True for the lemma file and
    False for the wordform file; confirm against the CELEX distribution.
    """
    return _celex(path=path, fields=fields, lemmas=lemmas, language="nld")


def celex_german(path,
                 fields=("orthography", "syllables", "frequency"),
                 lemmas=False):
    """Read a CELEX file using the "deu" column layouts.

    ``fields`` names the fields to extract. ``lemmas`` selects which of the
    two "deu" layouts is used -- presumably True for the lemma file and
    False for the wordform file; confirm against the CELEX distribution.
    """
    return _celex(path=path, fields=fields, lemmas=lemmas, language="deu")