# Python source code of morphoconllu

#!/usr/bin/env python

# Variant of CoNLL-U support library that operates on morphology only.

# Specifically, supports working on files that only contain the fields
# FORM, LEMMA, CPOSTAG, POSTAG and FEATS (five tab-separated columns).

import re

from itertools import groupby

from config import DEBUG

# Feature name-value separator: each entry in an Element's feature list is
# a single string of the form name + FSEP + value.
FSEP = '='

class FormatError(Exception):
    """Raised when an input line does not conform to the expected format.

    Attributes:
        msg: description of the problem.
        line: the offending input line (text or bytes), or None if unknown.
        linenum: number of the offending line, or None if unknown.
    """

    def __init__(self, msg, line=None, linenum=None):
        # Initialise the base class so that e.args, repr() and pickling
        # carry the message like any other exception.
        Exception.__init__(self, msg)
        self.msg = msg
        self.line = line
        self.linenum = linenum

    def __str__(self):
        """Return the message plus an ASCII-safe copy of the line and its number."""
        msg = self.msg
        if self.line is not None:
            line = self.line
            # Normalise to text first so that non-ASCII byte strings are
            # replaced rather than triggering an implicit-decode error.
            if isinstance(line, bytes):
                line = line.decode('ascii', 'replace')
            safe = line.encode('ascii', 'replace')
            # On Python 3 encode() yields bytes; convert back to the native
            # str type so it can be concatenated with the message.
            if not isinstance(safe, str):
                safe = safe.decode('ascii')
            msg += ' "' + safe + '"'
        if self.linenum is not None:
            msg += ' (line %d)' % self.linenum
        return msg

# Character-set constraints for the two tag fields: one or more ASCII
# letters (a-z, A-Z). Applied by Element.validate().
CPOSTAG_RE = re.compile(r'^[a-zA-Z]+$')
POSTAG_RE = re.compile(r'^[a-zA-Z]+$')

class Element(object):
    """One token line with the fields FORM, LEMMA, CPOSTAG, POSTAG and FEATS.

    Features are stored in ``_feats`` as a list of "name<FSEP>value" strings.
    ``_fmap`` lazily caches the same data as a dict (see feat_map()) and is
    reset to None whenever the feature list changes.
    """

    def __init__(self, form, lemma, cpostag, postag, feats):
        self.form = form
        self.lemma = lemma
        self.cpostag = cpostag
        self.postag = postag
        self._feats = feats
        # lazily built name -> value cache
        self._fmap = None

        if DEBUG:
            self.validate()

    def validate(self):
        """Run minimal (incomplete) format checks.

        Raises:
            FormatError: if a tag has invalid characters, or a feature is
                empty, lacks the separator, or repeats a feature name.
        """
        # some character set constraints
        if not CPOSTAG_RE.match(self.cpostag):
            raise FormatError('invalid CPOSTAG: %s' % self.cpostag)
        if not POSTAG_RE.match(self.postag):
            raise FormatError('invalid POSTAG: %s' % self.postag)

        # no feature is empty
        if any(len(s) == 0 for s in self._feats):
            raise FormatError('empty feature: %s' % str(self._feats))

        # feature names and values separated by feature separator
        if any(len(s.split(FSEP)) < 2 for s in self._feats):
            raise FormatError('invalid features: %s' % str(self._feats))

        # no feature name repeats; test the group size itself so that a
        # repeated empty name ("=a", "=b") is detected as well
        names = sorted(s.split(FSEP)[0] for s in self._feats)
        if any(len(list(group)) > 1 for _, group in groupby(names)):
            raise FormatError('duplicate features: %s' % str(self._feats))

    def is_word(self):
        """Return True if this element is a word (has an integer ID).

        Elements of this morphology-only format carry no ID field, so an
        element without an ``id`` attribute is treated as a word. If an
        ``id`` has been attached, it must parse as an integer.
        """
        ident = getattr(self, 'id', None)
        if ident is None:
            return True
        try:
            int(ident)
        except ValueError:
            return False
        return True

    def has_feat(self, name):
        """Return True if a feature called ``name`` is present."""
        return name in self.feat_map()

    def add_feats(self, feats):
        """Append features given as an iterable of (name, value) pairs."""
        # Materialise first: the pairs are traversed twice, which would
        # silently add nothing for a one-shot iterable.
        feats = list(feats)
        # name-value pairs
        assert all(len(nv) == 2 for nv in feats)
        self._feats.extend(FSEP.join(nv) for nv in feats)
        self._fmap = None

    def set_feats(self, feats):
        """Replace all features with the given (name, value) pairs."""
        # Materialise before clearing in case ``feats`` is lazily derived
        # from the current feature list.
        feats = list(feats)
        self._feats = []
        self.add_feats(feats)

    def remove_feat(self, name, value):
        """Remove the feature ``name=value``; raises ValueError if absent."""
        nv = FSEP.join((name, value))
        self._feats.remove(nv)
        self._fmap = None

    def append_misc(self, value):
        """Append ``value`` to the ``misc`` attribute, '|'-separated.

        The constructor does not set ``misc``; a missing attribute is
        treated like the empty marker '_'.
        """
        current = getattr(self, 'misc', '_')
        if current == '_':
            self.misc = value
        else:
            self.misc = current + '|' + value

    def feat_names(self):
        """Return the list of feature names, in stored order."""
        return [f.split(FSEP)[0] for f in self._feats]

    def feat_map(self):
        """Return a (cached) dict mapping feature names to values.

        Raises:
            ValueError: if a stored feature lacks the separator.
        """
        if self._fmap is None:
            try:
                self._fmap = dict(f.split(FSEP, 1) for f in self._feats)
            except ValueError:
                raise ValueError('failed to convert ' + str(self._feats))
        return self._fmap

    def wipe_annotation(self):
        """Reset lemma, tags and features to empty, keeping only the form."""
        self.lemma = '_'
        self.cpostag = '_'
        self.postag = '_'
        # Keep _feats a list (serialised as '_') so that add_feats(),
        # feat_names() and feat_map() keep working, and drop the stale cache.
        self._feats = []
        self._fmap = None

    def __unicode__(self):
        """Return the tab-separated line; features sorted case-insensitively."""
        if not self._feats:
            feats = '_'
        else:
            feats = '|'.join(sorted(self._feats, key=lambda s: s.lower()))
        return '\t'.join(
            [self.form, self.lemma, self.cpostag, self.postag, feats])

    @classmethod
    def from_string(cls, s):
        """Parse a tab-separated five-field line into an Element.

        Raises:
            FormatError: if the line does not have exactly five fields.
        """
        fields = s.split('\t')
        if len(fields) != 5:
            raise FormatError('%d fields' % len(fields), s)
        form, lemma, cpostag, postag, feats = fields
        feats = [] if feats == '_' else feats.split('|')
        return cls(form, lemma, cpostag, postag, feats)

class DummySentence(object):
    """Single-word stand-in for a sentence, mimicking the CoNLL-U library API."""

    def __init__(self, word):
        # the one and only word of this "sentence"
        self.word = word

    def words(self):
        """Generate the words of the sentence, i.e. just the wrapped word."""
        for only_word in (self.word,):
            yield only_word

    def __unicode__(self):
        """Return the unicode serialization of the wrapped word."""
        text = unicode(self.word)
        return text
        
def read_conllu(f):
    """Iterate over the lines of ``f``, yielding one item per line.

    Blank lines and lines starting with '#' are yielded unchanged as strings
    (minus the trailing newline); every other line is parsed with
    Element.from_string and yielded wrapped in a DummySentence.

    Raises:
        FormatError: if a line cannot be parsed; ``linenum`` is set to the
            1-based number of the offending line.
    """
    for linenum, line in enumerate(f, 1):
        line = line.rstrip('\n')
        if not line or line[0] == '#':
            # TODO: blank and comment lines are passed through as-is
            yield line
            continue
        try:
            element = Element.from_string(line)
        except FormatError as e:
            e.linenum = linenum
            # bare raise keeps the original traceback
            raise
        yield DummySentence(element)