import collections import re import os import numpy as np def _tokenize(contents): return re.findall(r'([A-Za-z0-9\.\-_]+|[:;\{\}])', contents) def _strip_comments(file_like): """ Assumes the model file is in the tmhmm/ directory """ model = '/'.join([os.path.dirname(os.path.abspath(__file__)),file_like]) with open(model) as f: lines = f.readlines() return ''.join(filter(lambda l: not l.startswith('#'), lines)) def _parse_list(tokens): parsed_list = [] while True: token = tokens.popleft() if token == ';': tokens.appendleft(token) return tokens, parsed_list parsed_list.append(token) def _parse_map(tokens): parsed_map = collections.OrderedDict() while True: token = tokens.popleft() if token == ';': tokens.appendleft(token) return tokens, parsed_map next_token = tokens.popleft() # Fallback if the map was actually a list if next_token != ':': tokens.appendleft(next_token) tokens.appendleft(token) return tokens, None value = tokens.popleft() parsed_map[token] = float(value) def _parse_state(tokens): state_name = tokens.popleft() tokens.popleft() # "{" parsed_state = {} while True: token = tokens.popleft() if token == '}': return tokens, (state_name, parsed_state) if token in ('trans', 'only'): tokens, value = _parse_map(tokens) if value is None: tokens, value = _parse_list(tokens) elif token in ('type', 'end'): value = int(tokens.popleft()) else: value = tokens.popleft() parsed_state[token] = value tokens.popleft() # ";" def _parse_header(tokens): tokens.popleft() # "header" tokens.popleft() # "{" header = {} while True: token = tokens.popleft() if token == '}': break header[token] = tokens.popleft() tokens.popleft() # ";" return tokens, header def _normalize_states(states): """Normalize states by inheriting parameters explicitly. The TMHMM file format allows parameters to be tied to the parameters of some other state. This basically means that a state inherits the parameters from another state. The normalization performed by this function consists of copying the specified parameters from the parent state to the inheriting state such that all states explicitly specify their transition and emission probabilities. """ for name, state in states.items(): # inherit parent's transition probabilities, but only for # the states specified for this state. if 'tied_trans' in state: parent_state = states[state['tied_trans']] to_states = state['trans'] states[name]['trans'] = dict(zip(state['trans'], parent_state['trans'].values())) # inherit parent's emission probabilities if 'tied_letter' in state: parent_state = state['tied_letter'] states[name]['only'] = dict(states[parent_state]['only']) return states def _to_matrix_form(alphabet, states): """ Convert a model to matrix form. """ # pull out initial probabilities begin = dict(states['begin']) del states['begin'] # build state -> index mapping state_map = {v: k for k, v in enumerate(states)} # build character -> index mapping char_map = {v: k for k, v in enumerate(alphabet)} no_states = len(states) initial = np.zeros(shape=(no_states,)) transitions = np.zeros(shape=(no_states, no_states)) emissions = np.zeros(shape=(no_states, len(alphabet))) label_map = {} name_map = dict(enumerate(states)) # initial probabilities for state_name, trans_prob in begin['trans'].items(): this_state_idx = state_map[state_name] initial[this_state_idx] = trans_prob for state_name, state in states.items(): this_state_idx = state_map[state_name] # label map if 'label' in state: label_map[this_state_idx] = state['label'] # transition probabilities for other_state_name, trans_prob in state['trans'].items(): other_state_idx = state_map[other_state_name] transitions[this_state_idx, other_state_idx] = trans_prob # emission probabilities for character, emission_prob in state['only'].items(): this_character_idx = char_map[character] emissions[this_state_idx, this_character_idx] = emission_prob return initial, transitions, emissions, char_map, label_map, name_map def parse(file_like): """ Parse a model in the TMHMM 2.0 format. :param file_like: a file-like object to read and parse. :return: a model """ contents = _strip_comments(file_like) tokens = collections.deque(_tokenize(contents)) tokens, header = _parse_header(tokens) states = {} while tokens: tokens, (name, state) = _parse_state(tokens) states[name] = state assert not tokens, "list of tokens not consumed completely" return header, _to_matrix_form(header['alphabet'], _normalize_states(states))