import json import random import re import time import zlib from enum import unique, Enum from typing import Optional, List import numpy as np from spacy.tokens import Doc, Span, Token from config.ml import MARKOV_WINDOW_SIZE, MARKOV_GENERATION_WEIGHT_COUNT, MARKOV_GENERATION_WEIGHT_RATING, \ MARKOV_GENERATE_SUBJECT_POS_PRIORITY, MARKOV_GENERATE_SUBJECT_MAX, \ CAPITALIZATION_COMPOUND_RULES, MARKOV_MODEL_TEMPERATURE from common.ml import one_hot, temp from common.nlp import Pos, CapitalizationMode class WordKey(object): TEXT = '_T' POS = '_P' COMPOUND = '_C' @unique class NeighborIdx(Enum): TEXT = 0 POS = 1 COMPOUND = 2 VALUE_MATRIX = 3 DISTANCE_MATRIX = 4 @unique class NeighborValueIdx(Enum): COUNT = 0 RATING = 1 class MarkovNeighbor(object): def __init__(self, key: str, text: str, pos: Pos, compound: bool, values: list, dist: list): self.key = key self.text = text self.pos = pos self.compound = compound self.values = values self.dist = dist def __repr__(self): return self.text @staticmethod def from_token(token: Token) -> 'MarkovNeighbor': key = token.text.lower() text = token.text if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND: compound = True else: compound = False pos = Pos.from_token(token) values = [0, 0] dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1) return MarkovNeighbor(key, text, pos, compound, values, dist) @staticmethod def from_db_format(key: str, val: list) -> 'MarkovNeighbor': key = key text = val[NeighborIdx.TEXT.value] pos = Pos(val[NeighborIdx.POS.value]) compound = val[NeighborIdx.COMPOUND.value] values = val[NeighborIdx.VALUE_MATRIX.value] dist = val[NeighborIdx.DISTANCE_MATRIX.value] return MarkovNeighbor(key, text, pos, compound, values, dist) def to_db_format(self) -> tuple: return self.key, [self.text, self.pos.value, self.compound, self.values, self.dist] @staticmethod def distance_one_hot(dist): return one_hot(dist + MARKOV_WINDOW_SIZE, MARKOV_WINDOW_SIZE * 2 + 1) class MarkovNeighbors(object): def __init__(self, neighbors: List[MarkovNeighbor]): self.neighbors = neighbors def __iter__(self) -> MarkovNeighbor: for neighbor in self.neighbors: yield neighbor def __len__(self): return len(self.neighbors) def __getitem__(self, item): return self.neighbors[item] @unique class ProjectionDirection(Enum): LEFT = 1 RIGHT = 2 class MarkovWordProjection(object): def __init__(self, magnitudes: np.ndarray, distances: np.ndarray, keys: List[str], pos: List[Pos]): self.magnitudes = magnitudes self.distances = distances self.keys = keys self.pos = pos def __len__(self): return len(self.keys) class MarkovWordProjectionCollection(object): def __init__(self, projections: List[MarkovWordProjection]): self.magnitudes = None self.distances = None self.keys = [] self.pos = [] self._concat_collection(projections) def _concat_collection(self, projections: List[MarkovWordProjection]): for projection_idx, projection in enumerate(projections): self.keys += projection.keys self.pos += projection.pos if projection_idx == 0: self.magnitudes = projection.magnitudes self.distances = projection.distances else: self.magnitudes = np.concatenate((self.magnitudes, projection.magnitudes)) self.distances = np.concatenate((self.distances, projection.distances)) def probability_matrix(self) -> np.ndarray: distance_magnitudes = self.distances * self.magnitudes sums = np.sum(distance_magnitudes, axis=0) p_values = distance_magnitudes / sums return p_values def __len__(self): return len(self.keys) class MarkovWord(object): def __init__(self, text: str, pos: Pos, compound: bool, neighbors: dict): self.text = text self.pos = pos self.compound = compound self.neighbors = neighbors def __repr__(self): return self.text def to_db_format(self) -> tuple: return {WordKey.TEXT: self.text, WordKey.POS: self.pos.value, WordKey.COMPOUND: self.compound}, { MarkovTrieDb.NEIGHBORS_KEY: self.neighbors} @staticmethod def from_db_format(row: dict) -> 'MarkovWord': word = MarkovWord(row[MarkovTrieDb.WORD_KEY][WordKey.TEXT], Pos(row[MarkovTrieDb.WORD_KEY][WordKey.POS]), row[MarkovTrieDb.WORD_KEY][WordKey.COMPOUND], row[MarkovTrieDb.NEIGHBORS_KEY]) return word @staticmethod def from_token(token: Token) -> 'MarkovWord': if CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND: compound = True else: compound = False return MarkovWord(token.text, Pos.from_token(token), compound=compound, neighbors={}) def get_neighbor(self, key: str) -> Optional[MarkovNeighbor]: if key in self.neighbors: n_row = self.neighbors[key] return MarkovNeighbor.from_db_format(key, n_row) return None def set_neighbor(self, neighbor: MarkovNeighbor): key, row = neighbor.to_db_format() self.neighbors[key] = row def select_neighbors(self, pos: Optional[Pos], exclude_key: Optional[str] = None) -> MarkovNeighbors: results = [] for key in self.neighbors: neighbor = self.get_neighbor(key) if exclude_key is not None and exclude_key == neighbor.key: continue elif pos == neighbor.pos or pos is None: results.append(neighbor) return MarkovNeighbors(results) def project(self, idx_in_sentence: int, sentence_length: int, pos: Pos, exclude_key: Optional[str] = None) -> MarkovWordProjection: # Get all neighbors neighbors = self.select_neighbors(pos, exclude_key=exclude_key) neighbor_keys = [] neighbor_pos = [] # Setup matrices distance_distributions = np.zeros((len(neighbors), sentence_length)) neighbor_magnitudes = np.zeros((len(neighbors), 1)) for neighbor_idx, neighbor in enumerate(neighbors): # Save Key neighbor_keys.append(neighbor.text) neighbor_pos.append(neighbor.pos) # Project dist values onto matrix space for dist_idx, dist_value in enumerate(neighbor.dist): # The actual index of this dist value within our matrix space dist_space_index = (dist_idx - MARKOV_WINDOW_SIZE) + idx_in_sentence # Bounds check if not (dist_space_index >= 0 and dist_space_index < sentence_length): continue distance_distributions[neighbor_idx][dist_space_index] = dist_value # Calculate strength neighbor_magnitudes[neighbor_idx] = neighbor.values[ NeighborValueIdx.COUNT.value] * MARKOV_GENERATION_WEIGHT_COUNT \ + \ neighbor.values[ NeighborValueIdx.RATING.value] * MARKOV_GENERATION_WEIGHT_RATING return MarkovWordProjection(neighbor_magnitudes, distance_distributions, neighbor_keys, neighbor_pos) class GeneratedWord(MarkovWord): def __init__(self, text: str, pos: Pos, compound: bool, neighbors: dict, mode: CapitalizationMode): MarkovWord.__init__(self, text, pos, compound, neighbors) self.mode = mode @staticmethod def from_markov_word(word: MarkovWord, mode: CapitalizationMode): return GeneratedWord(word.text, word.pos, word.compound, word.neighbors, mode=mode) class MarkovTrieDb(object): WORD_KEY = '_W' NEIGHBORS_KEY = '_N' def __init__(self, path: str = None): np.random.seed(int(time.time())) self._trie = {} if path is not None: self.load(path) def load(self, path: str): data = zlib.decompress(open(path, 'rb').read()).decode() self._trie = json.loads(data) def save(self, path: str): data = zlib.compress(json.dumps(self._trie, separators=(',', ':')).encode()) open(path, 'wb').write(data) def _getnode(self, word: str) -> Optional[dict]: if len(word) == 0: return None node = self._trie for c in word: try: node = node[c.lower()] except KeyError: return None return node def _select(self, word: str) -> Optional[dict]: node = self._getnode(word) if node is None: return None if MarkovTrieDb.WORD_KEY in node: return node else: return None def select(self, word: str) -> MarkovWord: row = self._select(word) return MarkovWord.from_db_format(row) if row is not None else None def _insert(self, word: str, pos: int, compound: bool, neighbors: dict) -> Optional[dict]: node = self._trie for c in word: if c.lower() in node: node = node[c.lower()] else: node[c.lower()] = {} node = node[c.lower()] node[MarkovTrieDb.WORD_KEY] = {WordKey.TEXT: word, WordKey.POS: pos, WordKey.COMPOUND: compound} node[MarkovTrieDb.NEIGHBORS_KEY] = neighbors return node def insert(self, word: MarkovWord) -> MarkovWord: row = self._insert(word.text, word.pos.value, word.compound, word.neighbors) return MarkovWord.from_db_format(row) if row is not None else None def _update(self, word: str, pos: int, compound: bool, neighbors: dict) -> Optional[dict]: node = self._select(word) if node is None: return None node[MarkovTrieDb.WORD_KEY] = {WordKey.TEXT: word, WordKey.POS: pos, WordKey.COMPOUND: compound} node[MarkovTrieDb.NEIGHBORS_KEY] = neighbors return node def update(self, word: MarkovWord) -> Optional[MarkovWord]: node = self._update(word.text, word.pos.value, word.compound, word.neighbors) return MarkovWord.from_db_format(node) if node is not None else None class MarkovGenerator(object): def __init__(self, structure_generator, subjects: List[MarkovWord]): self.structure_generator = structure_generator self.subjects = subjects self.sentence_generations = [] self.sentence_structures = [] def _reset_data(self): self.sentence_generations = [] self.sentence_structures = [] def _sort_subjects(self): sorted_subjects = [] for subject_priority in MARKOV_GENERATE_SUBJECT_POS_PRIORITY: for subject in self.subjects: if subject.pos == subject_priority: sorted_subjects.append(subject) self.subjects = sorted_subjects def generate(self, db: MarkovTrieDb) -> Optional[List[List[GeneratedWord]]]: # Try to much subject to a variety of sentence structures subjects_assigned = False for i in range(0, 10): self._split_sentences() self._sort_subjects() if self._assign_subjects(): subjects_assigned = True break self._reset_data() if not subjects_assigned: return None if not self._generate_words(db): approximation = [] for sentence in self.sentence_generations: for word in sentence: if word is not None: approximation.append(word) if len(approximation) > 0: return [approximation] else: return None return self.sentence_generations # Split into individual sentences and populate generation arrays def _split_sentences(self): structure = next(self.structure_generator) start_index = 0 for word_idx, word in enumerate(structure): if word.pos == Pos.EOS: # Separate structures into sentences sentence = structure[start_index:word_idx] self.sentence_structures.append(sentence) # Create unfilled arrays for each sentence to populate later generates = [None] * len(sentence) self.sentence_generations.append(generates) start_index = word_idx + 1 # Assign one subject to each sentence with descending priority def _assign_subjects(self) -> bool: sentences_assigned = [False] * len(self.sentence_structures) for sentence_idx, sentence in enumerate(self.sentence_structures): subjects_assigned = [] word_break = False for word_idx, word in enumerate(sentence): for subject in self.subjects: # Assigned a maximum number of subjects per sentence if len(subjects_assigned) > MARKOV_GENERATE_SUBJECT_MAX: word_break = True break # Don't assign sam subject twice pers sentence if subject.text in subjects_assigned: continue if subject.pos == word.pos: subjects_assigned.append(subject.text) self.sentence_generations[sentence_idx][word_idx] = GeneratedWord.from_markov_word( subject, self.sentence_structures[sentence_idx][word_idx].mode) sentences_assigned[sentence_idx] = True break if word_break: break # Each sentence should be assigned one subject to begin with for sentence in sentences_assigned: if sentence is False: return False return True def _work_remaining(self): work_left = 0 for sentence_idx, sentence in enumerate(self.sentence_generations): for word_idx, word in enumerate(sentence): if word is None: work_left += 1 return work_left def _generate_words(self, db: MarkovTrieDb): old_work_left = self._work_remaining() while True: for sentence_idx, sentence in enumerate(self.sentence_generations): sentence_length = len(sentence) def handle_projections(exclude_key: Optional[str] = None) -> bool: if blank_idx is None or len(project_idx) == 0: return False projections = [] blank_pos = self.sentence_structures[sentence_idx][blank_idx].pos for word_idx in project_idx: projecting_word = self.sentence_generations[sentence_idx][word_idx] projection = projecting_word.project(word_idx, sentence_length, blank_pos, exclude_key=exclude_key) projections.append(projection) # Concatenate all projections and create p-value matrix projection_collection = MarkovWordProjectionCollection(projections) if len(projection_collection) == 0: return False all_p_values = projection_collection.probability_matrix() # We just want the p-values for the blank word p_values = all_p_values[:, blank_idx] # Choose an index based on the probability choices = np.arange(len(projection_collection)) word_choice_idx = temp(p_values, temperature=MARKOV_MODEL_TEMPERATURE) # Select the word from the database and assign it to the blank space select_word = projection_collection.keys[word_choice_idx] word = GeneratedWord.from_markov_word(db.select(select_word), self.sentence_structures[sentence_idx][blank_idx].mode) self.sentence_generations[sentence_idx][blank_idx] = word # Scan left to right # Work right to left blank_idx = None project_idx = [] for word_idx, word in enumerate(sentence): if word is None: blank_idx = word_idx elif blank_idx is not None and abs(blank_idx - word_idx) <= MARKOV_WINDOW_SIZE: project_idx.append(word_idx) break handle_projections() # Scan right to left # Work left to right blank_idx = None project_idx = [] for word_idx, word in enumerate(reversed(sentence)): word_idx = (len(sentence) - 1) - word_idx if word is None: blank_idx = word_idx elif blank_idx is not None and abs(blank_idx - word_idx) <= MARKOV_WINDOW_SIZE: project_idx.append(word_idx) break handle_projections() # Check if we accomplished any work new_work_left = self._work_remaining() if old_work_left == new_work_left: return False elif new_work_left == 0: return True old_work_left = new_work_left class MarkovFilters(object): @staticmethod def filter_input(text: str): if text is None: return None filtered = text urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text) # Replace all URLS with a unique token url_token = 'URL%s' % random.getrandbits(64) for url in urls: filtered = filtered.replace(url, url_token) filtered = re.sub(r'(&)', '', filtered) filtered = re.sub(r'[,:;\'`\-_“^"<>(){}/\\*]', '', filtered) # Swamp URLs back for token for url in urls: filtered = filtered.replace(url_token, url) return filtered @staticmethod def smooth_output(text: str): if text is None: return None smoothed = text smoothed = re.sub(r'([$]) ', r'\1', smoothed) smoothed = re.sub(r' ([.,?!%])', r'\1', smoothed) smoothed = re.sub(r' ([\']) ', r'\1', smoothed) return smoothed class MarkovTrainer(object): def __init__(self, engine: MarkovTrieDb): self.engine = engine def learn(self, doc: Doc): bi_grams = [] for sentence in doc.sents: bi_grams += MarkovTrainer.span_to_bigram(sentence) row_cache = {} for ngram in bi_grams: if ngram[0].text in row_cache: word = row_cache[ngram[0].text] else: # Attempt to load from DB word = self.engine.select(ngram[0].text) if word is None: # If not already in the DB, create a new word object word = MarkovWord.from_token(ngram[0]) # Handle neighbor neighbor_lookup_key = ngram[1].text.lower() neighbor = word.get_neighbor(neighbor_lookup_key) if neighbor is None: neighbor = MarkovNeighbor.from_token(ngram[1]) # Increase Count neighbor.values[NeighborValueIdx.COUNT.value] += 1 # Add distance dist_one_hot_base = np.array(neighbor.dist) dist_one_hot_add = np.array(MarkovNeighbor.distance_one_hot(ngram[2])) neighbor.dist = (dist_one_hot_base + dist_one_hot_add).tolist() # Convert to db format and store in word key, neighbor_db = neighbor.to_db_format() word.neighbors[key] = neighbor_db # Write word to DB if self.engine.update(word) is None: self.engine.insert(word) # Cache word row_cache[ngram[0].text] = word @staticmethod def span_to_bigram(span: Span) -> list: grams = [] for a_idx, a in enumerate(span): for b_idx, b in enumerate(span): dist = b_idx - a_idx if dist == 0: continue elif abs(dist) <= MARKOV_WINDOW_SIZE: grams.append([a, b, dist]) return grams