#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Simple tokenizer. Remove or replace unwanted characters, and parse
the poem into a list of lines, each a list of words.
"""
import re
import unicodedata


def remove_accents(string):
    """
    Remove Unicode accents from a string, downgrading each accented
    character to its base character.
    """
    nfkd = unicodedata.normalize('NFKD', string)
    return u"".join(c for c in nfkd if not unicodedata.combining(c))


def tokenize(poem):
    tokens = []
    # Problematic characters to replace before the regex pass
    replacements = {u'-': u' ', u'—': u' ', u'\'d': u'ed'}
    # Apply the replacements cumulatively; restarting from `poem` on
    # each iteration would discard all but the last substitution.
    replaced = poem
    for original, replacement in replacements.items():
        replaced = replaced.replace(original, replacement)
    replaced = remove_accents(replaced)
    # Keep apostrophes, discard other non-alphanumeric symbols
    cleaned = re.sub(r'[^0-9a-zA-Z\s\']', '', replaced)
    for line in cleaned.split('\n'):
        # split() without arguments collapses runs of whitespace, so a
        # dash replaced by a space cannot produce empty tokens
        tokens.append(line.split())
    return tokens
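

# A minimal usage sketch, not part of the original module: the sample
# couplet below is illustrative only, chosen to exercise the punctuation
# stripping and apostrophe handling described above.
if __name__ == '__main__':
    sample = (u"The curfew tolls the knell of parting day,\n"
              u"The lowing herd wind slowly o'er the lea")
    for line in tokenize(sample):
        print(line)
    # Expected output (comma stripped, apostrophe in "o'er" kept):
    # ['The', 'curfew', 'tolls', 'the', 'knell', 'of', 'parting', 'day']
    # ['The', 'lowing', 'herd', 'wind', 'slowly', "o'er", 'the', 'lea']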