"""Tokenizers for natural language, SQL queries, and lambda calculus."""
import nltk
import sqlparse


def nl_tokenize(string):
    """Tokenizes a natural language string into tokens.

    Inputs:
        string: the string to tokenize.
    Outputs:
        a list of tokens.

    Assumes data is space-separated (this is true of ZC07 data in ATIS2/3).
    """
    return nltk.word_tokenize(string)


def sql_tokenize(string):
    """Tokenizes a SQL statement into tokens.

    Inputs:
        string: string to tokenize.
    Outputs:
        a list of tokens, with qualified names rejoined (e.g. the token
        sequence ["t", ".", "col"] becomes the single token "t.col").
    """
    tokens = []
    # sqlparse returns a list of statements; flatten each statement's token
    # tree and keep only the non-whitespace pieces.
    for statement in sqlparse.parse(string):
        for token in sqlparse.sql.TokenList(statement.tokens).flatten():
            strip_token = str(token).strip()
            if strip_token:
                tokens.append(strip_token)

    # Merge qualified names that sqlparse split around ".". An explicit
    # index walk (instead of the flag-based skip) lets us guard both ends:
    # a "." with no token before or after it is kept as-is rather than
    # raising IndexError.
    newtokens = []
    i = 0
    while i < len(tokens):
        token = tokens[i]
        if token == "." and newtokens and i + 1 < len(tokens):
            # Fold "." and the following token into the previous token
            # in place (avoids rebuilding the list on every merge).
            newtokens[-1] = newtokens[-1] + "." + tokens[i + 1]
            i += 2
        else:
            newtokens.append(token)
            i += 1
    return newtokens


def lambda_tokenize(string):
    """Tokenizes a lambda-calculus statement into tokens.

    Inputs:
        string: a lambda-calculus string.
    Outputs:
        a list of tokens, with "(" and ")" as standalone tokens.
    """
    new_tokens = []
    # Split on spaces first, then split each piece around parentheses,
    # emitting each "(" / ")" as its own token. Empty fragments (e.g.
    # from adjacent parens or double spaces) are dropped.
    for chunk in string.split(" "):
        tokens = []
        current_token = ""
        for char in chunk:
            if char == ")" or char == "(":
                tokens.append(current_token)
                tokens.append(char)
                current_token = ""
            else:
                current_token += char
        tokens.append(current_token)
        new_tokens.extend(tok for tok in tokens if tok)
    return new_tokens