# Reads and tokenizes all Python files in the given path.
import os
import tokenize
import itertools  # NOTE(review): appears unused here — confirm before removing

# Special vocabulary tokens.  "§" is used as the marker character because the
# downstream n-gram processor splits on spaces, so "§" cannot collide with
# anything the tokenizer emits after space-replacement below.
pad_token, pad_id = "§PAD§", 0
oov_token, oov_id = "§OOV§", 1
number_token = "§NUM§"


def read_data(path, listfile):
    """Yield one list of preprocessed token strings per Python source file.

    Args:
        path: Directory that the file names are resolved against.
        listfile: Either a list of file names, or the path of a text file
            containing one file name per line.

    Yields:
        For each readable, tokenizable file: a list of token strings with
        comments, triple-quoted string tokens, and empty-valued tokens
        removed (DEDENT tokens, whose value is always "", are kept so they
        can be rendered as "<dedent>").

    Files that cannot be opened or tokenized are silently skipped — this is
    deliberate best-effort behaviour, but the handler is narrowed to the
    exceptions those operations actually raise (the original bare `except:`
    also swallowed KeyboardInterrupt and SystemExit).
    """
    if isinstance(listfile, list):
        python_files = [os.path.join(path, name) for name in listfile]
    else:
        with open(listfile) as f:
            python_files = [os.path.join(path, line) for line in f.read().splitlines()]

    for filename in python_files:
        try:
            with open(filename) as f:
                tokens = list(tokenize.generate_tokens(f.readline))
        except (OSError, SyntaxError, UnicodeError, tokenize.TokenError):
            # Unreadable, mis-encoded, or syntactically broken file: skip it
            # rather than aborting the whole run.
            continue
        yield [
            preprocess(tok_type, tok_val)
            for tok_type, tok_val, _, _, _ in tokens
            if tok_type != tokenize.COMMENT
            and not tok_val.startswith("'''")
            and not tok_val.startswith('"""')
            and (tok_type == tokenize.DEDENT or tok_val != "")
        ]


def preprocess(tokentype, tokenval):
    """Map one raw token to the string form used by the n-gram processor.

    NUMBER tokens are collapsed to the single §NUM§ placeholder;
    INDENT/DEDENT become explicit marker tokens; every other token has its
    spaces and newlines replaced so it survives space-splitting as one unit.
    """
    if tokentype == tokenize.NUMBER:
        return number_token
    if tokentype == tokenize.INDENT:
        return "<indent>"
    if tokentype == tokenize.DEDENT:
        return "<dedent>"
    # Need to replace spaces with some other character because the ngram
    # processor splits on spaces.
    return tokenval.replace(" ", "§").replace("\n", "<newline>")