import tokenize
import os
import collections
from io import StringIO
import numpy as np
import sys
import itertools
import math

import astwalker
from normalisation import get_source_tree

pad_token, pad_id = "§PAD§", 0
oov_token, oov_id = "§OOV§", 1
indent_token = "§<indent>§"
dedent_token = "§<dedent>§"
number_token = "§NUM§"


class Container:
    def __init__(self, inputs, targets, actual_lengths, masks, identifier_usage):
        self.inputs = inputs
        self.targets = targets
        self.actual_lengths = actual_lengths
        self.num_sequences = len(inputs)
        self.masks = masks
        self.identifier_usage = identifier_usage


# Reads and tokenizes all Python files in the given path
def read_data(path, listfile, word_to_id=None, gen_def_positions=True):
    if isinstance(listfile, list):
        python_files = [os.path.join(path, f) for f in listfile]
    else:
        with open(listfile) as f:
            python_files = [os.path.join(path, x) for x in f.read().splitlines()]
    # Map tokens to vocabulary IDs if a vocabulary is given, otherwise keep the raw tokens
    mapping = (lambda x: x) if word_to_id is None else (lambda x: word_to_id.get(x, oov_id))
    data = []
    definition_positions = []
    identifier_usage = []
    for filename in python_files:
        try:
            source, tree = get_source_tree(filename)
            tokens = tokenize.generate_tokens(StringIO(source).readline)
            # Keep (token, start position) pairs, skipping comments, triple-quoted string
            # literals and empty tokens (DEDENT tokens have an empty value and are kept)
            data.append([(mapping(preprocess(tokenType, tokenVal)), start)
                         for tokenType, tokenVal, start, _, _ in tokens
                         if tokenType != tokenize.COMMENT
                         and not tokenVal.startswith("'''")
                         and not tokenVal.startswith('"""')
                         and (tokenType == tokenize.DEDENT or tokenVal != "")])
            if gen_def_positions:
                walker = astwalker.ASTWalker()
                walker.walk(tree)
                definition_positions.append(walker.definition_positions)
                identifier_usage.append(walker.name_usage)
        except Exception:
            print("Error when tokenizing %s: %s" % (filename, sys.exc_info()[0]))
    return data, definition_positions, identifier_usage


def preprocess(tokentype, tokenval):
    # Abstract numbers, indents and dedents into single placeholder tokens
    if tokentype == tokenize.NUMBER:
        return number_token
    elif tokentype == tokenize.INDENT:
        return indent_token
    elif tokentype == tokenize.DEDENT:
        return dedent_token
    return tokenval


def build_vocab(data, oov_threshold, force_include=None):
    force_include = force_include or []
    counter = collections.Counter(itertools.chain(itertools.chain(*data), force_include))
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])
    count_pairs = (p for p in count_pairs if p[1] > oov_threshold or p[0] in force_include)
    words, _ = list(zip(*count_pairs))
    # IDs 0 and 1 are reserved for the padding and out-of-vocabulary tokens
    word_to_id = dict(zip(words, range(2, len(words) + 2)))
    word_to_id[oov_token] = oov_id
    word_to_id[pad_token] = pad_id
    return word_to_id


def get_data(path, listfile, seq_length, word_to_id):
    all_data, def_positions, identifier_usage = read_data(path, listfile, word_to_id)
    identifier_types = astwalker.possible_types()
    num_masks = len(identifier_types)
    # Group the definition positions of each file by identifier type
    def_positions = [[[t[1] for t in fp if t[0] == k] for k in identifier_types]
                     for fp in def_positions]
    data = []
    # Prevent indent and dedent tokens from being flagged as variables, which can occur
    # because dedent in particular takes up no columns
    non_vars = [word_to_id[indent_token], word_to_id[dedent_token]]
    for j in range(len(all_data)):
        filedata = list(all_data[j])
        file_positions = def_positions[j]
        identifier_positions = identifier_usage[j]
        num_sequences = math.ceil(len(filedata) / seq_length)
        input_data = np.zeros([num_sequences, seq_length]).astype("int64")
        targets = np.zeros([num_sequences, seq_length]).astype("int64")
        masks_data = np.zeros([num_sequences, num_masks, seq_length]).astype("bool")
        id_usage_data = np.zeros([num_sequences, seq_length]).astype("bool")
        actual_lengths = []
        for i in range(num_sequences):
            # Inputs are the tokens of this sequence; targets are the same tokens shifted by one
            x = [t[0] for t in filedata[i * seq_length:(i + 1) * seq_length]]
            y = [t[0] for t in filedata[i * seq_length + 1:(i + 1) * seq_length + 1]]
            # One boolean per identifier type and token: True if the token at this position
            # introduces a definition of that type (indent/dedent tokens are never flagged)
            masks = [[t[1] in fp and t[0] not in non_vars for fp in file_positions]
                     for t in filedata[i * seq_length:(i + 1) * seq_length]]
            ids = [t[1] in identifier_positions
                   for t in filedata[i * seq_length + 1:(i + 1) * seq_length + 1]]
            actual_length_x = len(x)
            actual_length_y = len(y)
            input_data[i, :actual_length_x] = x
            targets[i, :actual_length_y] = y
            masks_data[i, :, :actual_length_x] = np.transpose(masks)
            id_usage_data[i, :actual_length_y] = ids
            actual_lengths.append(actual_length_y)
        container = Container(input_data, targets, actual_lengths, masks_data, id_usage_data)
        data.append(container)
    return data


def partition_data(data, num_partitions):
    # Distribute the per-file containers over num_partitions buckets of roughly equal
    # total sequence count
    total_sequences = sum(c.num_sequences for c in data)
    bucket_size = total_sequences // num_partitions
    data = sorted(data, key=lambda c: c.num_sequences)
    buckets = {}
    counter = 0
    for i in range(num_partitions):
        bucket = []
        assigned_sequences = 0
        while assigned_sequences < bucket_size and counter < len(data):
            bucket.append(data[counter])
            assigned_sequences += data[counter].num_sequences
            counter += 1
        buckets[i] = bucket
    return buckets


if __name__ == "__main__":
    path = "/Users/avishkar/pyRepos"
    listfile = "/Users/avishkar/pyRepos/train_files.txt"
    raw_data, _, _ = read_data(path, listfile)
    data_raw = [[t[0] for t in file_tokens] for file_tokens in raw_data]
    word_to_id = build_vocab(data_raw, 0)
    inv_map = {v: k for k, v in word_to_id.items()}
    data = get_data(path, listfile, 20, word_to_id)

    inputs = [inv_map[int(i)] for i in np.nditer(data[0].inputs)]
    # Collapse the per-type masks to one boolean per token so the flattened masks
    # line up with the flattened inputs
    masks = [bool(i) for i in np.nditer(data[0].masks.any(axis=1))]
    output = []
    for i, token in enumerate(inputs):
        output.append("§§" + token + "§§" if masks[i]
                      else token.replace("§<indent>§", "\t").replace("§<dedent>§", ""))
    print("Identifier introduced:")
    print(" ".join(output))

    targets = [inv_map[int(i)] for i in np.nditer(data[0].targets)]
    id_usages = [bool(i) for i in np.nditer(data[0].identifier_usage)]
    output2 = []
    for i, target in enumerate(targets):
        output2.append("§§" + target + "§§" if id_usages[i]
                       else target.replace("§<indent>§", "\t").replace("§<dedent>§", ""))
    print("Identifier Used:")
    print(" ".join(output2))