'''
The mapping of dispel4py workflows to Apache Spark. Run as follows::

    spark-submit --py-files=<path to dispel4py binary distribution egg>
        dispel4py/new/spark_process.py <graph module or full path>
        [-f <input file>] [-d <input data>] [-a <graph attribute>]
        [-i <iterations>]

The graph module must either be included in the Python package provided with
`--py-files` and specified as a module name, or, if a file name is given, the
file name must be absolute and the file must be available on each node of the
Spark cluster. For example::

    spark-submit --py-files=/path/to/dispel4py-1.0.1-py2.7.egg
        dispel4py/new/spark_process.py
        dispel4py.examples.graph_testing.pipeline_test -i 10

If the input JSON data contains the URI of a text file, the file is read as a
Spark text file input with one record per line.
'''
import argparse
import sys
import types

# Python 3 has no xrange; fall back to range.
if sys.version_info[0] >= 3:
    xrange = range


def simpleLogger(self, msg):
    print("%s: %s" % (self.id, msg))


class PEWrapper(object):
    '''
    Wraps a dispel4py PE so that it can be applied to a Spark RDD with
    flatMap: each input record is processed by the PE and everything the PE
    writes to its outputs is returned as a list of records.
    '''

    def __init__(self, pe):
        self.pe = pe
        self.pe.log = types.MethodType(simpleLogger, pe)
        self.pe.preprocess()

    def process(self, data):
        # self.pe.log('processing %s' % data)
        for output, desc in self.pe.outputconnections.items():
            desc['writer'] = SimpleWriter(output)
        result = self.pe.process(data)
        # self.pe.log('result: %s' % result)
        written = []
        if result is not None:
            written.append(result)
        for output, desc in self.pe.outputconnections.items():
            written.extend(desc['writer'].data)
        # self.pe.log('writing: %s' % written)
        return written


class SimpleWriter(object):
    '''Collects the data written to a named PE output.'''

    def __init__(self, output_name):
        self.output_name = output_name
        self.data = []

    def write(self, data):
        # record each data item under the name of this output
        self.data.append({self.output_name: data})


def parse_args():
    parser = argparse.ArgumentParser(
        description='Submit a dispel4py graph to Apache Spark.')
    parser.add_argument('module',
                        help='module that creates a dispel4py graph '
                             '(python module or file name)')
    parser.add_argument('-a', '--attr', metavar='attribute',
                        help='name of graph variable in the module')
    parser.add_argument('-f', '--file', metavar='inputfile',
                        help='file containing input dataset in JSON format')
    parser.add_argument('-d', '--data', metavar='inputdata',
                        help='input dataset in JSON format')
    parser.add_argument('-i', '--iter', metavar='iterations', type=int,
                        help='number of iterations', default=1)
    parser.add_argument('-sm', '--master', help='master URL for the cluster')
    parser.add_argument('-sn', '--name', help='name of the Spark process')
    parser.add_argument(
        '-sd', '--deploy-mode', choices=['cluster', 'client'],
        help='deploy driver on worker nodes or locally as external client')
    result = parser.parse_args()
    return result


class Projection(object):
    '''Keeps only the named outputs of a record.'''

    def __init__(self, outputs):
        self.outputs = outputs

    def project(self, data):
        result = {}
        for o in self.outputs:
            if o in data:
                result[o] = data[o]
        if result:
            return [result]
        else:
            return []


class Rename(object):
    '''Renames record keys from output names to the connected input names.'''

    def __init__(self, mapping):
        self.mapping = mapping

    def rename(self, data):
        # print('renaming data: %s, mapping is: %s' % (data, self.mapping))
        result = {}
        for o, i in self.mapping.items():
            if o in data:
                result[i] = data[o]
        if result:
            return [result]
        else:
            return []


def process(sc, workflow, inputs, args):
    from dispel4py.new.processor import \
        assign_and_connect, _order_by_dependency
    graph = workflow.graph
    result = assign_and_connect(workflow, graph.number_of_nodes())
    if result is None:
        return
    processes, inputmappings, outputmappings = result
    process_to_pes = {}
    wrappers = {}
    for node in workflow.graph.nodes():
        pe = node.getContainedObject()
        wrapper = PEWrapper(pe)
        for p in processes[pe.id]:
            process_to_pes[p] = pe
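            # each process number assigned to this PE maps to the same PE
            # and wrapper instance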
            wrappers[p] = wrapper
    # print('Processes: %s' % processes)
    # print(inputmappings)
    # print(outputmappings)
    ordered = _order_by_dependency(inputmappings, outputmappings)
    # print('Ordered processes: %s' % ordered)
    output_rdd = {}
    result_rdd = {}
    for proc in ordered:
        inps = inputmappings[proc]
        outs = outputmappings[proc]
        wrapper = wrappers[proc]
        pe = process_to_pes[proc]
        if inps:
            # the PE has inputs: process the RDDs produced by its sources
            if len(inps) == 1:
                for input_name, sources in inps.items():
                    inp_rdd = output_rdd[(sources[0], input_name)]
                out_rdd = inp_rdd.flatMap(wrapper.process)
            else:
                # several inputs: union all source RDDs before processing
                prev = None
                for input_name, sources in inps.items():
                    inp_rdd = output_rdd[(sources[0], input_name)]
                    if prev:
                        inp_rdd = prev.union(inp_rdd)
                    prev = inp_rdd
                out_rdd = prev.flatMap(wrapper.process)
            if len(outs) == 1:
                for output_name in outs:
                    for inp in outs[output_name]:
                        input_name = inp[0]
                        rename = Rename({output_name: input_name})
                        output_rdd[(proc, input_name)] = \
                            out_rdd.flatMap(rename.rename)
            else:
                for output_name in outs:
                    proj = Projection([output_name])
                    proj_rdd = out_rdd.flatMap(proj.project)
                    for inp in outs[output_name]:
                        rename = Rename({output_name: inp[0]})
                        output_rdd[(proc, inp[0])] = \
                            proj_rdd.flatMap(rename.rename)
            if not outs:
                result_rdd[proc] = out_rdd
        else:
            # the PE is a root of the graph: build the initial RDD from the
            # provided inputs
            pe_input = inputs[pe.id]
            if isinstance(pe_input, list):
                # only one partition, so there are no repetitions - not ideal
                start_rdd = sc.parallelize(pe_input, 1)
            elif isinstance(pe_input, int):
                start_rdd = sc.parallelize(xrange(pe_input), 1)
            else:
                # assume it is the URI of a text file that Spark can read
                start_rdd = sc.textFile(pe_input)
            out_rdd = start_rdd.flatMap(wrapper.process)
            if len(outs) == 1:
                for output_name in outs:
                    for inp in outs[output_name]:
                        input_name = inp[0]
                        rename = Rename({output_name: input_name})
                        output_rdd[(proc, input_name)] = \
                            out_rdd.flatMap(rename.rename)
            else:
                for output_name in outs:
                    proj = Projection([output_name])
                    proj_rdd = out_rdd.flatMap(proj.project).persist()
                    for inp in outs[output_name]:
                        input_name = inp[0]
                        rename = Rename({output_name: input_name})
                        output_rdd[(proc, input_name)] = \
                            proj_rdd.flatMap(rename.rename)
            if not outs:
                result_rdd[proc] = out_rdd
    # print("RESULT PROCESSES: %s" % result_rdd.keys())
    for p in result_rdd:
        # force evaluation of the terminal RDDs; foreach returns None
        result_rdd[p].foreach(lambda x: None)
        # result_rdd[p].foreach(lambda x: simpleLogger(process_to_pes[p], x))


def run():
    '''
    Creates the Spark context, loads the dispel4py graph and maps it to
    Spark RDDs. This is what executes when spark-submit runs this module.
    '''
    from pyspark import SparkContext, SparkConf
    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set('spark.storage.memoryFraction', '0.5')
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()
    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()
    inputs = processor.create_inputs(args, graph)
    process(sc, graph, inputs=inputs, args=args)


def main():
    '''
    Builds and runs a spark-submit command for this module. When spark-submit
    executes this file, the __main__ block below calls run().
    '''
    import os
    import subprocess
    import sys
    try:
        spark_home = os.environ['SPARK_HOME']
    except KeyError:
        print('Please set SPARK_HOME.')
        sys.exit(1)
    parser = argparse.ArgumentParser(
        description='Submit a dispel4py graph for processing.')
    parser.add_argument('target', help='target execution platform')
    # the 'target' argument is consumed here; all remaining arguments are
    # passed through to the Spark job
    args, remaining = parser.parse_known_args()
    this_path = os.path.abspath(__file__)
    if this_path.endswith('pyc'):
        this_path = this_path[:-1]
    command = ['%s/bin/spark-submit' % spark_home,
               '--py-files=dist/dispel4py-1.0.1-py2.7.egg',
               this_path] + remaining
    print(command)
    subprocess.call(command)


if __name__ == '__main__':
    run()
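
# Illustrative sketch (not part of dispel4py; the class and variable names
# below are hypothetical): a minimal graph module of the kind this script
# submits. Any module that builds a WorkflowGraph and exposes it as a
# module-level variable (selected with -a if necessary) can be passed to
# spark-submit as shown in the module docstring. It might then be run with
# input data such as -d '[{"input": 1}, {"input": 2}]', assuming
# create_inputs routes the JSON list to the root PE of the graph.
#
#     from dispel4py.core import GenericPE
#     from dispel4py.workflow_graph import WorkflowGraph
#
#     class Doubler(GenericPE):
#         def __init__(self):
#             GenericPE.__init__(self)
#             self._add_input('input')
#             self._add_output('output')
#
#         def process(self, inputs):
#             # returning a dict emits one record on the named output
#             return {'output': inputs['input'] * 2}
#
#     class Printer(GenericPE):
#         def __init__(self):
#             GenericPE.__init__(self)
#             self._add_input('input')
#
#         def process(self, inputs):
#             self.log(inputs['input'])
#
#     graph = WorkflowGraph()
#     doubler, printer = Doubler(), Printer()
#     graph.connect(doubler, 'output', printer, 'input')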