## Spark Application for performing SGD regression on wines.

import csv

from numpy import array
from StringIO import StringIO

from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

# Load and parse the data
def parsePoint(line):
    values = csv.reader(StringIO(line), delimiter=";").next() # CSV parsing of line
    values = [float(x) for x in values]                       # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])              # y = quality, X = row[:-1]

if __name__ == '__main__':
    conf = SparkConf().setAppName("Wine Regression")
    sc   = SparkContext(conf=conf)

    wines = sc.textFile("fixtures/winequality/wines.csv")
    parsedData = wines.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).take(5).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))