python source code of stockpredictor

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  2 07:36:10 2016
@author: Nicholas Smith
"""
# Used for numpy arrays
import numpy as np
# Used to read data from CSV file
import pandas as pd
# Used to convert date string to numerical value
from datetime import datetime, timedelta
# Used to plot data
import matplotlib.pyplot as mpl
# Used to scale data
from sklearn.preprocessing import StandardScaler
# Used to perform CV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_val_score

# Gives a list of timestamps from the start date to the end date
#
# startDate:     The start date as a string xxxx-xx-xx
# endDate:       The end date as a string year-month-day
# period:		 'minute', 'daily', 'weekly', or 'monthly'
# weekends:      True if weekends should be included; false otherwise
# return:        A numpy array of timestamps
def DateRange(startDate, endDate, period, weekends=True):
    # The start and end date
    sd = datetime.strptime(startDate, '%Y-%m-%d %H:%M:%S')
    ed = datetime.strptime(endDate, '%Y-%m-%d %H:%M:%S')
    # Invalid start and end dates
    if (sd > ed):
        raise ValueError("The start date cannot be later than the end date.")
    # One time period is a day
    if (period == 'minute'):
        prd = timedelta(minutes=1)
    if (period == 'daily'):
        prd = timedelta(1)
    # One prediction per week
    if (period == 'weekly'):
        prd = timedelta(7)
    # one prediction every 30 days ("month")
    if (period == 'monthly'):
        prd = timedelta(30)
    # The final list of timestamp data
    dates = []
    cd = sd
    while (cd <= ed):
        # If weekdays are included or it's a weekday append the current ts
        if (weekends or (cd.date().weekday() != 5 and cd.date().weekday() != 6)):
            dates.append(cd.timestamp())
        # Onto the next period
        cd = cd + prd
    # print(np.array(dates))
    return np.array(dates)


# Given a date, returns the previous day
#
# startDate:     The start date as a datetime object
# weekends:      True if weekends should counted; false otherwise
def DatePrevDay(startDate, weekends=True):
    # One day
    day = timedelta(minutes=1)
    cd = datetime.fromtimestamp(startDate)
    while (True):
        cd = cd - day
        if (weekends or (cd.date().weekday() != 5 and cd.date().weekday() != 6)):
            return cd.timestamp()
    # Should never happen
    return None


# Load data from the CSV file. Note: Some systems are unable
# to give timestamps for dates before 1970. This function may
# fail on such systems.
#
# path:      The path to the file
# return:    A data frame with the parsed timestamps
def ParseData(path):
    # Read the csv file into a dataframe
    df = pd.read_csv(path)
    # Get the date strings from the date column
    dateStr = df['Date'].values
    D = np.zeros(dateStr.shape)
    # Convert all date strings to a numeric value
    for i, j in enumerate(dateStr):
        # Date strings are of the form year-month-day
        D[i] = datetime.strptime(j, '%Y-%m-%d %H:%M:%S%z').replace(tzinfo=None).timestamp()
    # Add the newly parsed column to the dataframe
    df['Timestamp'] = D
    # Remove any unused columns (axis = 1 specifies fields are columns)
    return df.drop('Date', axis=1)


# Given dataframe from ParseData
# plot it to the screen
#
# df:        Dataframe returned from
# p:         The position of the predicted data points
def PlotData(df, p=None):
    if (p is None):
        p = np.array([])
    # Timestamp data
    ts = df.Timestamp.values
    # Number of x tick marks
    nTicks = 10
    # Left most x value
    s = np.min(ts)
    # Right most x value
    e = np.max(ts)
    # Total range of x values
    r = e - s
    # Add some buffer on both sides
    s -= r / 5
    e += r / 5
    # These will be the tick locations on the x axis
    tickMarks = np.arange(s, e, (e - s) / nTicks)
    # Convert timestamps to strings
    strTs = [datetime.fromtimestamp(i).strftime('%Y-%m-%d %H:%M:%S') for i in tickMarks]
    mpl.figure()
    # Plots of the high and low values for the day
    mpl.plot(ts, df.High.values, color='#727272', linewidth=1.618, label='Actual')
    # Predicted data was also provided
    if (len(p) > 0):
        mpl.plot(ts[p], df.High.values[p], color='#7294AA', linewidth=1.618, label='Predicted')
    # Set the tick marks
    mpl.xticks(tickMarks, strTs, rotation='vertical')
    # Set y-axis label
    mpl.ylabel('Stock High Value (USD)')
    # Add the label in the upper left
    mpl.legend(loc='upper left')
    mpl.show()


# A class that predicts stock prices based on historical stock data
class StockPredictor:
    # The (scaled) data frame
    D = None
    # Unscaled timestamp data
    DTS = None
    # The data matrix
    A = None
    # Target value matrix
    y = None
    # Corresponding columns for target values
    targCols = None
    # Number of previous days of data to use
    npd = 1
    # The regressor model
    R = None
    # Object to scale input data
    S = None

    # Constructor
    # nPrevDays:     The number of past days to include
    #               in a sample.
    # rmodel:        The regressor model to use (sklearn)
    # nPastDays:     The number of past days in each feature
    # scaler:        The scaler object used to scale the data (sklearn)
    def __init__(self, rmodel, nPastDays=1, scaler=StandardScaler()):
        self.npd = nPastDays
        self.R = rmodel
        self.S = scaler

    # Extracts features from stock market data
    #
    # D:         A dataframe from ParseData
    # ret:       The data matrix of samples
    def _ExtractFeat(self, D):
        # One row per day of stock data
        m = D.shape[0]
        # Open, High, Low, and Close for past n days + timestamp and volume
        n = self._GetNumFeatures()
        B = np.zeros([m, n])
        # Preserve order of spreadsheet
        for i in range(m - 1, -1, -1):
            self._GetSample(B[i], i, D)
        # Return the internal numpy array
        return B

    # Extracts the target values from stock market data
    #
    # D:         A dataframe from ParseData
    # ret:       The data matrix of targets and the

    def _ExtractTarg(self, D):
        # Timestamp column is not predicted
        tmp = D.drop('Timestamp', axis=1)
        # Return the internal numpy array
        return tmp.values, tmp.columns

    # Get the number of features in the data matrix
    #
    # n:         The number of previous days to include
    #           self.npd is  used if n is None
    # ret:       The number of features in the data matrix
    def _GetNumFeatures(self, n=None):
        if (n is None):
            n = self.npd
        return n * 7 + 1

    # Get the sample for a specific row in the dataframe.
    # A sample consists of the current timestamp and the data from
    # the past n rows of the dataframe
    #
    # r:         The array to fill with data
    # i:         The index of the row for which to build a sample
    # df:        The dataframe to use
    # return;    r
    def _GetSample(self, r, i, df):
        # First value is the timestamp
        r[0] = df['Timestamp'].values[i]
        # The number of columns in df
        n = df.shape[1]
        # The last valid index
        lim = df.shape[0]
        # Each sample contains the past n days of stock data; for non-existing data
        # repeat last available sample
        # Format of row:
        # Timestamp Volume Open[i] High[i] ... Open[i-1] High[i-1]... etc
        for j in range(0, self.npd):
            # Subsequent rows contain older data in the spreadsheet
            ind = i + j + 1
            # If there is no older data, duplicate the oldest available values
            if (ind >= lim):
                ind = lim - 1
            # Add all columns from row[ind]
            for k, c in enumerate(df.columns):
                # + 1 is needed as timestamp is at index 0
                r[k + 1 + n * j] = df[c].values[ind]
        return r

    # Attempts to learn the stock market data
    # given a dataframe taken from ParseData
    #
    # D:         A dataframe from ParseData
    def Learn(self, D):
        # Keep track of the currently learned data
        self.D = D.copy()
        # Keep track of old timestamps for indexing
        self.DTS = np.copy(D.Timestamp.values)
        # Scale the data
        self.D[self.D.columns] = self.S.fit_transform(self.D)
        # Get features from the data frame
        self.A = self._ExtractFeat(self.D)
        # Get the target values and their corresponding column names
        self.y, self.targCols = self._ExtractTarg(self.D)
        # Create the regressor model and fit it
        self.R.fit(self.A, self.y)

    # Predicts values for each row of the dataframe. Can be used to
    # estimate performance of the model
    #
    # df:            The dataframe for which to make prediction
    # return:        A dataframe containing the predictions
    def PredictDF(self, df):
        # Make a local copy to prevent modifying df
        D = df.copy()
        # Scale the input data like the training data
        D[D.columns] = self.S.transform()
        # Get features
        A = self._ExtractFeat(D)
        # Construct a dataframe to contain the predictions
        # Column order was saved earlier
        P = pd.DataFrame(index=range(A.shape[0]), columns=self.targCols)
        # Perform prediction
        P[P.columns] = self.R.predict(A)
        # Add the timestamp (already scaled from above)
        P['Timestamp'] = D['Timestamp'].values
        # Scale the data back to original range
        P[P.columns] = self.S.inverse_transform(P)
        return P

    # Predict the stock price during a specified time
    #
    # startDate:     The start date as a string in yyyy-mm-dd format
    # endDate:       The end date as a string yyyy-mm-dd format
    # period:		'daily', 'weekly', or 'monthly' for the time period
    #				between predictions
    # return:        A dataframe containing the predictions or
    def PredictDate(self, startDate, endDate, period='minute'):
        # Create the range of timestamps and reverse them
        ts = DateRange(startDate, endDate, period)[::-1]
        m = ts.shape[0]
        # Prediction is based on data prior to start date
        # Get timestamp of previous day
        prevts = DatePrevDay(ts[-1])
        # Test if there is enough data to continue
        try:
            ind = np.where(self.DTS == prevts)[0][0]
        except IndexError:
            return None
        # There is enough data to perform prediction; allocate new data frame
        P = pd.DataFrame(np.zeros([m, self.D.shape[1]]), index=range(m), columns=self.D.columns)
        # Add in the timestamp column so that it can be scaled properly
        P['Timestamp'] = ts
        # Scale the timestamp (other fields are 0)
        P[P.columns] = self.S.transform(P)
        # B is to be the data matrix of features
        B = np.zeros([1, self._GetNumFeatures()])
        # Add extra last entries for past existing data
        for i in range(self.npd):
            # If the current index does not exist, repeat the last valid data
            curInd = ind + i
            if (curInd >= self.D.shape[0]):
                curInd = curInd - 1
            # Copy over the past data (already scaled)
            P.loc[m + i] = self.D.loc[curInd]
        # Loop until end date is reached
        for i in range(m - 1, -1, -1):
            # Create one sample
            self._GetSample(B[0], i, P)
            # Predict the row of the dataframe and save it
            pred = self.R.predict(B).ravel()
            # Fill in the remaining fields into the respective columns
            for j, k in zip(self.targCols, pred):
                P.at[i, j] = k
        # Discard extra rows needed for prediction
        P = P[0:m]
        # Scale the dataframe back to the original range
        P[P.columns] = self.S.inverse_transform(P)
        # print(P)
        return P

    # Test the predictors performance and
    # displays results to the screen
    #
    # D:             The dataframe for which to make prediction
    def TestPerformance(self, df=None):
        # If no dataframe is provided, use the currently learned one
        if (df is None):
            D = self.D
        else:
            D = self.S.transform(df.copy())
        # Get features from the data frame
        A = self._ExtractFeat(D)
        # Get the target values and their corresponding column names
        y, _ = self._ExtractTarg(D)
        # Begin cross validation
        ss = ShuffleSplit(n_splits=1)
        for trn, tst in ss.split(A):
            s1 = cross_val_score(self.R, A, y, cv=3, scoring=make_scorer(r2_score))
            s2 = cross_val_score(self.R, A[tst], y[tst], cv=3, scoring=make_scorer(r2_score))
            s3 = cross_val_score(self.R, A[trn], y[trn], cv=3, scoring=make_scorer(r2_score))
            print('C-V:\t' + str(s1) + '\nTst:\t' + str(s2) + '\nTrn:\t' + str(s3))