# -*- coding: utf-8 -*-

# Import necessary modules
import numpy as np
import numpy.linalg as npla
import hoggorm.statTools as st
import hoggorm.cross_val as cv


class nipalsPCA:
    """
    This class carries out Principal Component Analysis using the NIPALS
    algorithm.

    PARAMETERS
    ----------
    arrX : numpy array
        A numpy array containing the data
    numComp : int, optional
        An integer that defines how many components are to be computed
    Xstand : boolean, optional
        Defines whether variables in ``arrX`` are to be standardised/scaled
        or centered

        False : columns of ``arrX`` are mean centred (default)
            ``Xstand = False``

        True : columns of ``arrX`` are mean centred and divided by their own
            standard deviation
            ``Xstand = True``
    cvType : list, optional
        The list defines cross validation settings when computing the PCA
        model. Note if `cvType` is not provided, cross validation will not be
        performed and as such cross validation results will not be available.
        Choose cross validation type from the following:

        loo : leave one out / a.k.a. full cross validation (default)
            ``cvType = ["loo"]``

        KFold : leave out one fold or segment
            ``cvType = ["KFold", numFolds]``

            numFolds: int

            Number of folds or segments

        lolo : leave one label out
            ``cvType = ["lolo", labelsList]``

            labelsList: list

            Sequence of labels. Must be same length as number of rows in
            ``arrX``. Leaves out objects with same label.

    RETURNS
    -------
    class
        A class that contains the PCA model and computational results

    EXAMPLES
    --------
    First import the hoggorm package.

    >>> import hoggorm as ho

    Import your data into a numpy array.

    >>> myData
    array([[ 5.7291665,  3.416667 ,  3.175    ,  2.6166668,  6.2208333],
           [ 6.0749993,  2.7416666,  3.6333339,  3.3833334,  6.1708336],
           [ 6.1166663,  3.4916666,  3.5208333,  2.7125003,  6.1625004],
           ...,
           [ 6.3333335,  2.3166668,  4.1249995,  4.3541665,  6.7500005],
           [ 5.8250003,  4.8291669,  1.4958333,  1.0958334,  6.0999999],
           [ 5.6499996,  4.6624999,  1.9291668,  1.0749999,  6.0249996]])
    >>> np.shape(myData)
    (14, 5)

    Examples of how to compute a PCA model using different settings for the
    input parameters.

    >>> model = ho.nipalsPCA(arrX=myData, numComp=5, Xstand=False)
    >>> model = ho.nipalsPCA(arrX=myData)
    >>> model = ho.nipalsPCA(arrX=myData, numComp=3)
    >>> model = ho.nipalsPCA(arrX=myData, Xstand=True)
    >>> model = ho.nipalsPCA(arrX=myData, cvType=["loo"])
    >>> model = ho.nipalsPCA(arrX=myData, cvType=["KFold", 4])
    >>> model = ho.nipalsPCA(arrX=myData, cvType=["lolo", [1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7]])

    Examples of how to extract results from the PCA model.

    >>> scores = model.X_scores()
    >>> loadings = model.X_loadings()
    >>> cumulativeCalibratedExplainedVariance_allVariables = model.X_cumCalExplVar_indVar()
    """

    def __init__(self, arrX, numComp=None, Xstand=False, cvType=None):
        """
        On initialisation check how arrX is to be pre-processed (Xstand is
        either True or False). Then check whether the number of components
        chosen by the user is OK, fit the model and, if requested through
        ``cvType``, cross validate it.

        Raises ValueError if ``cvType[0]`` is not one of "loo", "KFold" or
        "lolo", or if the NIPALS iteration cannot proceed (see
        ``_nipalsComponent``).
        """
        # =====================================================================
        # Check what is provided by user
        # =====================================================================

        # Keep the raw input such that it can be accessed from all methods.
        self.arrX_input = arrX

        # If cvType is NOT provided, no cross validation is carried out.
        self.cvType = cvType

        # Define maximum number of components to compute depending on whether
        # cross validation was selected or not.
        if self.cvType is None:
            maxNumPC = min(np.shape(self.arrX_input))
        else:
            # The number of components that are possible to compute depends
            # on the size of the data set (#rows, #cols) and on the size of
            # the largest cross validation segment.
            numObj = np.shape(self.arrX_input)[0]
            segSizes = [numObj - sum(train_index)
                        for train_index, test_index
                        in self._cvSegments(numObj)]

            # Compute the max number of components based on only object size
            maxN = numObj - max(segSizes) - 1

            # Choose whatever is smaller, number of variables or maxN
            maxNumPC = min(np.shape(arrX)[1], maxN)

        # Now set the number of components that is possible to compute.
        if numComp is None or numComp > maxNumPC:
            self.numPC = maxNumPC
        else:
            self.numPC = numComp

        # Pre-process data according to user request.
        # -------------------------------------------
        # X is always mean centred; it is additionally scaled to unit
        # standard deviation when Xstand is True.
        self.Xstand = Xstand
        self.Xmeans = np.average(self.arrX_input, axis=0)
        if self.Xstand:
            self.Xstd = np.std(self.arrX_input, axis=0, ddof=1)
            self.arrX = (self.arrX_input - self.Xmeans) / self.Xstd
        else:
            self.arrX = self.arrX_input - self.Xmeans

        self._fitCalibration()
        self._computeCalibrationStats()

        if self.cvType is not None:
            self._crossValidate()
            self._computeValidationStats()

    def _cvSegments(self, numObj, announce=False):
        """
        Returns the iterable of (train_index, test_index) pairs for the cross
        validation type stored in ``self.cvType``. When ``announce`` is True
        the name of the cross validation type is printed.

        Raises ValueError for an unknown cross validation type.
        """
        cvName = self.cvType[0]
        if cvName == "loo":
            cvComb = cv.LeaveOneOut(numObj)
        elif cvName == "KFold":
            cvComb = cv.KFold(numObj, k=self.cvType[1])
        elif cvName == "lolo":
            cvComb = cv.LeaveOneLabelOut(self.cvType[1])
        else:
            # Fail here with a clear message instead of failing later on an
            # undefined variable.
            raise ValueError(
                'Requested form of cross validation is not available')

        if announce:
            print(cvName)
        return cvComb

    @staticmethod
    def _nipalsComponent(X_res, threshold=1.0e-8):
        """
        Runs the NIPALS iteration on the (residual) array ``X_res`` and
        returns the converged score vector t and loading vector p of one
        component, both as column vectors.

        Raises ValueError when the iteration cannot proceed, i.e. when X't
        has zero or non-finite norm. Without this check the loop would never
        terminate, since the convergence criterion is never met with nan's.
        """
        # If the first column contains only zeros it cannot serve as start
        # vector. Rather put in other (centred) starting values.
        if not np.any(X_res[:, 0]):
            X_repl_nonCent = np.arange(np.shape(X_res)[0])
            X_repl = X_repl_nonCent - np.mean(X_repl_nonCent)
            t = X_repl.reshape(-1, 1)
        else:
            t = X_res[:, 0].reshape(-1, 1)

        # Iterate until score vector converges according to threshold
        while True:
            num = np.dot(np.transpose(X_res), t)
            denom = npla.norm(num)
            if denom == 0 or not np.isfinite(denom):
                raise ValueError(
                    'NIPALS cannot extract another component: the data '
                    'hold no variation along the start vector or contain '
                    'non-finite values')
            p = num / denom
            t_new = np.dot(X_res, p)
            SS = np.sum(np.square(t - t_new))
            t = t_new

            # Stop when the sum of squared changes in t is below threshold.
            if SS < threshold:
                return t, p

    def _fitCalibration(self):
        """
        Computes scores, loadings, residuals and single component Xhat for
        ``self.numPC`` components from the pre-processed array ``self.arrX``.
        """
        # Initiate lists and dictionaries in which results will be stored.
        self.X_scoresList = []
        self.X_loadingsList = []
        self.X_loadingsWeightsList = []
        self.coeffList = []
        self.X_residualsList = [self.arrX]

        # Collect residual matrices/arrays after each computed component
        self.resids = {}
        self.X_residualsDict = {}

        # Collect predicted matrices/array Xhat after each computed component
        self.calXhatDict_singPC = {}

        # Collect explained variance in each component
        self.calExplainedVariancesDict = {}
        self.X_calExplainedVariancesList = []

        # =====================================================================
        # Here the NIPALS PCA algorithm on X starts
        # =====================================================================
        threshold = 1.0e-8
        X_new = self.arrX.copy()

        # Compute number of principal components as specified by user
        for j in range(self.numPC):
            t, p = self._nipalsComponent(X_new, threshold)
            self.X_scoresList.append(t)
            self.X_loadingsList.append(p)

            # Peel off information explained by actual component and continue
            # with decomposition on the residuals (X_new = E).
            Xhat_j = np.dot(t, np.transpose(p))
            X_new = X_new - Xhat_j

            # Store residuals E and Xhat (on the scale of the input data) in
            # their dictionaries
            self.X_residualsDict[j + 1] = X_new
            if self.Xstand:
                self.calXhatDict_singPC[j + 1] = \
                    (Xhat_j * self.Xstd) + self.Xmeans
            else:
                self.calXhatDict_singPC[j + 1] = Xhat_j + self.Xmeans

        # Collect scores and loadings of all components in arrays.
        self.arrT = np.hstack(self.X_scoresList)
        self.arrP = np.hstack(self.X_loadingsList)

    def _computeCalibrationStats(self):
        """
        Computes Xhat, PRESSE, MSEE, RMSEE and CALIBRATED explained variances
        for zero up to ``self.numPC`` components.
        """
        numObj = np.shape(self.arrX_input)[0]
        numVar = np.shape(self.arrX_input)[1]

        # ---------------------------------------------------------------------
        # Create a list holding arrays of Xhat predicted calibration after
        # each component. Xhat is computed cumulatively with Xhat = T*P'
        self.calXpredList = []
        for ind in range(1, self.numPC + 1):
            predXcal = np.dot(self.arrT[:, 0:ind],
                              np.transpose(self.arrP[:, 0:ind]))
            if self.Xstand:
                Xhat = (predXcal * self.Xstd) + self.Xmeans
            else:
                Xhat = predXcal + self.Xmeans
            self.calXpredList.append(Xhat)

        # ---------------------------------------------------------------------
        # Collect all PRESSE for individual variables in a dictionary.
        # Keys represent number of component (key 0: zero components).
        self.PRESSEdict_indVar_X = {
            0: np.sum(np.square(st.center(self.arrX_input)), axis=0)}
        for ind, Xhat in enumerate(self.calXpredList):
            diffX = self.arrX_input - Xhat
            self.PRESSEdict_indVar_X[ind + 1] = np.sum(np.square(diffX),
                                                       axis=0)

        # Now store all PRESSE values into an array (rows in explicit order
        # of number of components). Then compute MSEE and RMSEE.
        self.PRESSEarr_indVar_X = np.array(
            [self.PRESSEdict_indVar_X[key]
             for key in range(len(self.PRESSEdict_indVar_X))])
        self.MSEEarr_indVar_X = self.PRESSEarr_indVar_X / numObj
        self.RMSEEarr_indVar_X = np.sqrt(self.MSEEarr_indVar_X)

        # ---------------------------------------------------------------------
        # Compute explained variance for each variable in X using the
        # MSEE for each variable. Also collect PRESSE, MSEE, RMSEE in
        # their respective dictionaries for each variable. Keys represent
        # now variables and NOT components as above with
        # self.PRESSEdict_indVar_X
        MSEE_0_indVar_X = self.MSEEarr_indVar_X[0, :]
        self.cumCalExplVarXarr_indVar = \
            (MSEE_0_indVar_X - self.MSEEarr_indVar_X) / MSEE_0_indVar_X * 100

        self.PRESSE_indVar_X = {}
        self.MSEE_indVar_X = {}
        self.RMSEE_indVar_X = {}
        self.cumCalExplVarX_indVar = {}
        for ind in range(np.shape(self.PRESSEarr_indVar_X)[1]):
            self.PRESSE_indVar_X[ind] = self.PRESSEarr_indVar_X[:, ind]
            self.MSEE_indVar_X[ind] = self.MSEEarr_indVar_X[:, ind]
            self.RMSEE_indVar_X[ind] = self.RMSEEarr_indVar_X[:, ind]
            self.cumCalExplVarX_indVar[ind] = \
                self.cumCalExplVarXarr_indVar[:, ind]

        # ---------------------------------------------------------------------
        # Collect total PRESSE across all variables in a dictionary.
        self.PRESSE_total_list_X = np.sum(self.PRESSEarr_indVar_X, axis=1)
        self.PRESSE_total_dict_X = dict(enumerate(self.PRESSE_total_list_X))

        # ---------------------------------------------------------------------
        # Collect total MSEE across all variables in a dictionary. The
        # dictionary is filled regardless of Xstand, like the PRESSE and
        # RMSEE dictionaries.
        self.MSEE_total_list_X = \
            np.sum(self.MSEEarr_indVar_X, axis=1) / numVar
        self.MSEE_total_dict_X = dict(enumerate(self.MSEE_total_list_X))
        MSEE_0_X = self.MSEE_total_list_X[0]

        # Compute total cumulated calibrated explained variance in X. With
        # standardised data it is the average over the individual variables.
        if not self.Xstand:
            self.XcumCalExplVarList = [
                (MSEE_0_X - MSEE_X) / MSEE_0_X * 100
                for MSEE_X in self.MSEE_total_list_X]
        else:
            self.XcumCalExplVarArr = np.average(
                self.cumCalExplVarXarr_indVar, axis=1)
            self.XcumCalExplVarList = list(self.XcumCalExplVarArr)

        # Construct list with total explained variance in X for each
        # component (difference between consecutive cumulative values).
        self.XcalExplVarList = [
            after - before
            for before, after in zip(self.XcumCalExplVarList,
                                     self.XcumCalExplVarList[1:])]

        # Construct a dictionary that holds predicted X (Xhat) from
        # calibration for each number of components.
        self.calXpredDict = {ind + 1: item
                             for ind, item in enumerate(self.calXpredList)}

        # ---------------------------------------------------------------------
        # Compute total RMSEE and store values in a dictionary and list.
        self.RMSEE_total_list_X = np.sqrt(self.MSEE_total_list_X)
        self.RMSEE_total_dict_X = dict(enumerate(self.RMSEE_total_list_X))

    def _crossValidate(self):
        """
        Fits a PCA model on each cross validation training set and predicts
        the left out objects for one up to ``self.numPC`` components.
        """
        numObj = np.shape(self.arrX)[0]
        cvComb = self._cvSegments(numObj, announce=True)

        # Collect predicted x (i.e. xhat) for each CV segment in a
        # dictionary according to number of component
        self.valXpredDict = {}
        for ind in range(1, self.numPC + 1):
            self.valXpredDict[ind] = np.zeros(np.shape(self.arrX_input))

        # Collect validation X scores T and validation X loadings P of each
        # CV segment in lists
        self.val_arrTlist = []
        self.val_arrPlist = []
        self.val_arrQlist = []

        # Collect train and test set of each CV segment in a list
        self.cvTrainAndTestDataList = []
        self.X_train_means_arr = np.zeros(np.shape(self.arrX_input))

        threshold = 1.0e-8

        # First divide into combinations of training and test sets
        for train_index, test_index in cvComb:
            X_train, X_test = cv.split(train_index, test_index,
                                       self.arrX_input)

            self.cvTrainAndTestDataList.append({'x train': X_train,
                                                'x test': X_test})

            # -------------------------------------------------------------
            # Center or standardise X according to users choice. The test
            # set is always processed with mean (and STD) of the training
            # set.
            X_train_mean = np.average(X_train, axis=0).reshape(1, -1)
            if self.Xstand:
                X_train_std = np.std(X_train, axis=0,
                                     ddof=1).reshape(1, -1)
                X_train_proc = (X_train - X_train_mean) / X_train_std
                X_test_proc = (X_test - X_train_mean) / X_train_std
            else:
                X_train_proc = X_train - X_train_mean
                X_test_proc = X_test - X_train_mean
            # -------------------------------------------------------------
            self.X_train_means_arr[test_index, ] = X_train_mean

            # Here the NIPALS PCA algorithm starts
            # ------------------------------------
            X_new = X_train_proc.copy()

            # Collect scores and loadings in lists that will be later
            # converted to arrays.
            scoresList = []
            loadingsList = []

            # Compute number of principal components as specified by user
            for j in range(self.numPC):
                t, p = self._nipalsComponent(X_new, threshold)
                scoresList.append(t)
                loadingsList.append(p)

                # Peel off information explained by actual component and
                # continue with decomposition on the residuals.
                X_new = X_new - np.dot(t, np.transpose(p))

            # Collect X scores and X loadings of this CV segment.
            valT = np.hstack(scoresList)
            valP = np.hstack(loadingsList)
            self.val_arrTlist.append(valT)
            self.val_arrPlist.append(valP)

            # Compute the scores for the left out object(s)
            projT = np.dot(X_test_proc, valP)
            dims = np.shape(projT)[1]

            # Construct validated predicted X first for one component,
            # then two, three, etc
            for ind in range(0, dims):
                part_projT = projT[:, 0:ind + 1]
                part_valP = valP[:, 0:ind + 1]
                valPredX_proc = np.dot(part_projT, np.transpose(part_valP))

                # Depending on preprocessing re-process in same manner
                # in order to get values that compare to original values.
                if self.Xstand:
                    valPredX = (valPredX_proc * X_train_std) + X_train_mean
                else:
                    valPredX = valPredX_proc + X_train_mean

                self.valXpredDict[ind + 1][test_index, :] = valPredX

        # Put all predictions into a list ordered by number of components.
        # A real list (not a dictionary view) is stored such that it can be
        # indexed.
        self.valXpredList = [self.valXpredDict[key]
                             for key in range(1, self.numPC + 1)]

    def _computeValidationStats(self):
        """
        Computes PRESSCV, MSECV, RMSECV and VALIDATED explained variances
        for zero up to ``self.numPC`` components from the cross validated
        predictions.
        """
        numObj = np.shape(self.arrX_input)[0]
        numVar = np.shape(self.arrX_input)[1]

        # -----------------------------------------------------------------
        # Collect all PRESSCV (PRediction Error Sum of Squares for cross
        # validation) in a dictionary. Keys represent number of component.
        # For zero components the prediction is the training set mean.
        self.PRESSCV_0_indVar_X = np.sum(
            np.square(self.arrX_input - self.X_train_means_arr), axis=0)
        self.PRESSCVdict_indVar_X = {0: self.PRESSCV_0_indVar_X}

        # Compute PRESSCV for each Xhat for 1, 2, 3, etc number of
        # components
        for ind, Xhat in enumerate(self.valXpredList):
            diffX = self.arrX_input - Xhat
            self.PRESSCVdict_indVar_X[ind + 1] = np.sum(np.square(diffX),
                                                        axis=0)

        # Now store all PRESSCV values into an array (rows in explicit order
        # of number of components). Then compute MSECV and RMSECV.
        self.PRESSCVarr_indVar_X = np.array(
            [self.PRESSCVdict_indVar_X[key]
             for key in range(len(self.PRESSCVdict_indVar_X))])
        self.MSECVarr_indVar_X = self.PRESSCVarr_indVar_X / numObj
        self.RMSECVarr_indVar_X = np.sqrt(self.MSECVarr_indVar_X)

        # -----------------------------------------------------------------
        # Compute explained variance for each variable in X using the
        # MSECV for each variable. Also collect PRESSCV, MSECV, RMSECV in
        # their respective dictionaries for each variable. Keys represent
        # now variables and NOT components as above with
        # self.PRESSCVdict_indVar_X
        MSECV_0_indVar_X = self.MSECVarr_indVar_X[0, :]
        self.cumValExplVarXarr_indVar = \
            (MSECV_0_indVar_X - self.MSECVarr_indVar_X) \
            / MSECV_0_indVar_X * 100

        self.PRESSCV_indVar_X = {}
        self.MSECV_indVar_X = {}
        self.RMSECV_indVar_X = {}
        self.cumValExplVarX_indVar = {}
        for ind in range(np.shape(self.PRESSCVarr_indVar_X)[1]):
            self.PRESSCV_indVar_X[ind] = self.PRESSCVarr_indVar_X[:, ind]
            self.MSECV_indVar_X[ind] = self.MSECVarr_indVar_X[:, ind]
            self.RMSECV_indVar_X[ind] = self.RMSECVarr_indVar_X[:, ind]
            self.cumValExplVarX_indVar[ind] = \
                self.cumValExplVarXarr_indVar[:, ind]

        # -----------------------------------------------------------------
        # Collect total PRESSCV across all variables in a dictionary.
        self.PRESSCV_total_list_X = np.sum(self.PRESSCVarr_indVar_X, axis=1)
        self.PRESSCV_total_dict_X = dict(
            enumerate(self.PRESSCV_total_list_X))

        # -----------------------------------------------------------------
        # Collect total MSECV across all variables in a dictionary. The
        # dictionary is filled regardless of Xstand, like the PRESSCV and
        # RMSECV dictionaries.
        self.MSECV_total_list_X = \
            np.sum(self.MSECVarr_indVar_X, axis=1) / numVar
        self.MSECV_total_dict_X = dict(enumerate(self.MSECV_total_list_X))
        MSECV_0_X = self.MSECV_total_list_X[0]

        # Compute total validated explained variance in X. With
        # standardised data it is the average over the individual variables.
        if not self.Xstand:
            self.XcumValExplVarList = [
                (MSECV_0_X - MSECV_X) / MSECV_0_X * 100
                for MSECV_X in self.MSECV_total_list_X]
        else:
            self.XcumValExplVarArr = np.average(
                self.cumValExplVarXarr_indVar, axis=1)
            self.XcumValExplVarList = list(self.XcumValExplVarArr)

        # Construct list with total validated explained variance in X in
        # each component (difference between consecutive cumulative values).
        self.XvalExplVarList = [
            after - before
            for before, after in zip(self.XcumValExplVarList,
                                     self.XcumValExplVarList[1:])]

        # -----------------------------------------------------------------
        # Compute total RMSECV and store values in a dictionary and list.
        self.RMSECV_total_list_X = np.sqrt(self.MSECV_total_list_X)
        self.RMSECV_total_dict_X = dict(enumerate(self.RMSECV_total_list_X))

    def modelSettings(self):
        """
        Returns a dictionary holding the settings under which NIPALS PCA was
        run.
        """
        # Collect settings under which PCA was run.
        self.settings = {}
        self.settings['numComp'] = self.numPC
        self.settings['Xstand'] = self.Xstand
        self.settings['arrX'] = self.arrX_input
        self.settings['analysed arrX'] = self.arrX

        return self.settings

    def X_means(self):
        """
        Returns array holding the column means of input array X.
        """
        return self.Xmeans.reshape(1, -1)

    def X_scores(self):
        """
        Returns array holding scores T. First column holds scores for
        component 1, second column holds scores for component 2, etc.
        """
        return self.arrT

    def X_loadings(self):
        """
        Returns array holding loadings P of array X. Rows represent variables
        and columns represent components. First column holds loadings for
        component 1, second column holds loadings for component 2, etc.
        """
        return self.arrP

    def X_corrLoadings(self):
        """
        Returns array holding correlation loadings of array X. First column
        holds correlation loadings for component 1, second column holds
        correlation loadings for component 2, etc.
        """
        # Creates empty matrix for correlation loadings
        arr_corrLoadings = np.zeros((np.shape(self.arrT)[1],
                                     np.shape(self.arrP)[0]), float)

        # Compute correlation loadings:
        # For each component in score matrix
        for PC in range(np.shape(self.arrT)[1]):
            PCscores = self.arrT[:, PC]

            # For each variable/attribute in the pre-processed matrix
            for var in range(np.shape(self.arrX)[1]):
                origVar = self.arrX[:, var]
                corrs = np.corrcoef(PCscores, origVar)
                arr_corrLoadings[PC, var] = corrs[0, 1]

        self.arr_corrLoadings = np.transpose(arr_corrLoadings)

        return self.arr_corrLoadings

    def X_residuals(self):
        """
        Returns a dictionary holding arrays of residuals for array X after
        each computed component. Dictionary key represents order of
        component.
        """
        return self.X_residualsDict

    def X_calExplVar(self):
        """
        Returns a list holding the calibrated explained variance for
        each component. First number in list is for component 1, second
        number for component 2, etc.
        """
        return self.XcalExplVarList

    def X_cumCalExplVar_indVar(self):
        """
        Returns an array holding the cumulative calibrated explained variance
        for each variable in X after each component. First row represents
        zero components, second row represents one component, third row
        represents two components, etc. Columns represent variables.
        """
        return self.cumCalExplVarXarr_indVar

    def X_cumCalExplVar(self):
        """
        Returns a list holding the cumulative calibrated explained variance
        for array X after each component. First number represents zero
        components, second number represents component 1, etc.
        """
        return self.XcumCalExplVarList

    def X_predCal(self):
        """
        Returns a dictionary holding the predicted arrays Xhat from
        calibration after each computed component. Dictionary key represents
        order of component.
        """
        return self.calXpredDict

    def X_PRESSE_indVar(self):
        """
        Returns array holding PRESSE for each individual variable in X
        acquired through calibration after each computed component. First row
        is PRESSE for zero components, second row for component 1, third row
        for component 2, etc.
        """
        return self.PRESSEarr_indVar_X

    def X_PRESSE(self):
        """
        Returns array holding PRESSE across all variables in X acquired
        through calibration after each computed component. First row is
        PRESSE for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.PRESSE_total_list_X

    def X_MSEE_indVar(self):
        """
        Returns an array holding MSEE for each variable in array X acquired
        through calibration after each computed component. First row holds
        MSEE for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.MSEEarr_indVar_X

    def X_MSEE(self):
        """
        Returns an array holding MSEE across all variables in X acquired
        through calibration after each computed component. First row is MSEE
        for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.MSEE_total_list_X

    def X_RMSEE_indVar(self):
        """
        Returns an array holding RMSEE for each variable in array X acquired
        through calibration after each component. First row holds RMSEE
        for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.RMSEEarr_indVar_X

    def X_RMSEE(self):
        """
        Returns an array holding RMSEE across all variables in X acquired
        through calibration after each computed component. First row is RMSEE
        for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.RMSEE_total_list_X

    def X_valExplVar(self):
        """
        Returns a list holding the validated explained variance for X after
        each component. First number in list is for component 1, second
        number for component 2, third number for component 3, etc.
        Only available when the model was computed with cross validation.
        """
        return self.XvalExplVarList

    def X_cumValExplVar_indVar(self):
        """
        Returns an array holding the cumulative validated explained variance
        for each variable in X after each component. First row represents
        zero components, second row represents component 1, third row for
        component 2, etc. Columns represent variables.
        Only available when the model was computed with cross validation.
        """
        return self.cumValExplVarXarr_indVar

    def X_cumValExplVar(self):
        """
        Returns a list holding the cumulative validated explained variance
        for array X after each component.
        Only available when the model was computed with cross validation.
        """
        return self.XcumValExplVarList

    def X_predVal(self):
        """
        Returns a dictionary holding the predicted arrays Xhat from
        validation after each computed component. Dictionary key represents
        order of component.
        Only available when the model was computed with cross validation.
        """
        return self.valXpredDict

    def X_PRESSCV_indVar(self):
        """
        Returns array holding PRESSCV for each individual variable in X
        acquired through cross validation after each computed component.
        First row is PRESSCV for zero components, second row for component 1,
        third row for component 2, etc.
        """
        return self.PRESSCVarr_indVar_X

    def X_PRESSCV(self):
        """
        Returns an array holding PRESSCV across all variables in X acquired
        through cross validation after each computed component. First row is
        PRESSCV for zero components, second row for component 1, third row
        for component 2, etc.
        """
        return self.PRESSCV_total_list_X

    def X_MSECV_indVar(self):
        """
        Returns an array holding MSECV for each variable in X acquired
        through cross validation. First row is MSECV for zero components,
        second row for component 1, etc.
        """
        return self.MSECVarr_indVar_X

    def X_MSECV(self):
        """
        Returns an array holding MSECV across all variables in X acquired
        through cross validation after each computed component. First row is
        MSECV for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.MSECV_total_list_X

    def X_RMSECV_indVar(self):
        """
        Returns an array holding RMSECV for each variable in X acquired
        through cross validation after each computed component. First row is
        RMSECV for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.RMSECVarr_indVar_X

    def X_RMSECV(self):
        """
        Returns an array holding RMSECV across all variables in X acquired
        through cross validation after each computed component. First row is
        RMSECV for zero components, second row for component 1, third row for
        component 2, etc.
        """
        return self.RMSECV_total_list_X

    def X_scores_predict(self, Xnew, numComp=None):
        """
        Returns array of X scores from new X data using the existing model.
        Rows represent objects and columns represent components.

        ``numComp`` defaults to the number of components of the model. An
        AssertionError is raised if it is larger than that number or
        negative.
        """
        if numComp is None:
            numComp = self.numPC

        # NOTE(review): asserts are skipped when Python runs with -O; kept as
        # is so that callers still see the same exception type.
        assert numComp <= self.numPC, ValueError('Maximum numComp = ' + str(self.numPC))
        assert numComp > -1, ValueError('numComp must be >= 0')

        # First pre-process new X data with the column means (and column
        # standard deviations) the model was computed with.
        if self.Xstand:
            x_new = (Xnew - self.Xmeans) / self.Xstd
        else:
            x_new = Xnew - self.Xmeans

        # Compute the scores for new object
        projT = np.dot(x_new, self.arrP[:, 0:numComp])

        return projT

    def cvTrainAndTestData(self):
        """
        Returns a list consisting of dictionaries holding training and test
        sets. Only available when the model was computed with cross
        validation.
        """
        return self.cvTrainAndTestDataList

    def corrLoadingsEllipses(self):
        """
        Returns a dictionary holding coordinates of ellipses that represent
        50% and 100% expl. variance in correlation loadings plot. The
        coordinates are stored in arrays.
        """
        # Create range for ellipses
        t = np.arange(0.0, 2 * np.pi, 0.01)

        # Computing the outer circle (100 % expl. variance)
        xcords100perc = np.cos(t)
        ycords100perc = np.sin(t)

        # Computing inner circle (radius 0.707, i.e. 50 % expl. variance)
        xcords50perc = 0.707 * np.cos(t)
        ycords50perc = 0.707 * np.sin(t)

        # Collect ellipse coordinates in dictionary
        ellipses = {}
        ellipses['x50perc'] = xcords50perc
        ellipses['y50perc'] = ycords50perc

        ellipses['x100perc'] = xcords100perc
        ellipses['y100perc'] = ycords100perc

        return ellipses