""" Utilities to help with pandas """ import pandas as pd import datetime import numpy as np from copy import copy from syscore.fileutils import get_filename_for_package from syscore.dateutils import BUSINESS_DAYS_IN_YEAR, time_matches, CALENDAR_DAYS_IN_YEAR, SECONDS_PER_DAY from syscore.objects import _named_object, data_error, arg_not_supplied from sysdata.private_config import get_private_then_default_key_value DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S" def turnover(x, y): """ Gives the turnover of x, once normalised for y Returned in annualised terms Assumes both x and y are daily business days """ if isinstance(y, float) or isinstance(y, int): y = pd.Series([float(y)] * len(x.index), x.index) norm_x = x / y.ffill() avg_daily = float(norm_x.diff().abs().resample("1B").sum().mean()) return avg_daily * BUSINESS_DAYS_IN_YEAR def uniquets(x): """ Makes x unique """ x = x.groupby(level=0).last() return x def df_from_list(data): """ Create a single data frame from list of data frames To preserve a unique time signature we add on 1..2..3... micro seconds to successive elements of the list WARNING: SO THIS METHOD WON'T WORK WITH HIGH FREQUENCY DATA! THIS WILL ALSO DESTROY ANY AUTOCORRELATION PROPERTIES """ if isinstance(data, list): column_names = sorted( set(sum([list(data_item.columns) for data_item in data], []))) # ensure all are properly aligned # note we don't check that all the columns match here new_data = [data_item[column_names] for data_item in data] # add on an offset for (offset_value, data_item) in enumerate(new_data): data_item.index = data_item.index + \ pd.Timedelta("%dus" % offset_value) # pooled # stack everything up new_data = pd.concat(new_data, axis=0) new_data = new_data.sort_index() else: # nothing to do here new_data = copy(data) return new_data def must_haves_from_list(data): must_haves_list = [must_have_item(data_item) for data_item in data] must_haves = list(set(sum(must_haves_list, []))) return must_haves def must_have_item(slice_data): """ Returns the columns of slice_data for which we have at least one non nan value :param slice_data: simData to get correlations from :type slice_data: pd.DataFrame :returns: list of bool >>> """ def _any_data(xseries): data_present = [not np.isnan(x) for x in xseries] return any(data_present) some_data = slice_data.apply(_any_data, axis=0) some_data_flags = list(some_data.values) return some_data_flags def pd_readcsv_frompackage(filename): """ Run pd_readcsv on a file in python :param args: List showing location in project directory of file eg systems, provided, tests.csv :type args: str :returns: pd.DataFrame """ full_filename = get_filename_for_package(filename) return pd_readcsv(full_filename) def pd_readcsv(filename, date_index_name="DATETIME", date_format=DEFAULT_DATE_FORMAT, input_column_mapping = None, skiprows=0, skipfooter=0): """ Reads a pandas data frame, with time index labelled package_name(/path1/path2.., filename :param filename: Filename with extension :type filename: str :param date_index_name: Column name of date index :type date_index_name: list of str :param date_format: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior :type date_format: str :param input_column_mapping: If supplied remaps column names in .csv file :type input_column_mapping: dict or None :param skiprows, skipfooter: passed to pd.read_csv :returns: pd.DataFrame """ ans = pd.read_csv(filename, skiprows=skiprows, skipfooter=skipfooter) ans.index = pd.to_datetime(ans[date_index_name], format=date_format).values del 
def fix_weights_vs_pdm(weights, pdm):
    """
    Take a matrix of weights and positions/forecasts (pdm)

    Ensure that the weights in each row add up to 1, for active
    positions/forecasts (not np.nan values after forward filling)

    This deals with the problem of different rules and/or instruments
    having different history

    :param weights: Weights to adjust
    :type weights: TxK pd.DataFrame (same columns as pdm, perhaps different length)

    :param pdm: Positions or forecasts
    :type pdm: TxK pd.DataFrame (same columns as weights, perhaps different length)

    :returns: TxK pd.DataFrame of adjusted weights
    """

    # forward fill forecasts/positions
    pdm_ffill = pdm.ffill()

    adj_weights = uniquets(weights)

    # resample weights onto the pdm index
    adj_weights = adj_weights.reindex(pdm_ffill.index, method='ffill')

    # ensure columns are aligned
    adj_weights = adj_weights[pdm.columns]

    # remove weights if nan forecast
    adj_weights[np.isnan(pdm_ffill)] = 0.0

    # change rows so weights add to one
    def _sum_row_fix(weight_row):
        swr = sum(weight_row)
        if swr == 0.0:
            return weight_row
        new_weights = weight_row / swr
        return new_weights

    adj_weights = adj_weights.apply(_sum_row_fix, axis=1)

    return adj_weights
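
# Illustrative sketch of fix_weights_vs_pdm (hypothetical data): two forecasts,
# one of which starts later. While the second forecast is nan its weight is
# zeroed, and the remaining weight is rescaled so each row sums to one.
#
#   idx = pd.date_range("2020-01-01", periods=4, freq="B")
#   weights = pd.DataFrame(dict(f1=[0.5] * 4, f2=[0.5] * 4), index=idx)
#   pdm = pd.DataFrame(dict(f1=[1.0] * 4, f2=[np.nan, np.nan, 2.0, 2.0]), index=idx)
#   fix_weights_vs_pdm(weights, pdm)
#   # f1 weight is 1.0 for the first two rows, then 0.5 once f2 has data
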
def drawdown(x):
    """
    Returns a ts of drawdowns for a time series x

    :param x: account curve (cumulated returns)
    :type x: pd.DataFrame or Series

    :returns: pd.DataFrame or Series
    """
    maxx = x.expanding(min_periods=1).max()
    return x - maxx


def from_dict_of_values_to_df(data_dict, ts_index, columns=None):
    """
    Turn a set of fixed values into a pd.DataFrame

    :param data_dict: A dict of scalars
    :param ts_index: A timeseries index
    :param columns: (optional) A list of str to align the column names to
        [must be entries in data_dict keys]
    :return: pd.DataFrame, column names from data_dict, values are repeated scalars
    """

    if columns is None:
        columns = data_dict.keys()

    columns_as_list = list(columns)

    numeric_values = dict([(keyname, [data_dict[keyname]] * len(ts_index))
                           for keyname in columns_as_list])

    pd_dataframe = pd.DataFrame(numeric_values, ts_index)

    return pd_dataframe


def create_arbitrary_pdseries(data_list,
                              date_start=datetime.datetime(1980, 1, 1),
                              freq="B"):
    """
    Return a pandas Series with an arbitrary date index

    :param data_list: simData
    :type data_list: list of floats or ints

    :param date_start: First date to use in index
    :type date_start: datetime

    :param freq: Frequency of date index
    :type freq: str of a type that pd.date_range will recognise

    :returns: pd.Series (same length as simData)

    >>> create_arbitrary_pdseries([1,2,3])
    1980-01-01    1
    1980-01-02    2
    1980-01-03    3
    Freq: B, dtype: int64
    """
    date_index = pd.date_range(
        start=date_start, periods=len(data_list), freq=freq)

    pdseries = pd.Series(data_list, index=date_index)

    return pdseries


def dataframe_pad(starting_df, column_list, padwith=0.0):
    """
    Takes a dataframe and adds extra columns if necessary so we end up
    with columns named column_list

    :param starting_df: A pd.DataFrame with named columns
    :param column_list: A list of column names
    :param padwith: The value to pad missing columns with
    :return: pd.DataFrame
    """

    def _pad_column(column_name, starting_df, padwith):
        if column_name in starting_df.columns:
            return starting_df[column_name]
        else:
            # pad with the requested value (was hardcoded to 0.0, ignoring padwith)
            return pd.Series([padwith] * len(starting_df.index),
                             starting_df.index)

    new_data = [
        _pad_column(column_name, starting_df, padwith)
        for column_name in column_list
    ]

    new_df = pd.concat(new_data, axis=1)
    new_df.columns = column_list

    return new_df


status_old_data = object()
status_new_data = object()
status_merged_data = object()


def merge_newer_data(old_data, new_data,
                     check_for_spike=True,
                     column_to_check=arg_not_supplied):
    """
    Merge new data with old data. Any new data that is older than the newest
    old data will be ignored

    If check_for_spike is True, returns data_error if the price moves too much
    at the join point

    :param old_data: pd.Series or DataFrame
    :param new_data: pd.Series or DataFrame
    :param check_for_spike: bool
    :param column_to_check: column name to check for spike

    :return: pd.Series or DataFrame
    """
    merge_status, first_date_in_new_data, merged_data = \
        merge_newer_data_no_checks(old_data, new_data)

    # check for spike
    if check_for_spike:
        spike_present, _ = spike_check_merged_data(
            merge_status,
            first_date_in_new_data,
            merged_data,
            column_to_check=column_to_check)
        if spike_present:
            return data_error

    return merged_data


def spike_check_merged_data(merge_status, first_date_in_new_data, merged_data,
                            column_to_check=arg_not_supplied):
    if merge_status is status_old_data:
        # no new data was added, so no checking needed
        return False, None

    if merge_status is status_new_data:
        # there was no old data, so check everything
        first_date_in_new_data = None

    spike_present, spike_date = _check_for_spike_in_data(
        merged_data, first_date_in_new_data, column_to_check=column_to_check)

    return spike_present, spike_date


def merge_newer_data_no_checks(old_data, new_data):
    """
    Merge new data with old data. Any new data that is older than the newest
    old data will be ignored

    Also returns the merge status, and the first date of the newly added data

    :param old_data: pd.Series or DataFrame
    :param new_data: pd.Series or DataFrame

    :return: status, first_date_in_new_data: datetime.datetime or None,
        merged_data: pd.Series or DataFrame
    """
    if len(old_data.index) == 0:
        return status_new_data, None, new_data
    if len(new_data.index) == 0:
        return status_old_data, None, old_data

    last_date_in_old_data = old_data.index[-1]
    # sort_index returns a new object: the original discarded this result
    new_data = new_data.sort_index()
    actually_new_data = new_data[new_data.index > last_date_in_old_data]

    if len(actually_new_data) == 0:
        # No additional data
        return status_old_data, None, old_data

    first_date_in_new_data = actually_new_data.index[0]

    merged_data = pd.concat([old_data, actually_new_data], axis=0)
    merged_data = merged_data.sort_index()

    # remove duplicates (shouldn't be any, but...)
    merged_data = merged_data[~merged_data.index.duplicated(keep='first')]

    return status_merged_data, first_date_in_new_data, merged_data
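
# Illustrative sketch of the merge behaviour (hypothetical data; spike checking
# is switched off here since it needs the max_price_spike config value):
#
#   old = pd.Series([1.0, 2.0], index=pd.to_datetime(["2020-01-01", "2020-01-02"]))
#   new = pd.Series([9.0, 3.0], index=pd.to_datetime(["2020-01-02", "2020-01-03"]))
#   merge_newer_data(old, new, check_for_spike=False)
#   # keeps the old 2020-01-02 value (2.0) and appends only 2020-01-03 (3.0)
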
def _check_for_spike_in_data(merged_data, first_date_in_new_data=None,
                             column_to_check=arg_not_supplied):
    # Returns tuple: bool, logical date of spike (or None)
    first_spike = _first_spike_in_data(
        merged_data, first_date_in_new_data, column_to_check=column_to_check)

    if first_spike is None:
        spike_exists = False
    else:
        spike_exists = True

    return spike_exists, first_spike


def _first_spike_in_data(merged_data, first_date_in_new_data=None,
                         column_to_check=arg_not_supplied):
    """
    Checks to see if any data after first_date_in_new_data has spikes

    :param merged_data: pd.Series or DataFrame
    :return: date of the first spike, else None
    """
    max_spike = get_private_then_default_key_value("max_price_spike")
    col_list = getattr(merged_data, "columns", None)
    if col_list is None:
        # already a series
        data_to_check = merged_data
    else:
        if column_to_check is arg_not_supplied:
            column_to_check = col_list[0]
        data_to_check = merged_data[column_to_check]

    # Calculate the average change per day
    change_pd = average_change_per_day(data_to_check)

    # absolute size is what matters
    abs_change_pd = change_pd.abs()
    # hard to know what span to use here as could be daily, intraday or a mixture
    avg_abs_change = abs_change_pd.ewm(span=500).mean()
    change_in_avg_units = abs_change_pd / avg_abs_change

    if first_date_in_new_data is None:
        # no merge date, so we check it all
        data_to_check = change_in_avg_units
    else:
        data_to_check = change_in_avg_units[first_date_in_new_data:]

    if any(data_to_check > max_spike):
        return data_to_check.index[data_to_check > max_spike][0]
    else:
        return None


def average_change_per_day(data_to_check):
    data_diff = data_to_check.diff()[1:]
    index_diff = data_to_check.index[1:] - data_to_check.index[:-1]
    index_diff_days = [diff.total_seconds() / SECONDS_PER_DAY
                       for diff in index_diff]

    # scale each change by the square root of the elapsed time in days, so a
    # move over a weekend gap is not treated like a one day move
    change_per_day = [diff / (diff_days ** .5)
                      for diff, diff_days in zip(data_diff.values, index_diff_days)]

    change_pd = pd.Series(change_per_day, index=data_to_check.index[1:])

    return change_pd
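
# Worked example for average_change_per_day (hedged, hypothetical numbers):
# a fall of 2.0 over a Friday-to-Monday gap of 3 calendar days is recorded
# as -2.0 / 3 ** 0.5 ~= -1.15, rather than -2.0.
#
#   idx = pd.to_datetime(["2020-01-03", "2020-01-06"])  # Friday, then Monday
#   average_change_per_day(pd.Series([10.0, 8.0], index=idx))
#   # 2020-01-06   -1.1547
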
def full_merge_of_existing_data(old_data, new_data):
    """
    Merges old data with new data.
    Any Nan in the existing data will be replaced (be careful!)

    :param old_data: pd.DataFrame
    :param new_data: pd.DataFrame

    :returns: pd.DataFrame
    """
    old_columns = old_data.columns

    merged_data = {}
    for colname in old_columns:
        old_series = copy(old_data[colname])
        try:
            new_series = copy(new_data[colname])
        except KeyError:
            # missing from new data, so we just take the old column
            # (was merged_data[colname] = old_data, storing the whole frame)
            merged_data[colname] = old_series
            continue

        merged_series = full_merge_of_existing_series(old_series, new_series)
        merged_data[colname] = merged_series

    merged_data_as_df = pd.DataFrame(merged_data)
    merged_data_as_df = merged_data_as_df.sort_index()

    return merged_data_as_df


def full_merge_of_existing_series(old_series, new_series):
    """
    Merges an old series with a new series.
    Any Nan in the existing data will be replaced (be careful!)

    :param old_series: pd.Series
    :param new_series: pd.Series

    :returns: pd.Series
    """
    if len(old_series) == 0:
        return new_series
    if len(new_series) == 0:
        return old_series

    joint_data = pd.concat([old_series, new_series], axis=1)
    joint_data.columns = ['original', 'new']

    # fill to the left: gaps in the original are taken from the new data
    joint_data_filled_across = joint_data.bfill(axis=1)
    merged_data = joint_data_filled_across['original']

    return merged_data
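
# Illustrative sketch (hypothetical data): NaN in the old series is filled
# from the new series, but values that already exist are kept.
#
#   old = pd.Series([1.0, np.nan], index=[0, 1])
#   new = pd.Series([9.0, 2.0], index=[0, 1])
#   full_merge_of_existing_series(old, new)
#   # 0    1.0   (existing value kept, 9.0 ignored)
#   # 1    2.0   (NaN replaced from the new series)
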
all_labels_match = _named_object("all labels match")
mismatch_on_last_day = _named_object("mismatch_on_last_day")
original_index_matches_new = _named_object("original index matches new")


def merge_data_series_with_label_column(original_data, new_data,
                                        col_names=dict(data='PRICE',
                                                       label='PRICE_CONTRACT')):
    """
    For two pd.DataFrames with 2 columns, including a label column, update
    the data when the labels start consistently matching

    >>> s1=pd.DataFrame(dict(PRICE=[1,2,3,np.nan], PRICE_CONTRACT = ["a", "a", "b", "b"]), index=['a1','a2','a3','a4'])
    >>> s2=pd.DataFrame(dict(PRICE=[ 7,3,4], PRICE_CONTRACT = [ "b", "b", "b"]), index=['a2','a3','a4'])
    >>> merge_data_series_with_label_column(s1,s2)
        PRICE  PRICE_CONTRACT
    a1    1.0               a
    a2    2.0               a
    a3    3.0               b
    a4    4.0               b
    >>> s2=pd.DataFrame(dict(PRICE=[ 2,5,4], PRICE_CONTRACT = [ "b", "b", "b"]), index=['a2','a3','a4'])
    >>> merge_data_series_with_label_column(s1,s2)
        PRICE  PRICE_CONTRACT
    a1    1.0               a
    a2    2.0               a
    a3    3.0               b
    a4    4.0               b
    >>> s2=pd.DataFrame(dict(PRICE=[ 2,3,np.nan], PRICE_CONTRACT = [ "b", "b", "b"]), index=['a2','a3','a4'])
    >>> merge_data_series_with_label_column(s1,s2)
        PRICE  PRICE_CONTRACT
    a1    1.0               a
    a2    2.0               a
    a3    3.0               b
    a4    NaN               b
    >>> s1=pd.DataFrame(dict(PRICE=[1,np.nan,3,np.nan], PRICE_CONTRACT = ["a", "a", "b", "b"]), index=['a1','a2','a3','a4'])
    >>> s2=pd.DataFrame(dict(PRICE=[ 2, 3,4], PRICE_CONTRACT = [ "a", "b", "b"]), index=['a2','a3','a4'])
    >>> merge_data_series_with_label_column(s1,s2)
        PRICE  PRICE_CONTRACT
    a1    1.0               a
    a2    2.0               a
    a3    3.0               b
    a4    4.0               b
    >>> s1=pd.DataFrame(dict(PRICE=[1,np.nan,np.nan], PRICE_CONTRACT = ["a", "a", "b"]), index=['a1','a2','a3'])
    >>> s2=pd.DataFrame(dict(PRICE=[ 2, 3,4], PRICE_CONTRACT = [ "b", "b", "b"]), index=['a2','a3','a4'])
    >>> merge_data_series_with_label_column(s1,s2)
        PRICE  PRICE_CONTRACT
    a1    1.0               a
    a2    NaN               a
    a3    3.0               b
    a4    4.0               b
    >>> s2=pd.DataFrame(dict(PRICE=[ 2, 3,4], PRICE_CONTRACT = [ "b", "c", "c"]), index=['a2','a3','a4'])
    >>> merge_data_series_with_label_column(s1,s2)
        PRICE  PRICE_CONTRACT
    a1    1.0               a
    a2    NaN               a
    a3    NaN               b

    :param original_data: a pd.DataFrame with two columns, equal to col_names
    :param new_data: a pd.DataFrame with the same two columns
    :param col_names: dict of str
    :return: pd.DataFrame with two columns
    """

    if len(new_data) == 0:
        return original_data
    if len(original_data) == 0:
        return new_data

    # From the date after this, can happily merge new and old data
    match_data = find_dates_when_label_changes(original_data, new_data,
                                               col_names=col_names)

    if match_data is mismatch_on_last_day:
        # No matching is possible
        return original_data
    elif match_data is original_index_matches_new:
        first_date_after_series_mismatch = original_data.index[0]
        last_date_when_series_mismatch = original_index_matches_new
    else:
        first_date_after_series_mismatch, last_date_when_series_mismatch = match_data

    # Concat the two price series together, fill to the left
    # This will replace any NA values in existing prices with new ones
    label_column = col_names['label']
    data_column = col_names['data']

    merged_data = full_merge_of_existing_series(
        original_data[data_column][first_date_after_series_mismatch:],
        new_data[data_column][first_date_after_series_mismatch:])

    labels_in_new_data = new_data[last_date_when_series_mismatch:][label_column]
    labels_in_old_data = original_data[:first_date_after_series_mismatch][label_column]
    labels_in_merged_data = pd.concat([labels_in_old_data, labels_in_new_data],
                                      axis=0)
    labels_in_merged_data = labels_in_merged_data.loc[
        ~labels_in_merged_data.index.duplicated(keep='first')]
    labels_in_merged_data_reindexed = labels_in_merged_data.reindex(merged_data.index)

    labelled_merged_data = pd.concat([labels_in_merged_data_reindexed, merged_data],
                                     axis=1)
    labelled_merged_data.columns = [label_column, data_column]

    # for older data, keep the older data
    if last_date_when_series_mismatch is original_index_matches_new:
        current_and_merged_data = labelled_merged_data
    else:
        original_data_to_use = original_data[:last_date_when_series_mismatch]

        # Merged data is the old data, and then the new data
        current_and_merged_data = pd.concat([original_data_to_use,
                                             labelled_merged_data], axis=0)

    return current_and_merged_data


def find_dates_when_label_changes(original_data, new_data,
                                  col_names=dict(data='PRICE',
                                                 label='PRICE_CONTRACT')):
    """
    For two pd.DataFrames with 2 columns, including a label column, find the
    date after which the labelling is consistent across columns

    >>> s1=pd.DataFrame(dict(PRICE=[1,2,3,np.nan], PRICE_CONTRACT = ["a", "a", "b", "b"]), index=['a1','a2','a3','a4'])
    >>> s2=pd.DataFrame(dict(PRICE=[ 2,3,4], PRICE_CONTRACT = [ "b", "b", "b"]), index=['a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    ('a3', 'a2')
    >>> s2=pd.DataFrame(dict(PRICE=[ 2,3,4], PRICE_CONTRACT = [ "a", "b", "b"]), index=['a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    ('a2', 'a1')
    >>> s2=pd.DataFrame(dict(PRICE=[ 2,3,4], PRICE_CONTRACT = [ "c", "c", "c"]), index=['a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    mismatch_on_last_day
    >>> find_dates_when_label_changes(s1, s1)
    original index matches new
    >>> s2=pd.DataFrame(dict(PRICE=[1, 2,3,4], PRICE_CONTRACT = ["a","c", "c", "c"]), index=['a1','a2','a3','a4'])
    >>> find_dates_when_label_changes(s1, s2)
    mismatch_on_last_day

    :param original_data: some data
    :param new_data: some new data
    :param col_names: dict of str
    :return: tuple of index values, or a named object if no match was possible
    """
    label_column = col_names['label']

    joint_labels = pd.concat([original_data[label_column],
                              new_data[label_column]], axis=1)
    joint_labels.columns = ['current', 'new']
    joint_labels = joint_labels.sort_index()

    new_data_start = new_data.index[0]

    existing_labels_in_new_period = joint_labels['current'][new_data_start:].ffill()
    new_labels_in_new_period = joint_labels['new'][new_data_start:].ffill()

    # Find the last date when the labels didn't match, and the first date after that
    match_data = find_dates_when_series_starts_matching(
        existing_labels_in_new_period, new_labels_in_new_period)

    if match_data is mismatch_on_last_day:
        # Can't use any of the new data
        return mismatch_on_last_day

    elif match_data is all_labels_match:
        # Can use the entire series because all labels match
        if new_data.index[0] == original_data.index[0]:
            # They are the same size, so have to use the whole of the original data
            return original_index_matches_new
        else:
            # All the new data matches
            first_date_after_series_mismatch = new_data_start
            last_date_when_series_mismatch = original_data.index[
                original_data.index < new_data_start][-1]
    else:
        first_date_after_series_mismatch, last_date_when_series_mismatch = match_data

    return first_date_after_series_mismatch, last_date_when_series_mismatch
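
# Hedged usage note: in this codebase frames like these typically hold a price
# and the contract it was sampled from (PRICE / PRICE_CONTRACT), so new prices
# are only spliced in once both sources agree on which contract is being
# priced. The variable names below are hypothetical:
#
#   updated_prices = merge_data_series_with_label_column(stored_prices, fresh_prices)
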
def find_dates_when_series_starts_matching(series1, series2):
    """
    Find the last index value where series1 and series2 didn't match, and the
    first index after that

    The two series must have matching indices, and so be the same length

    >>> s1=pd.Series(["a", "b", "b", "b"], index=[1,2,3,4])
    >>> s2=pd.Series(["c", "b", "b", "b"], index=[1,2,3,4])
    >>> find_dates_when_series_starts_matching(s1, s2)
    (2, 1)
    >>> s2=pd.Series(["a", "a", "b", "b"], index=[1,2,3,4])
    >>> find_dates_when_series_starts_matching(s1, s2)
    (3, 2)
    >>> s2=pd.Series(["a", "b", "a", "b"], index=[1,2,3,4])
    >>> find_dates_when_series_starts_matching(s1, s2)
    (4, 3)
    >>> s2=pd.Series(["a", "b", "b", "b"], index=[1,2,3,4])
    >>> find_dates_when_series_starts_matching(s1, s2)
    all labels match
    >>> s2=pd.Series(["a", "b", "b", "c"], index=[1,2,3,4])
    >>> find_dates_when_series_starts_matching(s1, s2)
    mismatch_on_last_day

    :param series1: pd.Series
    :param series2: pd.Series
    :return: 2-tuple of index values, or a named object if there is no
        usable match point
    """

    # Data is same length and timestamps match, so equality of values is sufficient
    period_equal = [x == y for x, y in zip(series1.values, series2.values)]

    if all(period_equal):
        return all_labels_match

    if not period_equal[-1]:
        return mismatch_on_last_day

    # We want the last False value
    period_equal.reverse()
    first_false_in_reversed_list = period_equal.index(False)
    last_true_before_first_false_in_reversed_list = first_false_in_reversed_list - 1

    reversed_time_index = series1.index[::-1]
    last_true_before_first_false_in_reversed_list_date = reversed_time_index[
        last_true_before_first_false_in_reversed_list]
    first_false_in_reversed_list_date = reversed_time_index[first_false_in_reversed_list]

    first_date_after_series_mismatch = last_true_before_first_false_in_reversed_list_date
    last_date_when_series_mismatch = first_false_in_reversed_list_date

    return first_date_after_series_mismatch, last_date_when_series_mismatch


def proportion_pd_object_intraday(data,
                                  closing_time=pd.DateOffset(hours=23, minutes=0, seconds=0)):
    """
    Return the proportion of intraday data in a pd.Series or DataFrame

    :param data: the underlying data
    :param closing_time: the time of day which we are using as a closing time
    :return: float, the proportion of data.index entries that do not match the
        closing timestamp. So 0 = all daily data, 1 = all intraday data
    """

    data_index = data.index
    length_index = len(data_index)

    count_matches = [time_matches(index_entry, closing_time)
                     for index_entry in data_index]
    total_matches = sum(count_matches)
    proportion_matching_close = float(total_matches) / float(length_index)
    proportion_intraday = 1 - proportion_matching_close

    return proportion_intraday


def strip_out_intraday(data,
                       closing_time=pd.DateOffset(hours=23, minutes=0, seconds=0)):
    """
    Return a pd.Series or DataFrame with only the times matching closing_time

    Used when we have a mix of daily and intraday data, where the daily data
    has been given a nominal (closing) timestamp

    :param data: pd object
    :param closing_time: pd.DateOffset giving the nominal closing time
    :return: pd object
    """

    data_index = data.index

    daily_matches = [time_matches(index_entry, closing_time)
                     for index_entry in data_index]

    return data[daily_matches]
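
# Illustrative sketch (hypothetical timestamps, assuming time_matches compares
# the time of day): daily closes carry the nominal 23:00:00 stamp, so stripping
# drops the intraday observation and keeps only the close.
#
#   idx = pd.to_datetime(["2020-01-01 14:30:00", "2020-01-01 23:00:00"])
#   prices = pd.Series([99.5, 100.0], index=idx)
#   strip_out_intraday(prices)   # keeps only the 23:00:00 row
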
def minimum_many_years_of_data_in_dataframe(data):
    """
    The smallest number of years of non NA data across the columns of data
    """
    years_of_data_dict = how_many_years_of_data_in_dataframe(data)
    years_of_data_values = years_of_data_dict.values()
    min_years_of_data = min(years_of_data_values)

    return min_years_of_data


def how_many_years_of_data_in_dataframe(data):
    """
    How many years of non NA data do we have?
    Assumes a daily timestamp

    :param data: pd.DataFrame with labelled columns
    :return: dict of floats, keyed by column name
    """
    result_dict = dict(data.apply(how_many_years_of_data_in_pd_series, axis=0))

    return result_dict


def how_many_years_of_data_in_pd_series(data_series):
    """
    How many years of actual data do we have?
    Assumes a daily timestamp which is fairly regular

    :param data_series: pd.Series
    :return: float
    """
    first_valid_date = data_series.first_valid_index()
    last_valid_date = data_series.last_valid_index()

    date_difference = last_valid_date - first_valid_date
    date_difference_days = date_difference.days
    date_difference_years = float(date_difference_days) / CALENDAR_DAYS_IN_YEAR

    return date_difference_years


def check_df_equals(x, y):
    try:
        pd.testing.assert_frame_equal(x, y)
        return True
    except AssertionError:
        return False


def check_ts_equals(x, y):
    try:
        pd.testing.assert_series_equal(x, y, check_names=False)
        return True
    except AssertionError:
        return False


def make_df_from_list_of_named_tuple(tuple_class, list_of_tuples):
    """
    Turn a list of named tuples into a pd.DataFrame, indexed by the first
    field of the tuple
    """
    elements = tuple_class._fields
    dict_of_elements = {}
    for element_name in elements:
        this_element_values = [getattr(list_entry, element_name)
                               for list_entry in list_of_tuples]
        dict_of_elements[element_name] = this_element_values

    pdf = pd.DataFrame(dict_of_elements)
    pdf.index = pdf[elements[0]]
    pdf = pdf.drop(elements[0], axis=1)

    return pdf


if __name__ == '__main__':
    import doctest
    doctest.testmod()
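
# Illustrative usage of make_df_from_list_of_named_tuple (a hedged sketch;
# the tuple class and values below are hypothetical):
#
#   from collections import namedtuple
#   Fill = namedtuple("Fill", ["date", "qty", "price"])
#   fills = [Fill("2020-01-01", 1, 100.0), Fill("2020-01-02", -1, 101.0)]
#   make_df_from_list_of_named_tuple(Fill, fills)
#   # DataFrame indexed by date, with columns qty and price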