"""
Event study toolkit to evaluate events or signals.

The main event study functions accept a pandas DataFrame containing the
price data of the observed projects and the benchmark (``data``) and a
DataFrame of events (``ev_data``) with the date of occurrence in the
index and the name of the project in a ``symbol`` column for every date.
"""
import numpy as np
import pandas as pd
from datetime import timedelta
from scipy import stats

# Plotting and notebook display are optional dependencies: the numeric
# routines below work without them, so a missing package only disables
# the corresponding output instead of breaking the module import.
try:
    import matplotlib.pyplot as pyplot
except ImportError:  # headless / minimal environments
    pyplot = None
try:
    from IPython.display import display
except ImportError:  # running outside a notebook
    display = print

FIGURE_WIDTH = 20
FIGURE_HEIGHT = 7
FIGURE_SIZE = [FIGURE_WIDTH, FIGURE_HEIGHT]


def get_close_price(data, sid, current_date, day_number):
    """Return the close price of ``sid`` ``day_number`` days after ``current_date``.

    If the target date lies beyond the available history, the last
    available close price is returned instead.
    """
    if day_number == 0:
        # Day 0 is simply the indexed date itself.
        return data.loc[current_date][sid]
    total_date_index_length = len(data.index)
    # Position of the closest date at/after the target date.
    date_index = data.index.searchsorted(current_date + timedelta(day_number))
    # Clamp to the latest available date when the target is out of range.
    if date_index >= total_date_index_length:
        date_index = total_date_index_length - 1
    return data.iloc[date_index][sid]


def get_first_price(data, starting_point, sid, date):
    """Return the close price of ``sid`` ``starting_point`` days before ``date``."""
    starting_day = date - timedelta(starting_point)
    date_index = data.index.searchsorted(starting_day)
    return data.iloc[date_index][sid]


def remove_outliers(returns, num_std_devs):
    """Drop observations more than ``num_std_devs`` standard deviations from the mean."""
    return returns[~((returns - returns.mean()).abs() > num_std_devs * returns.std())]


def get_returns(data, starting_point, sid, date, day_num):
    """Return the simple return of ``sid`` from ``starting_point`` days before
    ``date`` to ``day_num`` days after it. Returns 0 when the start price is 0
    (missing data) to avoid division by zero."""
    first_price = get_first_price(data, starting_point, sid, date)
    close_price = get_close_price(data, sid, date, day_num)
    if first_price == 0:
        return 0
    return (close_price - first_price) / (first_price + 0.0)


def calc_beta(stock, benchmark, price_history):
    """
    Calculate the beta of ``stock`` against ``benchmark`` by regressing
    daily percentage returns. Returns None when fewer than two aligned
    observations are available.
    """
    stock_prices = price_history[stock].pct_change().dropna()
    bench_prices = price_history[benchmark].pct_change().dropna()
    aligned_prices = bench_prices.align(stock_prices, join='inner')
    bench_prices = np.asarray(aligned_prices[0].values).reshape(-1)
    stock_prices = np.asarray(aligned_prices[1].values).reshape(-1)
    # linregress requires at least two points; the original == 0 guard
    # still crashed on a single observation.
    if len(stock_prices) < 2:
        return None
    slope, intercept, r_value, p_value, stderr = stats.linregress(bench_prices, stock_prices)
    return slope


def build_x_ticks(day_numbers, step=2):
    """Return tick positions: every ``step``-th day number (default every 2nd)."""
    return [d for d in day_numbers if d % step == 0]


def plot_cumulative_returns(returns, x_ticks, events):
    """Plot the average cumulative return around the events."""
    pyplot.figure(figsize=FIGURE_SIZE)
    returns.plot(xticks=x_ticks, label="events=%s" % events)
    pyplot.title("Cumulative Return from Events")
    pyplot.xlabel("Window Length (t)")
    pyplot.ylabel("Cumulative Return (r)")
    # Positional True: the `b=` keyword was removed in matplotlib 3.6.
    pyplot.grid(True, which=u'major', axis=u'y')
    pyplot.legend()


def plot_average_returns(returns, benchmark_returns, x_ticks):
    """Plot event cumulative returns next to the benchmark's returns."""
    pyplot.figure(figsize=FIGURE_SIZE)
    returns.plot(xticks=x_ticks, label="Cumulative Return from Events")
    benchmark_returns.plot(xticks=x_ticks, label='Benchmark')
    pyplot.title("Benchmark's average returns around that time to Signals_Events")
    pyplot.ylabel("% Cumulative Return")
    pyplot.xlabel("Time Window")
    pyplot.grid(True, which=u'major', axis=u'y')
    pyplot.legend()


def plot_cumulative_abnormal_returns(returns, abnormal_returns, x_ticks):
    """Plot cumulative returns against beta-adjusted (abnormal) cumulative returns."""
    pyplot.figure(figsize=FIGURE_SIZE)
    returns.plot(xticks=x_ticks, label="Average Cumulative Returns")
    abnormal_returns.plot(xticks=x_ticks, label="Abnormal Average Cumulative Returns")
    # Horizontal reference lines: abnormal return at day 0 ("drift") and its peak.
    pyplot.axhline(
        y=abnormal_returns.loc[0],
        linestyle='--',
        color='black',
        alpha=.3,
        label='Drift'
    )
    pyplot.axhline(
        y=abnormal_returns.max(),
        linestyle='--',
        color='black',
        alpha=.3
    )
    pyplot.title("Cumulative Abnormal Returns versus Cumulative Returns")
    pyplot.ylabel("% Cumulative Return")
    pyplot.xlabel("Time Window")
    pyplot.grid(True, which=u'major', axis=u'y')
    pyplot.legend()


def plot_cumulative_return_with_errors(returns, std_devs, events):
    """
    Plot the cumulative return graph with error bars (one std dev per day).
    """
    pyplot.figure(figsize=FIGURE_SIZE)
    pyplot.errorbar(returns.index, returns, xerr=0, yerr=std_devs, label="events=%s" % events)
    pyplot.grid(True, which=u'major', axis=u'y')
    pyplot.title("Cumulative Return from Events with error")
    pyplot.xlabel("Window Length (t)")
    pyplot.ylabel("Cumulative Return (r)")
    pyplot.legend()
    pyplot.show()


def plot_abnormal_cumulative_return_with_errors(abnormal_volatility, abnormal_returns, events):
    """
    Plot abnormal cumulative returns with error bars capturing their volatility.
    """
    pyplot.figure(figsize=FIGURE_SIZE)
    pyplot.errorbar(
        abnormal_returns.index,
        abnormal_returns,
        xerr=0,
        yerr=abnormal_volatility,
        label="events=%s" % events
    )
    pyplot.grid(True, which=u'major', axis=u'y')
    pyplot.title("Abnormal Cumulative Return from Events with error")
    pyplot.xlabel("Window Length (t)")
    pyplot.ylabel("Cumulative Return (r)")
    pyplot.legend()
    pyplot.show()


def build_day_numbers(starting_point):
    """
    Create the range of day numbers used to calculate returns, from
    -starting_point to +starting_point (upper endpoint exclusive, as the
    rest of the module expects).
    """
    return list(range(-starting_point, starting_point))


def get_price_history(data, date, beta_window, sid, benchmark):
    """
    Return the price history (columns ``sid`` and ``benchmark``) used to
    estimate beta for an event on ``date``.

    When ``beta_window`` is falsy (the default None), the full history from
    the first non-zero price of ``sid`` up to ``date`` is used; otherwise
    only the ``beta_window`` rows immediately before ``date``. Rows with a
    zero price are dropped.

    NOTE(fix): the original branch order was inverted, so the default
    ``beta_window=None`` crashed with ``history_index - None`` and a given
    window was silently ignored.
    """
    history_index = data.index.searchsorted(date)
    if not beta_window:
        # Full history: start at the first date where sid has a price.
        history_index_start = data.index.searchsorted(data[data[sid] != 0].index[0])
        historical_prices = data.iloc[history_index_start:history_index][[sid, benchmark]]
    else:
        history_index_start = max(history_index - beta_window, 0)
        historical_prices = data.iloc[history_index_start:history_index][[sid, benchmark]]
        historical_prices = historical_prices[historical_prices[sid] != 0]
    return historical_prices[historical_prices != 0].dropna()


def compute_return_matrix(ev_data, data, sample_size, starting_point, day_num,
                          benchmark, returns, benchmark_returns,
                          abnormal_returns, beta_window):
    """
    Compute, for every event, the project return, the benchmark return and
    the beta-adjusted abnormal return at offset ``day_num``, appending them
    to the supplied lists. Returns the updated sample size.
    """
    for date, row in ev_data.iterrows():
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            continue
        # Use a per-event benchmark so substituting it for a self-referencing
        # event does not leak into later iterations (the original mutated
        # the `benchmark` argument inside the loop).
        bench = benchmark
        if sid == 'ethereum' and benchmark == 'ethereum':
            bench = 'bitcoin'
        elif sid == 'bitcoin' and benchmark == 'bitcoin':
            bench = 'ethereum'
        project_return = get_returns(data, starting_point, sid, date, day_num)
        benchmark_return = get_returns(data, starting_point, bench, date, day_num)
        returns.append(project_return)
        benchmark_returns.append(benchmark_return)
        sample_size += 1
        beta = calc_beta(sid, bench, get_price_history(data, date, beta_window, sid, bench))
        if beta is None:
            continue
        abnormal_returns.append(project_return - (beta * benchmark_return))
    return sample_size


def compute_averages(ev_data, data, starting_point, day_numbers, benchmark,
                     all_returns, all_std_devs, total_sample_size,
                     all_benchmark_returns, abnormal_volatility,
                     all_abnormal_returns, beta_window):
    """
    Compute the average returns and standard deviations of the events for
    every day offset, filling the supplied per-day dictionaries in place.
    """
    for day_num in day_numbers:
        returns = []
        benchmark_returns = []
        abnormal_returns = []
        sample_size = compute_return_matrix(ev_data, data, 0, starting_point,
                                            day_num, benchmark, returns,
                                            benchmark_returns, abnormal_returns,
                                            beta_window)
        # Outliers beyond 2 standard deviations are discarded before averaging.
        returns = remove_outliers(pd.Series(returns).dropna(), 2)
        abnormal_returns = remove_outliers(pd.Series(abnormal_returns).dropna(), 2)
        all_returns[day_num] = np.average(returns)
        all_std_devs[day_num] = np.std(returns)
        total_sample_size[day_num] = sample_size
        all_benchmark_returns[day_num] = np.average(pd.Series(benchmark_returns).dropna())
        abnormal_volatility[day_num] = np.std(abnormal_returns)
        all_abnormal_returns[day_num] = np.average(abnormal_returns)


def clean_data(data, events, starting_point):
    """
    Drop signals that do not have enough pricing data: events whose date or
    symbol is missing from ``data``, or whose price window around the event
    is empty or contains a zero price.
    """
    events_df = events.copy(deep=True)
    events_df['in_pricesdf'] = 0
    for row_id, (date, row) in enumerate(events_df.iterrows()):
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            events_df.iloc[row_id, -1] = 1
            continue
        event_day = data.index.searchsorted(date)
        # A negative start yields an empty slice, so events too close to the
        # beginning of the history are flagged below (desired behaviour).
        hist_index_start = event_day - starting_point
        hist_index_end = event_day + starting_point
        event_window = data.iloc[hist_index_start:hist_index_end][[sid]]
        if len(event_window) == 0 or event_window.min().iloc[0] == 0:
            events_df.iloc[row_id, -1] = 1
    return events_df[events_df['in_pricesdf'] == 0]


def event_study(data, events, starting_point=30, benchmark='bitcoin',
                origin_zero=True, beta_window=None):
    """
    Run the full event study: clean the events, compute per-day average,
    benchmark and abnormal returns over a +/- ``starting_point`` window,
    and plot the results.
    """
    ev_data = clean_data(data, events, starting_point)
    all_returns = {}
    all_std_devs = {}
    all_benchmark_returns = {}
    all_abnormal_returns = {}
    abnormal_volatility = {}
    total_sample_size = {}
    day_numbers = build_day_numbers(starting_point)
    compute_averages(ev_data, data, starting_point, day_numbers, benchmark,
                     all_returns, all_std_devs, total_sample_size,
                     all_benchmark_returns, abnormal_volatility,
                     all_abnormal_returns, beta_window)
    plotting_events(day_numbers, all_returns, all_benchmark_returns,
                    all_abnormal_returns, all_std_devs, abnormal_volatility,
                    total_sample_size, origin_zero)


def signals_format(signals, project):
    """
    Return signals in the needed format.

    Accepts a column with the signals as boolean values and the project's
    name as a string; returns a one-column ``symbol`` DataFrame keeping only
    the rows where the signal fired.
    """
    sign = pd.DataFrame(signals).tz_convert(None)
    sign.columns = ['symbol']
    sign = sign.replace(True, project)
    events_ = sign[sign["symbol"] == project]
    return events_


def plotting_events(day_numbers, all_returns, all_benchmark_returns,
                    all_abnormal_returns, all_std_devs, abnormal_volatility,
                    total_sample_size, origin_zero):
    """
    Convert the per-day dictionaries to Series, optionally re-base everything
    to zero at day 0, and draw the full set of event-study plots.
    """
    all_returns = pd.Series(all_returns)
    all_std_devs = pd.Series(all_std_devs)
    all_benchmark_returns = pd.Series(all_benchmark_returns)
    all_abnormal_returns = pd.Series(all_abnormal_returns)
    abnormal_volatility = pd.Series(abnormal_volatility)
    events = np.average(pd.Series(total_sample_size))
    if origin_zero:
        # Shift every curve so that day 0 sits at the origin.
        all_returns = all_returns - all_returns.loc[0]
        all_benchmark_returns = all_benchmark_returns - all_benchmark_returns.loc[0]
        all_abnormal_returns = all_abnormal_returns - all_abnormal_returns.loc[0]
        all_std_devs = all_std_devs - all_std_devs.loc[0]
        abnormal_volatility = abnormal_volatility - abnormal_volatility.loc[0]
        # Zero the error bars in the pre-event window (labels <= -1).
        all_std_devs.loc[:-1] = 0
        abnormal_volatility.loc[:-1] = 0
    x_ticks = build_x_ticks(day_numbers)
    plot_cumulative_returns(
        returns=all_returns,
        x_ticks=x_ticks,
        events=events
    )
    plot_average_returns(
        returns=all_returns,
        benchmark_returns=all_benchmark_returns,
        x_ticks=x_ticks
    )
    plot_cumulative_abnormal_returns(
        returns=all_returns,
        abnormal_returns=all_abnormal_returns,
        x_ticks=x_ticks
    )
    plot_cumulative_return_with_errors(
        returns=all_returns,
        std_devs=all_std_devs,
        events=events
    )
    plot_abnormal_cumulative_return_with_errors(
        abnormal_volatility=abnormal_volatility,
        abnormal_returns=all_abnormal_returns,
        events=events)


def calc_beta_testing(stock, benchmark, price_history):
    """
    Calculate beta and alpha for a security by regressing log returns of
    ``stock`` on log returns of ``benchmark``. Returns ``(slope, intercept)``,
    or None when fewer than two aligned observations are available.
    """
    stock_prices = np.log(1 + price_history[stock].pct_change().dropna())
    bench_prices = np.log(1 + price_history[benchmark].pct_change().dropna())
    aligned_prices = bench_prices.align(stock_prices, join='inner')
    bench_prices = np.asarray(aligned_prices[0].values).reshape(-1)
    stock_prices = np.asarray(aligned_prices[1].values).reshape(-1)
    # linregress requires at least two points.
    if len(stock_prices) < 2:
        return None
    slope, intercept, r_value, p_value, stderr = stats.linregress(bench_prices, stock_prices)
    return slope, intercept


def compute_beta_alpha(data, ev_data, starting_point, benchmark):
    """
    Include beta and alpha in the event DataFrame, one row per event.

    NOTE(fix): the original did not advance the row counter when skipping
    an event, writing betas into the wrong rows afterwards, and crashed
    unpacking a None beta result; both are handled here.
    """
    betas_df = ev_data.copy(deep=True)
    betas_df['beta'] = 0
    betas_df['alpha'] = 0
    for row_id, (date, row) in enumerate(betas_df.iterrows()):
        sid = row.symbol
        if date not in data.index or sid not in data.columns:
            continue
        # Per-event benchmark substitution (do not mutate the argument).
        bench = benchmark
        if sid == 'ethereum' and benchmark == 'ethereum':
            bench = 'bitcoin'
        elif sid == 'bitcoin' and benchmark == 'bitcoin':
            bench = 'ethereum'
        fit = calc_beta_testing(sid, bench,
                                get_price_history(data, date, starting_point, sid, bench))
        if fit is None:
            continue
        beta, alpha = fit
        betas_df.iloc[row_id, -2] = beta
        betas_df.iloc[row_id, -1] = alpha
    return betas_df.reset_index()


def calculate_ab_returns(returns_df, betas_df, intercept, benchmark):
    """
    Calculate abnormal returns for every project: the project's return minus
    its beta-scaled benchmark return (plus alpha when ``intercept`` is True).
    """
    ab_returns = pd.DataFrame()
    for number, dta in betas_df.iterrows():
        sid = dta.symbol
        alpha = dta.alpha
        beta = dta.beta
        if not intercept:
            ab_returns[number] = returns_df[sid] - beta * returns_df[benchmark]
        else:
            ab_returns[number] = returns_df[sid] - (alpha + beta * returns_df[benchmark])
    return ab_returns.dropna()


def ab_returns_matrix(ab_returns, betas_df, starting_point):
    """
    Map the abnormal returns for every event: for each event, slice the
    +/- ``starting_point`` window of abnormal returns around its date.
    """
    abnormal_returns_df = ab_returns.reset_index()
    new_sample = {}
    for number, dta in betas_df.iterrows():
        eventdate = dta.datetime
        sid = number
        # Find the specific event row: where 'datetime' equals the event date.
        row = abnormal_returns_df.loc[abnormal_returns_df['datetime'] == eventdate]
        index = row.index[0]
        # Select the window of starting_point rows on each side of the event.
        my_sample = abnormal_returns_df.loc[
            (index - starting_point):(index + starting_point), sid
        ].reset_index(drop=True)
        new_sample[number] = my_sample
    return new_sample


def calculate_stats(new_sample, starting_point):
    """
    Calculate t statistics and p-values for AARs (average abnormal returns)
    and CAARs (cumulative average abnormal returns) over the event window.
    """
    ev = pd.DataFrame(new_sample)
    # Re-center the index so day 0 is the event day.
    ev.index = ev.index - starting_point
    # Calculate CARs (cumulative abnormal returns per event).
    ev_cumulative = ev.cumsum()
    # t statistics for AARs.
    mean_AR = ev.mean(axis=1)
    std_AR = ev.std(axis=1)
    results = pd.DataFrame(mean_AR, columns=['AAR'])
    results['t-AAR'] = mean_AR / std_AR
    results['P-Value t-AAR'] = stats.norm.cdf(results['t-AAR'])
    # t statistics for CAARs.
    mean_CAR = ev_cumulative.mean(axis=1)
    std_CAR = ev_cumulative.std(axis=1)
    results['CAAR'] = mean_CAR
    results['t-CAAR'] = mean_CAR / std_CAR
    results['P-Value t-CAAR'] = stats.norm.cdf(results['t-CAAR'])
    return results


def plot_ARR_CAAR(results):
    """Plot AAR (left axis) and CAAR (right axis) over the event window."""
    fig, ax = pyplot.subplots(figsize=FIGURE_SIZE)
    ax.set_title('ARS vs CARS', fontsize=20)
    ax.plot(results.index, results['AAR'], color="red", marker="o")
    ax.set_xlabel("Days", fontsize=14)
    ax.set_ylabel("AAR", color="red", fontsize=14)
    ax2 = ax.twinx()
    ax2.plot(results.index, results['CAAR'], color="blue", marker="o")
    ax2.set_ylabel("CAAR", color="blue", fontsize=14)
    pyplot.show()


def plot_CI(tstats, pvalues, CI):
    """Plot a t-statistic series with its significance lines, and the
    corresponding p-values, for a two-sided confidence level ``CI``."""
    # Critical value for a two-sided test at confidence level CI.
    c = stats.norm().isf((1 - CI) / 2)
    fig, ax = pyplot.subplots(nrows=2, figsize=FIGURE_SIZE)
    ax[0].set_title(tstats.name + ' Statistic', fontsize=20)
    ax[1].set_title('P-Values ', fontsize=20)
    tstats.plot(ax=ax[0], label=tstats.name)
    ax[0].axhline(y=c, linestyle='--', color='red', alpha=.9, label='Significance Line')
    ax[0].axhline(y=-c, linestyle='--', color='red', alpha=.9)
    ax[0].legend()
    ax[1].bar(pvalues.index, pvalues, label=pvalues.name)
    ax[1].axhline(y=(1 - CI) / 2, linestyle='--', color='red', alpha=.9,
                  label='Significance Line')
    ax[1].legend()


def get_log_returns(data):
    """Return the log returns of ``data``, with missing/zero entries as 0."""
    # Arithmetic returns, shifted so log() applies to price ratios.
    arithmetic_returns = 1 + data.pct_change()
    # Operate on the underlying ndarray: newer pandas rejects a plain
    # ndarray `out=` buffer for a ufunc applied to a DataFrame.
    values = arithmetic_returns.to_numpy(dtype=float)
    returns_array = np.log(values, out=np.zeros_like(values), where=(values != 0))
    return pd.DataFrame(returns_array, index=data.index, columns=data.columns).fillna(0)


def hypothesis_test(data, ev_data, starting_point, benchmark='ethereum',
                    intercept=True, CI=.95):
    """Run the abnormal-return hypothesis test end to end and plot results."""
    # Drop events with no pricing data.
    cleaned_events = clean_data(data, ev_data, starting_point)
    # Calculate betas (and alphas) for the events.
    betas_df = compute_beta_alpha(data, cleaned_events, starting_point, benchmark)
    # Get log returns.
    returns_df = get_log_returns(data)
    # Calculate abnormal returns.
    ab_returns = calculate_ab_returns(returns_df, betas_df, intercept, benchmark)
    # Map the abnormal returns for every event.
    new_sample = ab_returns_matrix(ab_returns, betas_df, starting_point)
    # Calculate statistics.
    results = calculate_stats(new_sample, starting_point)
    display(results)
    # Plotting functions.
    plot_ARR_CAAR(results)
    plot_CI(results['t-AAR'], results['P-Value t-AAR'], CI)
    plot_CI(results['t-CAAR'], results['P-Value t-CAAR'], CI)