HYU-PaulKim / ML_Summary

A brief summary of what I've learned so far at university.
0 stars 0 forks source link

3. Correlation between BTC and other assets #4

Open HYU-PaulKim opened 1 month ago

HYU-PaulKim commented 1 month ago

I calculated the correlation between the BTC price and key financial data such as US Treasury yields, Chinese Treasury yields, stock indices, and the dollar index. I used the IQR method to eliminate outliers.

Correlations calculated: Pearson, Spearman, and time-lagged cross-correlation

def remove_outlier(df, columns, weight = 2.5): #remove outliers by IQR method
    """Drop the rows of ``df`` whose value in ``columns`` is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - weight * IQR`` or above
    ``Q3 + weight * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    ``columns`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter. It is not modified; a copy is filtered.
        columns: Series holding the values to test. Its index labels are the
            labels dropped from the copy of ``df``, so it is expected to share
            ``df``'s index (typically ``df['some column']``).
        weight: Multiplier applied to the IQR to set the fence distance.

    Returns:
        A ``(new_df, ratio)`` tuple: the filtered copy with a fresh 0..n-1
        index, and the fraction of entries in ``columns`` that were flagged
        as outliers (``0.0`` for empty input).
    """
    new_df = df.copy()
    total = len(columns)
    if total == 0:
        # Nothing to measure; also avoids dividing by zero for the ratio.
        new_df.reset_index(drop=True, inplace=True)
        return new_df, 0.0

    # nanpercentile skips missing values. Plain np.percentile returns NaN as
    # soon as one NaN is present, which makes every comparison below False
    # and silently disables the filtering.
    q1, q3 = np.nanpercentile(columns.values, [25, 75])
    fence = (q3 - q1) * weight
    lowest_val = q1 - fence
    highest_val = q3 + fence

    # NaN entries compare False on both sides, so they are never flagged.
    outlier_index = columns[(columns < lowest_val) | (columns > highest_val)].index

    ratio = len(outlier_index) / total  # ratio of outliers that will be eliminated
    new_df.drop(outlier_index, inplace=True)
    new_df.reset_index(drop=True, inplace=True)
    return new_df, ratio
# Tickers of the traditional financial assets compared against BTC
# (dollar index, treasuries, gold, equities and equity indices).
# Each ticker is read from "<ticker>.csv" in the working directory.
finance=['DXY','US_3M','US_2Y','US_10Y','XAU','AMC','SOX','NASDAQ','China_2Y','China_10Y','SP500','ARKK','KOSPI','HSI','MSCIEF']
finance_files=[ticker+".csv" for ticker in finance]
outlier_ratio=[None]*len(finance)

BTC_dataframe=pd.read_csv("BTC.csv")
BTC_dataframe=BTC_dataframe.iloc[:2392] #eliminating data before the decision of the CME Group
# 'Change %' is read as text such as "1.23%"; strip the percent sign (and any
# thousands separator, as is done for the other assets) before converting.
BTC_dataframe['Change %']=(
    BTC_dataframe['Change %']
    .str.replace('%','')
    .str.replace(',','')
    .astype(np.float64)
)

# Load every asset, convert its 'Change %' column to float and remove IQR
# outliers, remembering what share of rows was flagged for each asset.
Finance_dataframes=[]
for i,(ticker,csv_name) in enumerate(zip(finance,finance_files)):
    print(ticker)
    new_cs=pd.read_csv(csv_name)
    new_cs['Change %']=(
        new_cs['Change %']
        .str.replace('%','')
        .str.replace(',','')
        .astype(np.float64) #modify change % column so we can calculate correlation
    )
    new_cs,outlier_ratio[i]=remove_outlier(new_cs,new_cs['Change %'])
    Finance_dataframes.append(new_cs)

corr_diff_target=[]
result_cleansed=[]
for i,ticker in enumerate(finance):
    #Merge the two dataframes based on dates.
    merged=pd.merge(BTC_dataframe,Finance_dataframes[i],how='inner',on='Date')
    corr_diff_target.append(merged)
    # After the merge, '_x' is the BTC column and '_y' the asset column.
    # Both were already converted to float64 above, so no re-parsing is needed.
    btc_change=merged['Change %_x']
    asset_change=merged['Change %_y']

    #Pearson correlation coefficient
    corr_val=btc_change.corr(asset_change)
    #Spearman's rank correlation coefficient
    spearman_corr,p_val=spearmanr(btc_change,asset_change)
    #Time lagged cross correlation
    data=sm.tsa.stattools.ccf(btc_change,asset_change,adjusted=False)
    result_cleansed.append([ticker,corr_val,spearman_corr,outlier_ratio[i]])

    max_corr=max(abs(corr_val),abs(spearman_corr))
    if any(abs(data) > max_corr):
        print(f"Higher correlation found for {ticker} with time lags")
        # ccf's element at index k is the correlation at lag k (index 0 is
        # lag 0), so the index itself is the lag. Compare magnitudes so that
        # strong negative lagged correlations are reported as well.
        for lag,val in enumerate(data):
            if abs(val) > max_corr and abs(val) > 0.3: #Significant
                print(f"Lag {lag}: Correlation = {val}")

result_cleansed=pd.DataFrame(
    result_cleansed,
    columns=['Ticker','Pearson correlation','Spearman correlation','Outlier_ratio'],
)
result_cleansed=result_cleansed.sort_values(by='Pearson correlation',key=abs,ascending=False) #Show highest correlation
result_cleansed
HYU-PaulKim commented 1 month ago

Result:

Image

Image