Open HYU-PaulKim opened 1 month ago
Step 1:
# Step 1: build the min-max-scaled feature table `fundamental` from the
# on-chain metrics and the holder-distribution data, then list the feature
# pairs whose absolute correlation exceeds 0.8.

BTC_chain = pd.read_csv("BTC_onchain.csv")
print(BTC_chain[BTC_chain['datetime'] == '2017-10-31'].index)
# NOTE(review): 3223 is presumably the row position printed above for
# 2017-10-31 -- confirm whenever the CSV is regenerated.
# reset_index(drop=True) returns a new frame, so the slice of BTC_chain is
# never mutated in place (which can raise SettingWithCopyWarning).
onchain_trim = BTC_chain.iloc[3223:].reset_index(drop=True)
onchain_trim = onchain_trim.drop(
    ['datetime', 'market_cap_usd', 'market_price_usd', 'exchange_volume_usd'],
    axis=1,
)
minmax_scaler = MinMaxScaler()
onchain_minmax = minmax_scaler.fit_transform(onchain_trim)
# fit_transform returns a bare array, so the column names are supplied by
# hand; they must be listed in the same order as the remaining CSV columns.
onchain_Train = pd.DataFrame(
    onchain_minmax,
    columns=[
        'Mempool', 'transaction_rate', 'avg_size', 'avg_confirm_time',
        'hash_rate', 'difficulty', 'revenue', 'total_fee',
    ],
)

BTC_hold = pd.read_csv("BTC_HOLD.csv")
print(BTC_hold[BTC_hold['datetime'] == '2017-10-31'].index)
# NOTE(review): 2632 is presumably the row position printed above for
# 2017-10-31 in the holder data -- confirm.
BTC_hold_trim = BTC_hold.iloc[2632:]
BTC_hold_trim = BTC_hold_trim.drop(
    ['datetime', 'market_price_usd', 'addresses_with_0.01_btc_y'],
    axis=1,
)
# The same scaler object is refit here; after this line it no longer holds
# the on-chain scaling parameters.
hold_minmax = minmax_scaler.fit_transform(BTC_hold_trim)
hold_Train = pd.DataFrame(
    hold_minmax,
    columns=['1000>', "100<=<1000", "10<=<100", "1<=<10", "0.01<=<1"],
)

# Both frames carry a fresh 0..n-1 index, so they are joined row by row.
fundamental = pd.concat([hold_Train, onchain_Train], axis=1)

corrs = fundamental.corr()
unstaked_corrs = corrs.unstack()
sorted_corrs = unstaked_corrs
# Keep |corr| > 0.8 and discard exact 1.0 entries (the self-correlations on
# the diagonal).
sorted_corrs = sorted_corrs[(sorted_corrs != 1) & ((sorted_corrs > 0.8) | (sorted_corrs < -0.8))]
sorted_corrs = sorted_corrs.sort_values(key=abs, ascending=False)
sorted_corrs
Result:
Decision:
# Remove four of the features involved in highly correlated pairs, give the
# two remaining holder buckets more readable names, and plot the correlation
# matrix of what is left.
redundant_features = ['0.01<=<1', '1<=<10', 'difficulty', '100<=<1000']
fundamental = fundamental.drop(columns=redundant_features)
fundamental = fundamental.rename(
    columns={"1000>": "over_1000", "10<=<100": "between 10 and 100"}
)
sns.heatmap(fundamental.corr(), annot=True)
==>
Eliminated a few features to reduce high correlation between variables.
Step 2:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
def remove_outlier(df, columns, weight=3):
    """Drop the rows of *df* whose value in *columns* is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - weight * IQR`` or above
    ``Q3 + weight * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    *columns* and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter. It is not modified.
        columns: Series of the values to test. Its index labels are used to
            select the rows dropped from *df*, so they must exist in *df*.
        weight: Multiplier applied to the IQR to set the fence distance.

    Returns:
        A tuple ``(new_df, outlier_index)``: ``new_df`` is a copy of *df*
        without the outlier rows and with a fresh 0..n-1 index;
        ``outlier_index`` holds the original labels of the removed rows.
    """
    q1, q3 = np.percentile(columns.values, [25, 75])
    iqr_weight = (q3 - q1) * weight
    lowest_val = q1 - iqr_weight
    highest_val = q3 + iqr_weight

    is_outlier = (columns < lowest_val) | (columns > highest_val)
    outlier_index = columns[is_outlier].index

    # drop() returns a new frame, so the caller's df is left untouched.
    new_df = df.drop(outlier_index).reset_index(drop=True)
    return new_df, outlier_index
# Build the regression target: the daily 'Change %' of BTC, with IQR
# outliers removed and scaled by its maximum absolute value.
BTC_dataframe = pd.read_csv("BTC.csv")
# reset_index() returns a new frame (no in-place mutation of the slice) and
# keeps the previous row labels in an 'index' column.
# NOTE(review): rows 259:2392 are presumably chosen to match the date range
# of the feature table -- confirm.
BTC_dataframe = BTC_dataframe.iloc[259:2392].reset_index()
# 'Change %' is read as text such as '1.23%'; strip the sign and convert.
BTC_dataframe['Change %'] = (
    BTC_dataframe['Change %'].str.replace('%', '', regex=False).astype(np.float64)
)
BTC_dataframe, out_index = remove_outlier(BTC_dataframe, BTC_dataframe['Change %'])
BTC_target = BTC_dataframe['Change %']

maxabs_scaler = MaxAbsScaler()
btc_maxabs = maxabs_scaler.fit_transform(np.array(BTC_target).reshape(-1, 1))
# Flatten the (n, 1) scaler output into the 1-D vector the regressors expect.
target = np.ravel(btc_maxabs)

# out_index holds 0-based row labels of the price frame; re-indexing the
# features the same way lets those labels remove the matching feature rows.
fundamental = fundamental.reset_index(drop=True)
fundamental = fundamental.drop(out_index)
# Candidate regressors. All use library defaults except the number of trees
# for the two ensemble models.
models = [
    ('KNN', KNeighborsRegressor()),
    ('XGBoost', XGBRegressor(n_estimators=1000)),
    ('Random Forest', RandomForestRegressor(n_estimators=1000)),
    ('SVR', SVR()),
    ('ElasticNet', ElasticNet()),
    ('MLPRegressor', MLPRegressor()),
]

train_fund, test_fund, train_target, test_target = train_test_split(
    fundamental, target, test_size=0.2
)

# --- 5-fold cross-validated MSE on the full data set -----------------------
# Stored separately so the hold-out results below do not overwrite them.
cv_performances = {}
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
for name, model in models:
    print(name)
    # scikit-learn reports negated MSE for this scorer; flip the sign back.
    cv_scores = -cross_val_score(
        model, fundamental, target, cv=kf, scoring='neg_mean_squared_error'
    )
    print(f'Average MSE across {n_folds} folds: {np.mean(cv_scores)}')
    cv_performances[name] = np.mean(cv_scores)

# --- Hold-out MSE: fit on the train split, score on the test split ---------
# This loop also leaves every estimator in `models` fitted, which the
# feature inspection at the end relies on.
model_performances = {}
for name, model in models:
    print(name)
    model.fit(train_fund, np.ravel(train_target))
    predictions = model.predict(test_fund)
    model_performances[name] = mean_squared_error(test_target, predictions)

for name, mse in model_performances.items():
    print(f'{name}: Mean Squared Error = {mse}')

# --- Control groups: always predict zero / always predict the mean ---------
# Sized from the actual test split instead of a hard-coded row count, so the
# baselines stay valid when the amount of data changes.
n_test = len(test_target)
testment_zeros = [0] * n_test
testment_averages = [target.mean()] * n_test
print("MSE of test groups:")
print(mean_squared_error(testment_zeros, test_target))
print(mean_squared_error(testment_averages, test_target))

# Random Forest and ElasticNet were slightly better than the others, so
# inspect what they learned.
for name, model in models:
    if name == 'Random Forest':
        print(name)
        feature_importance = pd.DataFrame(model.feature_importances_)
        feature_importance.index = fundamental.columns
        print(feature_importance)
    elif name == 'ElasticNet':
        feature_importance = model.coef_
        print(feature_importance)
        print(f'{name}: intercept_ = {model.intercept_}')
        print(f'when mean is {target.mean()}')
Result:
One of the significant attractions of Bitcoin is that its transaction data is transparent and openly available to everyone. Because it is possible to know how much Bitcoin each address holds, the question arises whether this information can be used for price prediction.
Additionally, I am interested in investigating whether factors such as hash rate, which represents mining difficulty, also influence the price. Therefore, I tried to build machine learning models that use only on-chain data to explain BTC price data.
Preprocess data to deal with pairs of features with excessively high correlation
Train several machine learning models and compare them with the control groups.
Control group 1) A baseline that predicts the daily change is equal to the 'average' daily change. Control group 2) A baseline that predicts the daily change is zero.