I have written a CPU version of a massive (batched) regression.
from numba import jit
import numpy as np
@jit(nopython=True)
def resid_lstsq(x, y):
    """Return the regression residuals y - x @ coef.

    Coefficients are obtained from np.linalg.lstsq; only the solution
    (first element of the returned tuple) is used.
    """
    solution = np.linalg.lstsq(x, y)
    fitted = np.dot(x, solution[0])
    return y - fitted
# NOTE: closed-form equivalent of lstsq above: coef = inv(x.T @ x) @ x.T @ y
@jit(nopython=True)
def resid_matrixReverse(x, y):
    """Return regression residuals using the normal-equation closed form.

    Computes coef = inv(x.T @ x) @ x.T @ y via an explicit matrix inverse,
    then returns y - x @ coef.
    """
    gram_inv = np.linalg.inv(np.dot(x.T, x))
    coef = np.dot(np.dot(gram_inv, x.T), y)
    fitted = np.dot(x, coef)
    return y - fitted
@jit(nopython=True)
def resid_matrixSolve(x, y):
    """Return regression residuals by solving the normal equations.

    Solves (x.T @ x) coef = x.T @ y directly — numerically preferable to
    forming the inverse explicitly — then returns y - x @ coef.
    """
    gram = np.dot(x.T, x)
    moment = np.dot(x.T, y)
    coef = np.linalg.solve(gram, moment)
    fitted = np.dot(x, coef)
    return y - fitted
def residCalculatorVector(x, y, method='matrixReverse'):
    '''
    Get the residuals of a cross-sectional regression of y on x for one period.

    x: 2-D array of independent variables (observations x regressors)
    y: 1-D array of the dependent variable
    method: one of 'lstsq', 'matrixReverse', 'matrixSolve'

    Returns the residual vector y - x @ coef.
    Raises NotImplementedError for 'sklearn' and ValueError for any unknown
    method. (Previously both cases silently returned None, which would make
    callers write None into the result tensor without warning.)
    '''
    if method == 'lstsq':
        return resid_lstsq(x, y)
    elif method == 'matrixReverse':
        return resid_matrixReverse(x, y)
    elif method == 'matrixSolve':
        return resid_matrixSolve(x, y)
    elif method == 'sklearn':
        # Was a bare `pass` (silent None); fail loudly until implemented.
        raise NotImplementedError("the 'sklearn' backend is not implemented")
    raise ValueError("unknown regression method: %r" % (method,))
def residCalculatorTensor(newFactorTesor, oldFactorTensor, method='matrixReverse'):
    '''
    Calculate the residuals of each new factor regressed on the old factors,
    cross-sectionally (across stocks) for every trade date.

    newFactorTesor: (i, j, k) array (i: stock ID, j: trade date, k: new factors + target)
    oldFactorTensor: (i, j, k) array (i: stock ID, j: trade date, k: old factors)
    method: regression backend, passed through to residCalculatorVector

    Returns an array shaped like newFactorTesor containing the residuals.
    Entries that were NaN in the input — and whole date/factor slices with too
    few valid observations to identify the coefficients — remain NaN.
    '''
    stockNum, trDateNum, newFacNum = newFactorTesor.shape
    oldFacNum = oldFactorTensor.shape[2]
    # Pre-fill with NaN so masked-out rows stay NaN in the output.
    residTensor = np.full(fill_value=np.nan, shape=(stockNum, trDateNum, newFacNum))
    for factor_i in range(newFacNum):
        for trDate_j in range(trDateNum):
            y_ij = newFactorTesor[:, trDate_j, factor_i]  # factor_i on date_j, all stocks
            x_j = oldFactorTensor[:, trDate_j, :]         # old factors on date_j, all stocks
            idx = np.where(~np.isnan(y_ij))[0]            # indices of NON-NaN observations
            # Fewer observations than regressors leaves the normal equations
            # singular (LinAlgError); skip the slice and leave it NaN instead.
            if idx.size < oldFacNum:
                continue
            resid = residCalculatorVector(x_j[idx], y_ij[idx], method=method)
            residTensor[idx, trDate_j, factor_i] = resid
    return residTensor
# Synthetic benchmark inputs: 3000 stocks x 500 trade dates.
newF_01 = np.random.randn(3000,500,50)  # 50 columns: new factors + target (per the docstring above)
oldFTen_01 = np.random.randn(3000,500,100)  # 100 old factors used as regressors
%time ans = residCalculatorTensor(newFactorTesor=newF_01,oldFactorTensor=oldFTen_01,method='matrixSolve')
CPU times: user 21min 18s, sys: 51.2 s, total: 22min 9s
Wall time: 1min 23s
I think the CPU version is too slow for such a massive regression,
and I found that your code contains functions that can compute batched regressions on the GPU.
Could you teach me how to solve this kind of problem efficiently?
Thank you so much.
I have written a CPU version of a massive regression.
I think the CPU version is too slow for such a massive regression, and I found that your code contains functions that can compute batched regressions on the GPU. Could you teach me how to solve this kind of problem efficiently? Thank you so much.