Closed chen849157649 closed 3 years ago
Hello,
You may try this code to preprocess this data.
import random, math, os random.seed(0)
def random_split(inputs, output1, valid, train_ratio=0.9):
    """Randomly split a text file with a header row into two files.

    The first line of ``inputs`` is skipped. Every other line is copied
    unchanged to ``output1`` when a draw from ``random.uniform(0, 1)`` is
    below ``train_ratio`` and to ``valid`` otherwise. The draw uses the
    module-level ``random`` generator, so seeding it before the call makes
    the split reproducible.

    Args:
        inputs: Path of the file to split.
        output1: Path of the file that receives the training portion.
        valid: Path of the file that receives the validation portion.
        train_ratio: Probability that a line is written to ``output1``.
    """
    # ``with`` closes all three handles even if reading or writing fails.
    with open(inputs) as fin, \
            open(output1, 'w') as fout_train, \
            open(valid, 'w') as fout_valid:
        for line_no, line in enumerate(fin):
            if line_no == 0:
                # Header row: written to neither output.
                continue
            if random.uniform(0, 1) < train_ratio:
                fout_train.write(line)
            else:
                fout_valid.write(line)
def cnt_freq_train(inputs, num_columns=24, label_column=1):
    """Count how often each raw value occurs in every non-label column.

    Each line of ``inputs`` is split on commas. No line is skipped, so a
    header row, if present, is counted like any other line.

    Args:
        inputs: Path of the comma-separated file to scan.
        num_columns: Number of leading columns to inspect on every line.
        label_column: Index of the column that is not counted.

    Returns:
        A list of ``num_columns`` dicts. Entry ``i`` maps every raw string
        seen in column ``i`` to its number of occurrences; the entry for
        ``label_column`` stays empty.
    """
    count_freq = [{} for _ in range(num_columns)]
    feature_columns = [i for i in range(num_columns) if i != label_column]
    with open(inputs) as fin:
        for idx, raw in enumerate(fin):
            if idx % 1000000 == 0 and idx > 0:
                # Progress indicator; the call form works on Python 2 and 3.
                print(idx)
            fields = raw.replace('\n', '').split(',')
            for i in feature_columns:
                counts = count_freq[i]
                counts[fields[i]] = counts.get(fields[i], 0) + 1
    return count_freq
def generate_feature_map_and_train_csv(inputs, train_csv, file_feature_map,
                                       freq_dict, threshold=4,
                                       num_columns=24, label_column=1):
    """Encode the training file as integer indices and save the mapping.

    Every line of ``inputs`` is split on commas and rewritten to
    ``train_csv`` as the label followed by one index per feature column, in
    column order. Within each column, a value whose count in ``freq_dict``
    is below ``threshold`` is written as ``'0'``; any other value gets the
    next unused index for that column, starting at ``'1'``, the first time
    it is seen. A value missing from ``freq_dict`` is treated as having
    count 0.

    The mapping is then written to ``file_feature_map``, one line per
    mapped value in the form ``field_id,raw_value,index``. ``field_id`` is
    the 1-based position of the column among the feature columns, i.e. its
    position in the output row after the label.

    Args:
        inputs: Path of the comma-separated training file (no header).
        train_csv: Path of the encoded training file to write.
        file_feature_map: Path of the feature-map file to write.
        freq_dict: Per-column value counts, indexed like the input columns.
        threshold: Minimum count for a value to receive its own index.
        num_columns: Number of leading columns to read on every line.
        label_column: Index of the label column.

    Returns:
        A list of ``num_columns`` dicts; entry ``i`` maps raw values of
        column ``i`` to their index as a string. The entry for
        ``label_column`` stays empty.
    """
    feature_map = [{} for _ in range(num_columns)]
    feature_columns = [i for i in range(num_columns) if i != label_column]

    # ``with`` guarantees the encoded file is flushed and closed before the
    # function returns.
    with open(inputs) as fin, open(train_csv, 'w') as fout:
        for idx, raw in enumerate(fin):
            if idx % 1000000 == 0 and idx > 0:
                # Progress indicator; the call form works on Python 2 and 3.
                print(idx)
            fields = raw.replace('\n', '').split(',')
            output_line = [fields[label_column]]
            for i in feature_columns:
                value = fields[i]
                if freq_dict[i].get(value, 0) < threshold:
                    # Rare values share index 0 and are not stored in the map.
                    output_line.append('0')
                    continue
                mapping = feature_map[i]
                if value not in mapping:
                    mapping[value] = str(len(mapping) + 1)
                output_line.append(mapping[value])
            fout.write(','.join(output_line) + '\n')

    # Write the feature_map file.
    with open(file_feature_map, 'w') as f_map:
        for i in feature_columns:
            # Columns before the label shift up by one so that field ids are
            # contiguous and start at 1.
            field_id = i + 1 if i < label_column else i
            for feature, index in feature_map[i].items():
                f_map.write(str(field_id) + ',' + feature + ',' + index + '\n')
    return feature_map
def generate_valid_csv(inputs, valid_csv, feature_map,
                       num_columns=24, label_column=1):
    """Encode a validation file with an existing feature map.

    Every line of ``inputs`` is split on commas and rewritten to
    ``valid_csv`` as the label followed by one index per feature column, in
    column order. A value found in ``feature_map`` for its column is
    replaced by the stored index; any other value is written as ``'0'``.

    Args:
        inputs: Path of the comma-separated validation file (no header).
        valid_csv: Path of the encoded validation file to write.
        feature_map: Per-column dicts mapping raw values to index strings,
            indexed like the input columns.
        num_columns: Number of leading columns to read on every line.
        label_column: Index of the label column.
    """
    feature_columns = [i for i in range(num_columns) if i != label_column]
    # ``with`` guarantees the output is flushed and closed on return.
    with open(inputs) as fin, open(valid_csv, 'w') as fout:
        for raw in fin:
            fields = raw.replace('\n', '').split(',')
            output_line = [fields[label_column]]
            output_line.extend(
                feature_map[i].get(fields[i], '0') for i in feature_columns
            )
            fout.write(','.join(output_line) + '\n')
# Preprocessing pipeline: split train.txt, build the feature map from the
# training part, encode both parts, then remove the intermediate files.
print('Split the orignal dataset into train and valid dataset.')
random_split('train.txt', 'train1.txt', 'valid.txt')

# Value frequencies are counted over train.txt, i.e. the file before the
# split, not over train1.txt.
freq_dict = cnt_freq_train('train.txt')

print('Generate the feature map and impute the training dataset.')
feature_map = generate_feature_map_and_train_csv(
    'train1.txt', 'train.csv', 'avazu_feature_map', freq_dict, threshold=8)

print('Impute the valid dataset.')
generate_valid_csv('valid.txt', 'valid.csv', feature_map)

print('Delete unnecessary files')
# os.remove works on every platform, unlike shelling out to ``rm``. The
# existence check keeps the cleanup best-effort, as ``rm`` was.
for tmp_path in ('train1.txt', 'valid.txt'):
    if os.path.exists(tmp_path):
        os.remove(tmp_path)
def get_feature_size(fname, num_columns=24):
    """Report per-column distinct counts and numeric ranges of a CSV file.

    Every line of ``fname`` is stripped and split on commas. For each of the
    first ``num_columns`` columns the function tracks the number of distinct
    raw strings and the minimum and maximum of the values parsed as floats.
    A value that cannot be parsed as a float causes the split line to be
    printed and is left out of the minimum/maximum.

    The minimum and maximum both start at 1, so a column whose values are
    all greater than 1 reports a minimum of 1, and a column whose values are
    all less than 1 reports a maximum of 1.

    Args:
        fname: Path of the comma-separated file to scan.
        num_columns: Number of leading columns to inspect on every line.

    Returns:
        A tuple ``(cnts, mins, maxs)`` of three lists of length
        ``num_columns``. The same three lists are also printed.
    """
    cnts = [0] * num_columns
    mins = [1] * num_columns
    maxs = [1] * num_columns
    seen = [set() for _ in range(num_columns)]
    with open(fname) as fin:
        for raw in fin:
            fields = raw.strip().split(',')
            for i in range(num_columns):
                value = fields[i]
                if value not in seen[i]:
                    cnts[i] += 1
                    seen[i].add(value)
                try:
                    number = float(value)
                except ValueError:
                    # Non-numeric cell: show the offending row and move on.
                    print(fields)
                    continue
                mins[i] = min(mins[i], number)
                maxs[i] = max(maxs[i], number)
    print(cnts)
    print(mins)
    print(maxs)
    return cnts, mins, maxs
Hope this code is helpful for you. Thanks for your interest.
If you still have questions on the data preprocessing code, please feel free to let me know.
Hi, thank you for sharing the code! I have a question: how should multi-valued categorical features, such as user interest tags, be handled? This source code cannot be used for them as it is.