Open MARD1NO opened 2 years ago
fp32: ================ Test Evaluation ================ Rank[0], Epoch 7, Step 75000, AUC 0.802074, LogLoss 0.125882, Eval_time 19.48 s, Metrics_time 6.48 s, Eval_samples 89140000, GPU_Memory 15074 MiB, Host_Memory 10726 MiB, 2022-07-27 11:03:38
对应HugeCTR脚本:
# HugeCTR training script: 39 categorical slots -> embedding -> MultiCross
# branch + 5-layer MLP branch -> concat -> binary cross-entropy loss.
import hugectr
# MPI is not referenced by name below; kept as in the original
# (presumably needed for MPI initialisation -- TODO confirm).
from mpi4py import MPI

data_dir = "/RAID0/liujuncheng/criteo1t_parquet_40M_long"
print(f"{data_dir}/train/_file_list.txt")

# ---------------------------------------------------------------------------
# Shared constants (each value appears verbatim in the original script).
# ---------------------------------------------------------------------------
BATCH_SIZE = 55296  # 55296 or 69120
EVAL_BATCH_SIZE = 55296  # real value
EMBEDDING_VEC_SIZE = 16
MLP_DEPTH = 5
MLP_WIDTH = 1000

# Per-slot sizes (real values); used by both the reader and the embedding.
SLOT_SIZES = [
    62774, 8001, 2901, 74279, 7513, 3369, 1392, 21627, 7919, 21,
    276, 1231236, 9643, 39873199, 38853, 17240, 7421, 20263, 3, 7103,
    1540, 63, 38457188, 2929249, 400771, 10, 2209, 11910, 152, 4,
    976, 14, 39976779, 25414584, 39639858, 583095, 12929, 108, 36,
]
NUM_SLOTS = len(SLOT_SIZES)  # 39

# ---------------------------------------------------------------------------
# Solver / reader / optimizer
# ---------------------------------------------------------------------------
solver = hugectr.CreateSolver(
    batchsize_eval=EVAL_BATCH_SIZE,
    batchsize=BATCH_SIZE,
    lr=0.0025,  # matched to the run being compared against
    warmup_steps=2750,
    decay_start=40000,
    decay_steps=40000,
    decay_power=2.0,
    end_lr=1e-6,
    enable_tf32_compute=True,
    # use_mixed_precision=True,
    # scaler=1024,
    # NOTE(review): the original note here said "8 gpus", but only four
    # device ids are listed.
    vvgpu=[[0, 1, 2, 3]],
    repeat_dataset=True,
    use_algorithm_search=False,
    i64_input_key=True,  # original note: use False for int32 keys
)

reader = hugectr.DataReaderParams(
    data_reader_type=hugectr.DataReaderType_t.Parquet,
    source=[f"{data_dir}/train/_file_list.txt"],
    eval_source=f"{data_dir}/test/_file_list.txt",
    slot_size_array=SLOT_SIZES,
    check_type=hugectr.Check_t.Non,
)

optimizer = hugectr.CreateOptimizer(
    optimizer_type=hugectr.Optimizer_t.Adam,
    update_type=hugectr.Update_t.Local,  # may affect performance
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-8,
)

dropout_rate = 0.05

# ---------------------------------------------------------------------------
# Model graph
# ---------------------------------------------------------------------------
model = hugectr.Model(solver, reader, optimizer)

# TODO: clarify the meaning of the `2` and `False` arguments below.
sparse_input = hugectr.DataReaderSparseParam("data1", 2, False, NUM_SLOTS)
model.add(
    hugectr.Input(
        label_dim=1,
        label_name="labels",
        dense_dim=0,
        dense_name="dense",
        data_reader_sparse_param_array=[sparse_input],
    )
)

model.add(
    hugectr.SparseEmbedding(
        # three embedding types are available to choose from
        embedding_type=hugectr.Embedding_t.LocalizedSlotSparseEmbeddingHash,
        workspace_size_per_gpu_in_mb=15000,  # large enough
        embedding_vec_size=EMBEDDING_VEC_SIZE,
        combiner="sum",
        sparse_embedding_name="sparse_embedding1",
        bottom_name="data1",
        slot_size_array=SLOT_SIZES,
        optimizer=optimizer,
    )
)

# Flatten the per-slot embeddings into one vector of 16 * 39 values.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Reshape,
        bottom_names=["sparse_embedding1"],
        top_names=["reshape_sparse_embedding"],
        leading_dim=EMBEDDING_VEC_SIZE * NUM_SLOTS,
    )
)

# Cross branch.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.MultiCross,
        bottom_names=["reshape_sparse_embedding"],
        top_names=["multicross1"],
        num_layers=4,
    )
)

# Deep branch: layer1 .. layer5, each InnerProduct -> ReLU -> Dropout.
# Tensor names (fc<i>, relu<i>, dropout<i>) match the hand-written original.
previous_top = "reshape_sparse_embedding"
for layer_no in range(1, MLP_DEPTH + 1):
    fc_name = f"fc{layer_no}"
    relu_name = f"relu{layer_no}"
    drop_name = f"dropout{layer_no}"

    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.InnerProduct,
            bottom_names=[previous_top],
            top_names=[fc_name],
            num_output=MLP_WIDTH,
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.ReLU,
            bottom_names=[fc_name],
            top_names=[relu_name],
        )
    )
    model.add(
        hugectr.DenseLayer(
            layer_type=hugectr.Layer_t.Dropout,
            bottom_names=[relu_name],
            top_names=[drop_name],
            dropout_rate=dropout_rate,
        )
    )
    previous_top = drop_name

# Join the deep and cross branches and project to a single logit.
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.Concat,
        bottom_names=["dropout5", "multicross1"],
        top_names=["concat2"],
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.InnerProduct,
        bottom_names=["concat2"],
        top_names=["fc6"],
        num_output=1,
    )
)
model.add(
    hugectr.DenseLayer(
        layer_type=hugectr.Layer_t.BinaryCrossEntropyLoss,
        bottom_names=["fc6", "labels"],
        top_names=["loss"],
    )
)

model.compile()
model.summary()

# Shorter alternative run kept from the original:
# model.fit(max_iter = 2300, display = 200, eval_interval = 1000, snapshot = 1000000, snapshot_prefix = "dcn")
model.fit(
    max_iter=75000,
    display=1000,
    eval_interval=4999,
    snapshot=1000000,
    snapshot_prefix="dcn",
)

# Output reported for this run:
# [HCTR][02:21:10.156][INFO][RK0][main]: Evaluation, AUC: 0.804863
# [HCTR][02:21:10.156][INFO][RK0][main]: Eval Time for 100 iters: 3.16018s
对应HugeCTR脚本: