alibaba / EasyRec

A framework for large scale recommendation algorithms.
Apache License 2.0
1.71k stars 311 forks source link

DSSM model - potential leakage between the sequence feature and the target item #460

Open emreatilgan opened 5 months ago

emreatilgan commented 5 months ago

Hello, I am using the EasyRec DSSM model config to train a matching model. After train_eval and export, I split the saved model with https://github.com/alibaba/EasyRec/blob/master/easy_rec/python/tools/split_model_pai.py. However, the saved user tower model requires product_id, which is a feature of the item tower. I tried the following feature combinations for the user tower:

  1. Without the product_id_seq feature: the saved user tower model does not require product_id, as expected, and the recall_at_k metrics have normal values.
  2. With the product_id_seq feature: the saved user tower model requires product_id at inference, and the recall_at_k values are ~0.99.

Question: When the sequence feature is used, could the user tower have access to the target item and simply learn to predict that item?

How can I use the history sequence feature and still split the user and item towers successfully?

Note: I'm using TF 1.12 so that I can use split_model_pai.py.

Config:

train_input_path: "data/train_processed"
eval_input_path: "data/test_processed"
model_dir: "ckpt/dssm_train_processed_negative_sample_earlystop_ckpt"

train_config {
  log_step_count_steps: 100
  optimizer_config: {
    adam_optimizer: {
      learning_rate: {
        exponential_decay_learning_rate {
          initial_learning_rate: 0.001
          decay_steps: 1000
          decay_factor: 0.5
          min_learning_rate: 0.00001
        }
      }
    }
    use_moving_average: false
  }
  save_checkpoints_steps: 500
  num_steps: 20000
}

eval_config {
  metrics_set {
    recall_at_topk { topk: 10 }
  }
  metrics_set {
    recall_at_topk { topk: 50 }
  }
  metrics_set {
    recall_at_topk { topk: 100 }
  }
}

data_config {
  input_fields {
    input_name:'user_id'
    input_type: STRING
  }
  input_fields {
    input_name:'user_gender'
    input_type: STRING
  }
  input_fields {
    input_name:'product_id_seq'
    input_type: STRING
  }
  input_fields {
    input_name: 'product_id'
    input_type: STRING
  }
  input_fields {
    input_name: 'label'
    input_type: INT32
  }

  label_fields: 'label'
  batch_size: 4096
  num_epochs: 20
  prefetch_size: 32
  input_type: CSVInput
  separator: "\t"

  negative_sampler {
    input_path: 'data/negative_contents_processed'
    num_sample: 1024
    num_eval_sample: 1024
    attr_fields: 'product_id'
    item_id_field: 'product_id'
  }
}

feature_config: {
  features: {
    input_names: 'user_id'
    feature_type: IdFeature
    embedding_dim: 16
    hash_bucket_size: 500000
  }
  features: {
    input_names: 'user_gender'
    feature_type: IdFeature
    embedding_dim: 16
    hash_bucket_size: 10
  }
  features: {
    input_names: 'product_id'
    feature_type: IdFeature
    embedding_dim: 16
    hash_bucket_size: 400000
  }
  features: {
    input_names: 'product_id_seq'
    feature_type: SequenceFeature
    separator: '|'
    hash_bucket_size: 400000
    embedding_dim: 16
  }
}
model_config:{
  model_class: "DSSM"
  feature_groups: {
    group_name: 'user'
    feature_names: 'user_id'
    feature_names: 'user_gender'
    wide_deep:DEEP
    sequence_features: {
      group_name: "seq_fea"
      allow_key_search: true
      need_key_feature:true
      seq_att_map: {
        key: "product_id"
        hist_seq: "product_id_seq"
      }
    }
  }
  feature_groups: {
    group_name: "item"
    feature_names: 'product_id'
    wide_deep:DEEP
  }
  dssm {
    user_tower {
      id: "user_id"
      dnn {
        hidden_units: [256, 128, 64, 32]
      }
    }
    item_tower {
      id: "product_id"
      dnn {
        hidden_units: [256, 128, 64, 32]
      }
    }
    l2_regularization: 1e-6
  }
  loss_type: SOFTMAX_CROSS_ENTROPY
  embedding_regularization: 5e-6
}

export_config {
  exporter_type: "best"
  max_check_steps: 500
  enable_early_stop: true
  best_exporter_metric: "recall@100"
}