Open Kiris-tingna opened 6 years ago
class LINE(object): def __init__(self, graph_edge_num, graph_nodes_num, dimension): self.e = graph_edge_num self.n = graph_nodes_num self.steps_per_epoch = None self.epoch_train_size = None self.dimension = dimension def _generate_batch_train(self, adj_list, graph_nodes_num, graph_edge_num, batch_size, negativeRatio, negative_sampling): # 使用 negative sampling 优化 table_size = 1e8 power = 0.75 sampling_table = None data = np.ones((adj_list.shape[0]), dtype=np.int8) mat = csr_matrix((data, (adj_list[:, 0], adj_list[:, 1])), shape=(graph_nodes_num, graph_nodes_num), dtype=np.int8) batch_size_ones = np.ones((batch_size), dtype=np.int8) nb_train_sample = adj_list.shape[0] index_array = np.arange(nb_train_sample) nb_batch = int(np.ceil(nb_train_sample / float(batch_size))) batches = [(i * batch_size, min(nb_train_sample, (i + 1) * batch_size)) for i in range(0, nb_batch)] if negative_sampling == "NON-UNIFORM": print("Pre-procesing for non-uniform negative sampling!") node_degree = np.zeros(graph_nodes_num) for i in range(graph_edge_num): node_degree[adj_list[i, 0]] += 1 node_degree[adj_list[i, 1]] += 1 norm = sum([math.pow(node_degree[i], power) for i in range(graph_nodes_num)]) sampling_table = np.zeros(int(table_size), dtype=np.uint32) p = 0 i = 0 for j in range(graph_nodes_num): p += float(math.pow(node_degree[j], power)) / norm while i < table_size and float(i) / table_size < p: sampling_table[i] = j i += 1 while 1: for batch_index, (batch_start, batch_end) in enumerate(batches): pos_edge_list = index_array[batch_start:batch_end] pos_left_nodes = adj_list[pos_edge_list, 0] pos_right_nodes = adj_list[pos_edge_list, 1] pos_relation_y = batch_size_ones[0:len(pos_edge_list)] neg_left_nodes = np.zeros(len(pos_edge_list) * negativeRatio, dtype=np.int32) neg_right_nodes = np.zeros(len(pos_edge_list) * negativeRatio, dtype=np.int32) neg_relation_y = np.zeros(len(pos_edge_list) * negativeRatio, dtype=np.int8) h = 0 for i in pos_left_nodes: for k in range(negativeRatio): rn = sampling_table[random.randint(0, table_size - 1)] if negative_sampling == "NON-UNIFORM" else random.randint( 0, graph_nodes_num - 1) while mat[i, rn] == 1 or i == rn: rn = sampling_table[random.randint(0, table_size - 1)] if negative_sampling == "NON-UNIFORM" else random.randint( 0, graph_nodes_num - 1) neg_left_nodes[h] = i neg_right_nodes[h] = rn h += 1 left_nodes = np.concatenate((pos_left_nodes, neg_left_nodes), axis=0) right_nodes = np.concatenate((pos_right_nodes, neg_right_nodes), axis=0) relation_y = np.concatenate((pos_relation_y, neg_relation_y), axis=0) yield ([left_nodes, right_nodes], [relation_y]) def _model(self, graph_nodes_num, dimension): left_input = Input(shape=(1,)) right_input = Input(shape=(1,)) left_model = Sequential() left_model.add(Embedding(input_dim=graph_nodes_num + 1, output_dim=dimension, input_length=1, mask_zero=False)) left_model.add(Reshape((dimension,))) right_model = Sequential() right_model.add(Embedding(input_dim=graph_nodes_num + 1, output_dim=dimension, input_length=1, mask_zero=False)) right_model.add(Reshape((dimension,))) left_embed = left_model(left_input) right_embed = left_model(right_input) left_right_dot = dot(inputs=[left_embed, right_embed], axes=1, name="left_right_dot") model = Model(inputs=[left_input, right_input], outputs=[left_right_dot]) embed_generator = Model(inputs=[left_input, right_input], outputs=[left_embed, right_embed]) return model, embed_generator def _line_loss(self, y_true, y_pred): coeff = y_true*2 - 1 return -K.mean(K.log(K.sigmoid(coeff*y_pred))) def fit(self, adj_list, batch_size, negative_ratio, negative_sampling, epoch_num): self.steps_per_epoch = int(self.e / batch_size) self.epoch_train_size = (1 + negative_ratio) * self.e # 产生训练样本 data = self._generate_batch_train(adj_list, self.n, self.e, batch_size, negative_ratio, negative_sampling) self.model, self.embed_generator = self._model(self.n, self.dimension) # model.summary() self.model.compile(optimizer='rmsprop', loss={'left_right_dot': self._line_loss}) self.model.fit_generator(data, steps_per_epoch=self.epoch_train_size / batch_size, epochs=epoch_num, verbose=1) def predict(self, data): # link prediction return self.embed_generator.predict_on_batch(data)
line = LINE(graph_edge_num, graph_nodes_num, dimension) line.fit(adj_list, batch_size, negative_ratio, negative_sampling, epoch_num)
What are the pre-requisites for running this code?
usage