VahidooX / LINE

This is an implementation of the LINE (Large-scale Information Network Embedding) algorithm.

Simple wrapper for this implementation for Keras 2.1.1 #3

Open Kiris-tingna opened 6 years ago

Kiris-tingna commented 6 years ago
import math
import random

import numpy as np
from scipy.sparse import csr_matrix

from keras import backend as K
from keras.layers import Input, Embedding, Reshape, dot
from keras.models import Model, Sequential


class LINE(object):
    def __init__(self, graph_edge_num, graph_nodes_num,
                 dimension):
        self.e = graph_edge_num
        self.n = graph_nodes_num
        self.steps_per_epoch = None
        self.epoch_train_size = None
        self.dimension = dimension

    def _generate_batch_train(self, adj_list,
                              graph_nodes_num, graph_edge_num,
                              batch_size, negativeRatio,
                              negative_sampling):
        # Optimize with negative sampling; for the non-uniform case a
        # unigram table over node degrees raised to the 3/4 power is
        # built, as in word2vec.
        table_size = int(1e8)
        power = 0.75
        sampling_table = None

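        # Sparse adjacency matrix, used below to reject genuine edges
        # when drawing negative samples.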
        data = np.ones((adj_list.shape[0]), dtype=np.int8)
        mat = csr_matrix((data, (adj_list[:, 0], adj_list[:, 1])), shape=(graph_nodes_num, graph_nodes_num),
                         dtype=np.int8)
        batch_size_ones = np.ones((batch_size), dtype=np.int8)

        nb_train_sample = adj_list.shape[0]
        index_array = np.arange(nb_train_sample)

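        # One (start, end) index pair per mini-batch over the edge list.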
        nb_batch = int(np.ceil(nb_train_sample / float(batch_size)))
        batches = [(i * batch_size, min(nb_train_sample, (i + 1) * batch_size)) for i in range(0, nb_batch)]

        if negative_sampling == "NON-UNIFORM":
            print("Pre-processing for non-uniform negative sampling!")
            node_degree = np.zeros(graph_nodes_num)

            # Accumulate every node's degree over the edge list.
            for i in range(graph_edge_num):
                node_degree[adj_list[i, 0]] += 1
                node_degree[adj_list[i, 1]] += 1

            norm = sum([math.pow(node_degree[i], power) for i in range(graph_nodes_num)])

            sampling_table = np.zeros(int(table_size), dtype=np.uint32)

            # Fill the table so that node j occupies a fraction of slots
            # proportional to degree(j)^power / norm.
            p = 0
            i = 0
            for j in range(graph_nodes_num):
                p += float(math.pow(node_degree[j], power)) / norm
                while i < table_size and float(i) / table_size < p:
                    sampling_table[i] = j
                    i += 1

        # fit_generator expects a generator that loops forever.
        while True:

            for batch_index, (batch_start, batch_end) in enumerate(batches):
                pos_edge_list = index_array[batch_start:batch_end]
                pos_left_nodes = adj_list[pos_edge_list, 0]
                pos_right_nodes = adj_list[pos_edge_list, 1]

                pos_relation_y = batch_size_ones[0:len(pos_edge_list)]

                neg_left_nodes = np.zeros(len(pos_edge_list) * negativeRatio, dtype=np.int32)
                neg_right_nodes = np.zeros(len(pos_edge_list) * negativeRatio, dtype=np.int32)

                neg_relation_y = np.zeros(len(pos_edge_list) * negativeRatio, dtype=np.int8)

                h = 0
                for i in pos_left_nodes:
                    for k in range(negativeRatio):
                        # Rejection-sample a negative: redraw while the
                        # candidate is a real neighbour or the node itself.
                        while True:
                            if negative_sampling == "NON-UNIFORM":
                                rn = sampling_table[random.randint(0, table_size - 1)]
                            else:
                                rn = random.randint(0, graph_nodes_num - 1)
                            if mat[i, rn] == 0 and i != rn:
                                break
                        neg_left_nodes[h] = i
                        neg_right_nodes[h] = rn
                        h += 1

                left_nodes = np.concatenate((pos_left_nodes, neg_left_nodes), axis=0)
                right_nodes = np.concatenate((pos_right_nodes, neg_right_nodes), axis=0)
                relation_y = np.concatenate((pos_relation_y, neg_relation_y), axis=0)

                yield ([left_nodes, right_nodes], [relation_y])

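    # The model: two node-id inputs, an embedding table per side, and the
    # dot product of the two embeddings as the edge score.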
    def _model(self, graph_nodes_num, dimension):
        left_input = Input(shape=(1,))
        right_input = Input(shape=(1,))

        left_model = Sequential()
        left_model.add(Embedding(input_dim=graph_nodes_num + 1, output_dim=dimension, input_length=1, mask_zero=False))
        left_model.add(Reshape((dimension,)))

        right_model = Sequential()
        right_model.add(Embedding(input_dim=graph_nodes_num + 1, output_dim=dimension, input_length=1, mask_zero=False))
        right_model.add(Reshape((dimension,)))

        left_embed = left_model(left_input)
        right_embed = right_model(right_input)  # each side uses its own embedding table

        left_right_dot = dot(inputs=[left_embed, right_embed], axes=1, name="left_right_dot")
        model = Model(inputs=[left_input, right_input], outputs=[left_right_dot])
        embed_generator = Model(inputs=[left_input, right_input], outputs=[left_embed, right_embed])

        return model, embed_generator

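    # A note on the loss below: y_true is 1 for observed edges and 0 for
    # sampled negatives, so coeff maps labels to {+1, -1} and the loss is
    # -mean(log sigmoid(coeff * score)), i.e. the negative-sampling
    # objective from the LINE paper with the dot product as the score.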
    def _line_loss(self, y_true, y_pred):
        coeff = y_true*2 - 1
        return -K.mean(K.log(K.sigmoid(coeff*y_pred)))

    def fit(self, adj_list, batch_size, negative_ratio, negative_sampling, epoch_num):
        self.steps_per_epoch = int(self.e / batch_size)
        # Each positive edge contributes negative_ratio negatives per epoch.
        self.epoch_train_size = (1 + negative_ratio) * self.e

        # Generate training samples.
        data = self._generate_batch_train(adj_list, self.n, self.e, batch_size, negative_ratio,
                                          negative_sampling)

        self.model, self.embed_generator = self._model(self.n, self.dimension)
        # model.summary()
        self.model.compile(optimizer='rmsprop', loss={'left_right_dot': self._line_loss})
        self.model.fit_generator(data, steps_per_epoch=int(self.epoch_train_size / batch_size),
                                 epochs=epoch_num, verbose=1)

    def predict(self, data):
        # link prediction
        return self.embed_generator.predict_on_batch(data)

Usage

line = LINE(graph_edge_num, graph_nodes_num, dimension)
line.fit(adj_list, batch_size, negative_ratio, negative_sampling, epoch_num)
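For anyone trying it out, a minimal end-to-end sketch is below; the toy edge list and the hyperparameter values are placeholders I made up, not recommendations:

import numpy as np

# Hypothetical toy graph: adj_list is an (E, 2) array of node-id pairs.
adj_list = np.array([[0, 1], [1, 2], [2, 3], [3, 0], [0, 2]], dtype=np.int32)

line = LINE(graph_edge_num=adj_list.shape[0], graph_nodes_num=4, dimension=16)
line.fit(adj_list, batch_size=4, negative_ratio=5,
         negative_sampling="UNIFORM", epoch_num=10)

# predict() returns the left/right embeddings for a batch of node-id pairs.
left_vecs, right_vecs = line.predict([adj_list[:, 0], adj_list[:, 1]])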
Zafar-southeast commented 5 years ago

What are the prerequisites for running this code?