Angel-ML / angel

A Flexible and Powerful Parameter Server for large-scale machine learning
Other
6.73k stars 1.61k forks source link

2022Tencent Rhino-bird Open-source Training Program—Angel-Shengyang Li —— week2 #1222

Open Richard3927 opened 2 years ago

Richard3927 commented 2 years ago

本周任务:不同层次结构相似度计算,struct2Vec论文学习。

实际情况:

1.上周的优化

由于上周采取的用Windows来跑本地的demo,为了后续的进一步操作,本周采取了搭建虚拟机,用docker来运行项目的方法。遇到的问题有:虚拟机的搭建,环境的基本配置(maven,spark,java等等),git的拉取项目到本地,以及实际运行。由于之前未接触过相关知识与领域,耽误了一定时间。

2.其他,进行了论文的进一步学习。基本分为以下几个步骤。(:可参考 https://zhuanlan.zhihu.com/p/56733145

a.算法原理:Struc2Vec是从空间结构相似性的角度定义顶点相似度的。

b.定点对距离的定义。

c.构建层次带权图。

d.采样获取顶点序列。

e.优化

3.基本算法实现的学习

参考的链接:https://blog.csdn.net/u012151283/article/details/87255951

a.对距离定义的实现:

def _get_order_degreelist_node(self, root, max_num_layers=None):
    if max_num_layers is None:
        max_num_layers = float('inf')

    ordered_degree_sequence_dict = {}
    visited = [False] * len(self.graph.nodes())
    queue = deque()
    level = 0
    queue.append(root)
    visited[root] = True

    while (len(queue) > 0 and level <= max_num_layers):

        count = len(queue)
        if self.opt1_reduce_len:
            degree_list = {}
        else:
            degree_list = []
        while (count > 0):

            top = queue.popleft()
            node = self.idx2node[top]
            degree = len(self.graph[node])

            if self.opt1_reduce_len:
                degree_list[degree] = degree_list.get(degree, 0) + 1
            else:
                degree_list.append(degree)

            for nei in self.graph[node]:
                nei_idx = self.node2idx[nei]
                if not visited[nei_idx]:
                    visited[nei_idx] = True
                    queue.append(nei_idx)
            count -= 1
        if self.opt1_reduce_len:
            orderd_degree_list = [(degree, freq)
                                  for degree, freq in degree_list.items()]
            orderd_degree_list.sort(key=lambda x: x[0])
        else:
            orderd_degree_list = sorted(degree_list)
        ordered_degree_sequence_dict[level] = orderd_degree_list
        level += 1

    return ordered_degree_sequence_dict
def _compute_ordered_degreelist(self, max_num_layers):

    degreeList = {}
    vertices = self.idx  # self.g.nodes()
    for v in vertices:
        degreeList[v] = self._get_order_degreelist_node(v, max_num_layers)
    return degreeList
def compute_dtw_dist(part_list, degreeList, dist_func):
    dtw_dist = {}
    for v1, nbs in part_list:
        lists_v1 = degreeList[v1]  # lists_v1 :orderd degree list of v1
        for v2 in nbs:
            lists_v2 = degreeList[v2]  # lists_v1 :orderd degree list of v2
            max_layer = min(len(lists_v1), len(lists_v2))  # valid layer
            dtw_dist[v1, v2] = {}
            for layer in range(0, max_layer):
                dist, path = fastdtw(
                    lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func)
                dtw_dist[v1, v2][layer] = dist
    return dtw_dist
def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,):

    if os.path.exists(self.temp_path+'structural_dist.pkl'):
        structural_dist = pd.read_pickle(
            self.temp_path+'structural_dist.pkl')
    else:
        if self.opt1_reduce_len:
            dist_func = cost_max
        else:
            dist_func = cost

        if os.path.exists(self.temp_path + 'degreelist.pkl'):
            degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl')
        else:
            degreeList = self._compute_ordered_degreelist(max_num_layers)
            pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl')

        if self.opt2_reduce_sim_calc:
            degrees = self._create_vectors()
            degreeListsSelected = {}
            vertices = {}
            n_nodes = len(self.idx)
            for v in self.idx:  # c:list of vertex
                nbs = get_vertices(
                    v, len(self.graph[self.idx2node[v]]), degrees, n_nodes)
                vertices[v] = nbs  # store nbs
                degreeListsSelected[v] = degreeList[v]  # store dist
                for n in nbs:
                    # store dist of nbs
                    degreeListsSelected[n] = degreeList[n]
        else:
            vertices = {}
            for v in degreeList:
                vertices[v] = [vd for vd in degreeList.keys() if vd > v]

        results = Parallel(n_jobs=workers, verbose=verbose,)(
            delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers))
        dtw_dist = dict(ChainMap(*results))

        structural_dist = convert_dtw_struc_dist(dtw_dist)
        pd.to_pickle(structural_dist, self.temp_path +
                     'structural_dist.pkl')

    return structural_dist

b.构建层次带权图

def _get_transition_probs(self, layers_adj, layers_distances):
    layers_alias = {}
    layers_accept = {}

    for layer in layers_adj:

        neighbors = layers_adj[layer]
        layer_distances = layers_distances[layer]
        node_alias_dict = {}
        node_accept_dict = {}
        norm_weights = {}

        for v, neighbors in neighbors.items():
            e_list = []
            sum_w = 0.0

            for n in neighbors:
                if (v, n) in layer_distances:
                    wd = layer_distances[v, n]
                else:
                    wd = layer_distances[n, v]
                w = np.exp(-float(wd))
                e_list.append(w)
                sum_w += w

            e_list = [x / sum_w for x in e_list]
            norm_weights[v] = e_list
            accept, alias = create_alias_table(e_list)
            node_alias_dict[v] = alias
            node_accept_dict[v] = accept

        pd.to_pickle(
            norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl')

        layers_alias[layer] = node_alias_dict
        layers_accept[layer] = node_accept_dict

    return layers_accept, layers_alias
def prepare_biased_walk(self,):

    sum_weights = {}
    sum_edges = {}
    average_weight = {}
    gamma = {}
    layer = 0
    while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer))):
        probs = pd.read_pickle(
            self.temp_path+'norm_weights_distance-layer-' + str(layer))
        for v, list_weights in probs.items():
            sum_weights.setdefault(layer, 0)
            sum_edges.setdefault(layer, 0)
            sum_weights[layer] += sum(list_weights)
            sum_edges[layer] += len(list_weights)

        average_weight[layer] = sum_weights[layer] / sum_edges[layer]

        gamma.setdefault(layer, {})

        for v, list_weights in probs.items():
            num_neighbours = 0
            for w in list_weights:
                if (w > average_weight[layer]):
                    num_neighbours += 1
            gamma[layer][v] = num_neighbours

        layer += 1

    pd.to_pickle(average_weight, self.temp_path + 'average_weight')
    pd.to_pickle(gamma, self.temp_path + 'gamma.pkl')

4.下周目标:结合上周学习,用angel实际实现层次带权图的构建。尝试实现多层次带权网络。