Open Richard3927 opened 2 years ago
由于上周采取的用Windows来跑本地的demo,为了后续的进一步操作,本周采取了搭建虚拟机,用docker来运行项目的方法。遇到的问题有:虚拟机的搭建,环境的基本配置(maven,spark,java等等),git的拉取项目到本地,以及实际运行。由于之前未接触过相关知识与领域,耽误了一定时间。
参考的链接:https://blog.csdn.net/u012151283/article/details/87255951
def _get_order_degreelist_node(self, root, max_num_layers=None): if max_num_layers is None: max_num_layers = float('inf') ordered_degree_sequence_dict = {} visited = [False] * len(self.graph.nodes()) queue = deque() level = 0 queue.append(root) visited[root] = True while (len(queue) > 0 and level <= max_num_layers): count = len(queue) if self.opt1_reduce_len: degree_list = {} else: degree_list = [] while (count > 0): top = queue.popleft() node = self.idx2node[top] degree = len(self.graph[node]) if self.opt1_reduce_len: degree_list[degree] = degree_list.get(degree, 0) + 1 else: degree_list.append(degree) for nei in self.graph[node]: nei_idx = self.node2idx[nei] if not visited[nei_idx]: visited[nei_idx] = True queue.append(nei_idx) count -= 1 if self.opt1_reduce_len: orderd_degree_list = [(degree, freq) for degree, freq in degree_list.items()] orderd_degree_list.sort(key=lambda x: x[0]) else: orderd_degree_list = sorted(degree_list) ordered_degree_sequence_dict[level] = orderd_degree_list level += 1 return ordered_degree_sequence_dict
def _compute_ordered_degreelist(self, max_num_layers): degreeList = {} vertices = self.idx # self.g.nodes() for v in vertices: degreeList[v] = self._get_order_degreelist_node(v, max_num_layers) return degreeList
def compute_dtw_dist(part_list, degreeList, dist_func): dtw_dist = {} for v1, nbs in part_list: lists_v1 = degreeList[v1] # lists_v1 :orderd degree list of v1 for v2 in nbs: lists_v2 = degreeList[v2] # lists_v1 :orderd degree list of v2 max_layer = min(len(lists_v1), len(lists_v2)) # valid layer dtw_dist[v1, v2] = {} for layer in range(0, max_layer): dist, path = fastdtw( lists_v1[layer], lists_v2[layer], radius=1, dist=dist_func) dtw_dist[v1, v2][layer] = dist return dtw_dist
def _compute_structural_distance(self, max_num_layers, workers=1, verbose=0,): if os.path.exists(self.temp_path+'structural_dist.pkl'): structural_dist = pd.read_pickle( self.temp_path+'structural_dist.pkl') else: if self.opt1_reduce_len: dist_func = cost_max else: dist_func = cost if os.path.exists(self.temp_path + 'degreelist.pkl'): degreeList = pd.read_pickle(self.temp_path + 'degreelist.pkl') else: degreeList = self._compute_ordered_degreelist(max_num_layers) pd.to_pickle(degreeList, self.temp_path + 'degreelist.pkl') if self.opt2_reduce_sim_calc: degrees = self._create_vectors() degreeListsSelected = {} vertices = {} n_nodes = len(self.idx) for v in self.idx: # c:list of vertex nbs = get_vertices( v, len(self.graph[self.idx2node[v]]), degrees, n_nodes) vertices[v] = nbs # store nbs degreeListsSelected[v] = degreeList[v] # store dist for n in nbs: # store dist of nbs degreeListsSelected[n] = degreeList[n] else: vertices = {} for v in degreeList: vertices[v] = [vd for vd in degreeList.keys() if vd > v] results = Parallel(n_jobs=workers, verbose=verbose,)( delayed(compute_dtw_dist)(part_list, degreeList, dist_func) for part_list in partition_dict(vertices, workers)) dtw_dist = dict(ChainMap(*results)) structural_dist = convert_dtw_struc_dist(dtw_dist) pd.to_pickle(structural_dist, self.temp_path + 'structural_dist.pkl') return structural_dist
def _get_transition_probs(self, layers_adj, layers_distances): layers_alias = {} layers_accept = {} for layer in layers_adj: neighbors = layers_adj[layer] layer_distances = layers_distances[layer] node_alias_dict = {} node_accept_dict = {} norm_weights = {} for v, neighbors in neighbors.items(): e_list = [] sum_w = 0.0 for n in neighbors: if (v, n) in layer_distances: wd = layer_distances[v, n] else: wd = layer_distances[n, v] w = np.exp(-float(wd)) e_list.append(w) sum_w += w e_list = [x / sum_w for x in e_list] norm_weights[v] = e_list accept, alias = create_alias_table(e_list) node_alias_dict[v] = alias node_accept_dict[v] = accept pd.to_pickle( norm_weights, self.temp_path + 'norm_weights_distance-layer-' + str(layer)+'.pkl') layers_alias[layer] = node_alias_dict layers_accept[layer] = node_accept_dict return layers_accept, layers_alias
def prepare_biased_walk(self,): sum_weights = {} sum_edges = {} average_weight = {} gamma = {} layer = 0 while (os.path.exists(self.temp_path+'norm_weights_distance-layer-' + str(layer))): probs = pd.read_pickle( self.temp_path+'norm_weights_distance-layer-' + str(layer)) for v, list_weights in probs.items(): sum_weights.setdefault(layer, 0) sum_edges.setdefault(layer, 0) sum_weights[layer] += sum(list_weights) sum_edges[layer] += len(list_weights) average_weight[layer] = sum_weights[layer] / sum_edges[layer] gamma.setdefault(layer, {}) for v, list_weights in probs.items(): num_neighbours = 0 for w in list_weights: if (w > average_weight[layer]): num_neighbours += 1 gamma[layer][v] = num_neighbours layer += 1 pd.to_pickle(average_weight, self.temp_path + 'average_weight') pd.to_pickle(gamma, self.temp_path + 'gamma.pkl')
本周任务:不同层次结构相似度计算,struct2Vec论文学习。
实际情况:
1.上周的优化
由于上周采取的用Windows来跑本地的demo,为了后续的进一步操作,本周采取了搭建虚拟机,用docker来运行项目的方法。遇到的问题有:虚拟机的搭建,环境的基本配置(maven,spark,java等等),git的拉取项目到本地,以及实际运行。由于之前未接触过相关知识与领域,耽误了一定时间。
2.其他,进行了论文的进一步学习。基本分为以下几个步骤。(:可参考 https://zhuanlan.zhihu.com/p/56733145)
a.算法原理:Struc2Vec是从空间结构相似性的角度定义顶点相似度的。
b.定点对距离的定义。
c.构建层次带权图。
d.采样获取顶点序列。
e.优化
3.基本算法实现的学习
参考的链接:https://blog.csdn.net/u012151283/article/details/87255951
a.对距离定义的实现:
b.构建层次带权图
4.下周目标:结合上周学习,用angel实际实现层次带权图的构建。尝试实现多层次带权网络。