
A Simple Python Implementation of Metapath2vec


The full implementation accompanying this post is on GitHub: https://github.com/Mrxiahelong/Unsupervised-Author-Disambiguation/

Here we use three graph structures: paper-coauthor-paper, paper-cotitle-paper, and paper-covenue-paper. In other words, there is one node type and three edge types, and our metapath pattern is coauthor-covenue-coauthor-cotitle (matching `metapath_type` in the code below).
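The three graphs and their edge-weight matrices are assumed to be built beforehand (see the repo). Below is a minimal sketch of how one of them could be constructed with DGL, where `shared_authors(i, j)` is a hypothetical helper counting the authors two papers have in common; the cotitle and covenue graphs would be built the same way from shared title words and shared venues:

```python
import dgl
import numpy as np

num_papers = 100  # assumption: papers are numbered 0..num_papers-1
weights_coauthor = np.zeros((num_papers, num_papers))
src, dst = [], []
for i in range(num_papers):
    for j in range(num_papers):
        if i != j and shared_authors(i, j) > 0:  # hypothetical helper
            src.append(i)
            dst.append(j)
            weights_coauthor[i][j] = shared_authors(i, j)
coauthor_graph = dgl.graph((src, dst), num_nodes=num_papers)
```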


  
```python
import numpy as np

def positive_sampler(path):
    '''
    Slide a window of size `window` (a global set in the walk-generation
    code below) over the path and collect (center, context) pairs.
    E.g. for path [0, 1, 2, 3, 4] with window 2, this returns
    pos_u = [0, 0, 1, 1, 1, ...] and pos_v = [1, 2, 0, 2, 3, ...].
    '''
    pos_u, pos_v = [], []
    if len(path) == 1:  # a single node has no context to pair with
        return pos_u, pos_v
    for i in range(len(path)):
        u = path[i]
        v = path[max(i - window, 0):i] + path[i + 1:i + window + 1]
        pos_u.extend([u] * len(v))
        pos_v.extend(v)
    return pos_u, pos_v

def get_negative_ratio(metapaths):
    '''
    Build a negative-sampling distribution over all nodes from their
    frequencies: the more often a node appears, the more likely it is
    to be drawn as a negative. Returns the sampling probability of every
    node, indexed by node id (this assumes every node id appears in at
    least one path, so the table covers the full id range).
    '''
    node_frequency = dict()
    for path in metapaths:
        for node in path:
            node_frequency[node] = node_frequency.get(node, 0) + 1
    # Raise the frequencies to the 3/4 power, as in word2vec.
    pow_frequency = np.array([freq for _, freq in sorted(node_frequency.items())]) ** 0.75
    ratio = pow_frequency / np.sum(pow_frequency)
    return ratio

def negative_sampler(path, ratio, nodes):
    '''
    Draw negatives according to the probability table `ratio` from the
    previous function, skipping nodes that lie on the path (and nodes
    that were already drawn).
    '''
    negatives_size = 5
    negatives = []
    while len(negatives) < negatives_size:
        temp = np.random.choice(nodes, size=negatives_size - len(negatives), replace=False, p=ratio)
        negatives.extend([node for node in temp if node not in path and node not in negatives])
    return negatives
```
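A quick toy run of the three samplers (the node ids and walks here are made up):

```python
# Toy walks over 8 hypothetical nodes; every node appears at least once,
# so `ratio` covers the whole id range 0..7.
window = 2
toy_metapaths = [[0, 1, 2, 3], [4, 5, 6, 7], [1, 2, 5]]
pos_u, pos_v = positive_sampler(toy_metapaths[0])
print(pos_u)  # [0, 0, 1, 1, 1, 2, 2, 2, 3, 3]
print(pos_v)  # [1, 2, 0, 2, 3, 0, 1, 3, 1, 2]
ratio = get_negative_ratio(toy_metapaths)
negatives = negative_sampler(toy_metapaths[2], ratio, nodes=list(range(8)))
print(negatives)  # five distinct nodes drawn from {0, 3, 4, 6, 7}
```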

  
```python
import torch
from tqdm import tqdm

def create_node2node_dict(graph):
    '''
    Takes a graph built with DGL and returns a dict mapping each node id
    to the list of node ids it can reach in that graph.
    '''
    src_dst = {}
    for src, dst in zip(graph.edges()[0], graph.edges()[1]):
        src, dst = src.item(), dst.item()
        if src not in src_dst:
            src_dst[src] = []
        src_dst[src].append(dst)
    return src_dst

window = 2      # window size used when sampling pairs from a metapath
metapaths = []  # all generated metapaths
num_walks = 10  # how many walks to start from each node
walk_len = 100  # length of each path
metapath_type = ['coauthor', 'covenue', 'coauthor', 'cotitle']  # following the paper, the authors use AVAT
# One dict per graph: each key is a node id, each value is the list of node
# ids that key can reach in that graph. (coauthor_graph, cotitle_graph,
# covenue_graph, the weight matrices, labels and sum_papers come from the
# data-preparation step in the repo.)
edge_per_graph = {}
edge_per_graph['coauthor'] = create_node2node_dict(coauthor_graph)
edge_per_graph['cotitle'] = create_node2node_dict(cotitle_graph)
edge_per_graph['covenue'] = create_node2node_dict(covenue_graph)
weights_all_graph = {'coauthor': weights_coauthor, 'cotitle': weights_cotitle, 'covenue': weights_covenue}

def is_isolated(node):
    for rel in metapath_type:
        if node in edge_per_graph[rel]:
            return False
    return True

for walk in tqdm(range(num_walks)):
    for cur_node in range(len(labels)):  # start one walk from every node in the graph
        stop = False
        path = [cur_node]
        while len(path) < walk_len and not stop:
            for rel in metapath_type:
                if len(path) == walk_len or is_isolated(cur_node):
                    stop = True
                    break
                if edge_per_graph[rel].get(cur_node, -1) == -1:
                    continue
                cand_nodes = edge_per_graph[rel][cur_node]
                # Weighted transition: neighbors with heavier edges are more
                # likely to be picked. If you don't want this, comment out this
                # line and the next and drop the p argument from choice.
                weights_per_candnodes = weights_all_graph[rel][cur_node][cand_nodes]
                weighted_ratio = weights_per_candnodes * 1.0 / np.sum(weights_per_candnodes)
                cur_node = np.random.choice(cand_nodes, size=1, p=weighted_ratio)[0]
                path.append(cur_node)
        metapaths.append(path)

pos_us, pos_vs, neg_vs = [], [], []
nodes = list(range(sum_papers))
ratio = get_negative_ratio(metapaths)
for path in metapaths:
    pos_u, pos_v = positive_sampler(path)
    for u, v in zip(pos_u, pos_v):
        negative_nodes = negative_sampler(path, ratio, nodes)
        neg_vs.append(negative_nodes)
    pos_us.extend(pos_u)
    pos_vs.extend(pos_v)
pos_us = torch.LongTensor(pos_us)
pos_vs = torch.LongTensor(pos_vs)
neg_vs = torch.LongTensor(neg_vs)
```

The `metapaths` collected above are all of the generated walks. The elements of pos_us and pos_vs correspond one-to-one as positive pairs, and each row of neg_vs holds their negatives; they play the roles of $u_i$, $u_c$, and $u_j$ in the skip-gram objective with negative sampling, which (reconstructed here from the code) maximizes

$$O = \log\sigma(X_{u_c} \cdot X_{u_i}) + \sum_{j=1}^{M} \mathbb{E}_{u_j \sim P(u)}\left[\log\sigma(-X_{u_j} \cdot X_{u_i})\right]$$

where $M = 5$ negatives are drawn per positive pair. The model below minimizes the negative of this objective.


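Before defining the model, a quick sanity check on the shapes produced above (each positive pair gets one row of five negatives):

```python
# pos_us, pos_vs and neg_vs line up row for row: one (center, context)
# pair per row of neg_vs, with negatives_size = 5 negatives each.
assert pos_us.shape == pos_vs.shape
assert neg_vs.shape == (pos_us.shape[0], 5)
```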
  
```python
# Plain metapath2vec
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

class SkipGramModel(nn.Module):
    """
    u_embedding: Embedding for center word.
    v_embedding: Embedding for neighbor words.
    """
    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension)
        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)
        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)
        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)
        return torch.mean(score + neg_score)
```
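A quick smoke test of the model on random indices (the sizes here are made up):

```python
# Hypothetical sizes: 100 nodes, a batch of 32 pairs, 5 negatives per pair.
model = SkipGramModel(emb_size=100, emb_dimension=64)
u_idx = torch.randint(0, 100, (32,))
v_idx = torch.randint(0, 100, (32,))
neg_idx = torch.randint(0, 100, (32, 5))
print(model(u_idx, v_idx, neg_idx))  # a scalar loss
```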

  
```python
skip_model = SkipGramModel(sum_papers, 64)
optimizer = torch.optim.Adam(skip_model.parameters(), lr=0.001)
losses = []
for epoch in range(500):
    optimizer.zero_grad()
    # pos_us, pos_vs and neg_vs are already LongTensors, so pass them directly.
    loss = skip_model(pos_us, pos_vs, neg_vs)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if epoch % 100 == 0:
        print('epoch {0} loss {1}'.format(epoch, loss.item()))
embedding = skip_model.u_embeddings.weight.cpu().data.numpy()
```

The `embedding` obtained here holds one vector per node and can be used for downstream tasks.
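For instance, since the repo targets unsupervised author disambiguation, one plausible downstream step is to cluster the paper embeddings; a sketch with scikit-learn, where the cluster count is an assumption:

```python
# Group papers with similar embeddings, e.g. as candidate same-author
# clusters. n_clusters = 10 is a placeholder, not a value from the post.
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, n_init=10).fit(embedding)
print(kmeans.labels_[:20])  # cluster id assigned to each paper
```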


Reposted from: https://blog.csdn.net/xiadada2/article/details/117295281