The code for this post lives on GitHub: https://github.com/Mrxiahelong/Unsupervised-Author-Disambiguation/
Here we use three graphs: paper-coauthor-paper, paper-cotitle-paper, and paper-covenue-paper. That is, there is one node type and three edge types. Our metapath is coauthor-covenue-coauthor-cotitle (the AVAT pattern used in the paper).
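The three relation graphs (coauthor_graph, cotitle_graph, covenue_graph below) are built in the repository linked above. As a minimal sketch, assuming the paper pairs for each relation have already been extracted (the pairs here are made up), one of them could be constructed like this:

import dgl
import torch

# hypothetical pairs: papers 0 and 1 share an author, papers 1 and 2 share an author
src = torch.tensor([0, 1, 1, 2])
dst = torch.tensor([1, 0, 2, 1])
coauthor_graph = dgl.graph((src, dst), num_nodes=3)   # paper-coauthor-paper
# cotitle_graph and covenue_graph are built the same way from their own pair lists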
import numpy as np
import torch
from tqdm import tqdm

def positive_sampler(path):
    '''
    Slide a window of size `window` over each path.
    For example, with the path 0 1 2 3 4 and a window size of 2, this returns
    pos_u=[0,0,1,1,1,...], pos_v=[1,2,0,2,3,...]
    '''
    pos_u, pos_v = [], []
    if len(path) == 1:
        return pos_u, pos_v
    for i in range(len(path)):
        u = path[i]
        v = np.concatenate([path[max(i-window, 0):i], path[i+1:i+window+1]], axis=0)
        pos_u.extend([u]*len(v))
        pos_v.extend(v)
    return pos_u, pos_v
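A quick check of positive_sampler on the example from its docstring (window = 2 is set further down; it must be defined before the call):

window = 2
pos_u, pos_v = positive_sampler([0, 1, 2, 3, 4])
print(pos_u)                    # [0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4]
print([int(v) for v in pos_v])  # [1, 2, 0, 2, 3, 0, 1, 3, 4, 1, 2, 4, 2, 3]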
def get_negative_ratio(metapath):
    '''
    Build a negative-sampling table from node frequencies: the more often a
    node appears in the walks, the more likely it is to be drawn as a negative.
    Returns, for every node, the probability of it being negatively sampled.
    '''
    node_frequency = dict()
    node_count = 0
    for path in metapath:
        for node in path:
            node_frequency[node] = node_frequency.get(node, 0) + 1
            node_count += 1
    # word2vec-style smoothing: raise the counts to the 0.75 power
    pow_frequency = np.array([freq for _, freq in sorted(node_frequency.items())])**0.75
    node_pow = np.sum(pow_frequency)
    ratio = pow_frequency/node_pow
    return ratio
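The 0.75 exponent is the same smoothing trick as in word2vec: it flattens the frequency distribution so rare nodes are not drowned out by frequent ones. A toy check with made-up counts:

freqs = np.array([1, 10, 100])**0.75
print(freqs/freqs.sum())   # ~[0.026, 0.147, 0.827] instead of the raw [0.009, 0.090, 0.901]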
def negative_sampler(path, ratio, nodes):
    '''
    Draw negative samples according to the probability table ratio
    produced by the previous function.
    '''
    negatives_size = 5
    negatives = []
    while len(negatives) < negatives_size:
        temp = np.random.choice(nodes, size=negatives_size-len(negatives), replace=False, p=ratio)
        # keep only nodes that do not appear in the walk itself
        negatives.extend([node for node in temp if node not in path])
    return negatives
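Usage on a toy probability table (illustrative only; ratio must cover every node id and sum to 1):

nodes = list(range(10))
ratio = np.full(10, 0.1)                          # uniform table, just for the demo
print(negative_sampler([0, 1, 2], ratio, nodes))  # five nodes drawn from outside the path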
def create_node2node_dict(graph):
    '''
    Takes a graph built with DGL and returns a dict that maps every node
    in that graph to the list of nodes reachable from it.
    '''
    src_dst = {}
    for src, dst in zip(graph.edges()[0], graph.edges()[1]):
        src, dst = src.item(), dst.item()
        if src not in src_dst:
            src_dst[src] = []
        src_dst[src].append(dst)
    return src_dst
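On the hypothetical toy coauthor_graph from the sketch above, this gives:

print(create_node2node_dict(coauthor_graph))   # {0: [1], 1: [0, 2], 2: [1]}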
window = 2        # window size used when sampling positive pairs from a metapath
metapaths = []    # all sampled metapaths
num_walks = 10    # number of walks started from every node
walk_len = 100    # length of each path
metapath_type = ['coauthor', 'covenue', 'coauthor', 'cotitle']  # following the paper, the authors use AVAT

# For every graph, build a dict whose keys are node ids and whose values are
# the node ids that the key can reach in that graph.
edge_per_graph = {}
edge_per_graph['coauthor'] = create_node2node_dict(coauthor_graph)
edge_per_graph['cotitle'] = create_node2node_dict(cotitle_graph)
edge_per_graph['covenue'] = create_node2node_dict(covenue_graph)
weights_all_graph = {'coauthor': weights_coauthor, 'cotitle': weights_cotitle, 'covenue': weights_covenue}
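weights_coauthor, weights_cotitle and weights_covenue come from the preprocessing in the repository; the indexing weights_all_graph[rel][cur_node][cand_nodes] below implies one dense node-by-node weight matrix per relation. A hypothetical stand-in that weights every edge of the toy graph equally:

# hypothetical stand-in; the repo derives the real weights, e.g. from how
# many authors/words/venues a paper pair shares
num_nodes = coauthor_graph.num_nodes()
weights_coauthor = np.zeros((num_nodes, num_nodes))
src, dst = coauthor_graph.edges()
weights_coauthor[src.numpy(), dst.numpy()] = 1.0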
def Is_isolate(node):
    # a node is isolated if it has no outgoing edge in any of the relation graphs
    for rel in metapath_type:
        if node in edge_per_graph[rel]:
            return 0
    return 1
for walk in tqdm(range(num_walks)):
    for cur_node in list(range(len(labels))):  # start one walk from every node in the graph
        stop = 0
        path = [cur_node]
        while len(path) < walk_len and stop == 0:
            for rel in metapath_type:
                if len(path) == walk_len or Is_isolate(cur_node):
                    stop = 1
                    break
                if edge_per_graph[rel].get(cur_node, -1) == -1:
                    continue
                cand_nodes = edge_per_graph[rel][cur_node]
                # Weighted neighbor selection. If you do not need it, comment out
                # this line and the next one and drop the p argument of choice().
                weights_per_candnodes = weights_all_graph[rel][cur_node][cand_nodes]
                weighted_ratio = weights_per_candnodes*1.0/np.sum(weights_per_candnodes)
                cur_node = np.random.choice(cand_nodes, size=1, p=weighted_ratio)[0]
                path.append(cur_node)
        metapaths.append(path)
pos_us, pos_vs, neg_vs = [], [], []
nodes = list(range(sum_papers))
ratio = get_negative_ratio(metapaths)
for path in metapaths:
    pos_u, pos_v = positive_sampler(path)
    for u, v in zip(pos_u, pos_v):
        negative_nodes = negative_sampler(path, ratio, nodes)
        neg_vs.append(negative_nodes)
    pos_us.extend(pos_u)
    pos_vs.extend(pos_v)
pos_us = torch.LongTensor(pos_us)
pos_vs = torch.LongTensor(pos_vs)
neg_vs = torch.LongTensor(neg_vs)
The metapaths list collected above is the full set of walks. The elements of pos_us and pos_vs correspond one-to-one as positive pairs, and neg_vs holds the negative samples for each pair; they play the roles of u_i, u_c and u_j in the skip-gram model, as follows.
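In formula form, for a center node u_i, a context node u_c inside its window and K = 5 negatives u_j, the model below minimizes the standard negative-sampling objective (the code additionally clamps the dot products to [-10, 10] for numerical stability):

L = -\log\sigma(\vec{u}_c \cdot \vec{u}_i) - \sum_{j=1}^{K} \log\sigma(-\vec{u}_j \cdot \vec{u}_i)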
# plain metapath2vec
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

"""
u_embedding: Embedding for center word.
v_embedding: Embedding for neighbor words.
"""
class SkipGramModel(nn.Module):

    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)

        # positive pairs: -log sigmoid(u . v)
        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        # negative pairs: -sum_j log sigmoid(-u_j . u)
        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)
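A quick shape check with random indices (emb_size = 100 is arbitrary; K = 5 negatives per positive pair, matching the sampler above):

model = SkipGramModel(emb_size=100, emb_dimension=64)
pos_u = torch.randint(0, 100, (8,))      # 8 center nodes
pos_v = torch.randint(0, 100, (8,))      # 8 context nodes
neg_v = torch.randint(0, 100, (8, 5))    # 5 negatives per pair
print(model(pos_u, pos_v, neg_v))        # scalar loss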
skip_model = SkipGramModel(sum_papers, 64)
optimizer = torch.optim.Adam(skip_model.parameters(), lr=0.001)
losses = []
for epoch in range(500):
    optimizer.zero_grad()
    loss = skip_model(pos_us, pos_vs, neg_vs)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if epoch % 100 == 0:
        print('epoch {0} loss {1}'.format(epoch, loss.item()))
embedding = skip_model.u_embeddings.weight.cpu().data.numpy()

The embedding obtained here holds one vector per node and can be used for downstream tasks.
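For author disambiguation, the usual downstream step is to cluster these paper embeddings so that each cluster corresponds to one real-world author. A sketch using scikit-learn's hierarchical clustering (the repository may use a different clusterer, threshold or metric; the metric parameter requires scikit-learn >= 1.2, older versions call it affinity):

from sklearn.cluster import AgglomerativeClustering

# hypothetical downstream step: the distance_threshold of 0.5 is a guess
clusterer = AgglomerativeClustering(n_clusters=None, distance_threshold=0.5,
                                    metric='cosine', linkage='average')
pred_labels = clusterer.fit_predict(embedding)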
Reposted from: https://blog.csdn.net/xiadada2/article/details/117295281