Mol2vec

Mol2vec | 基于无监督机器学习的分子亚结构向量表示方法

2020-04-01 15:16 1597人阅读评论(0)

受自然语言处理技术的启发，本文介绍的Mol2vec是一种无监督的机器学习方法，用于学习分子亚结构的向量表示。在Word2vec模型中，密切相关的单词的向量在向量空间中彼此接近；与之类似，在Mol2vec学习到的分子亚结构向量中，化学上相关的亚结构的向量指向相似的方向。将各个亚结构的向量求和，即可把整个化合物编码为一个向量，再将其输入有监督的机器学习方法，例如用于预测化合物的性质。底层的亚结构向量嵌入是通过在由所有可获得的化学物质组成的所谓化合物“语料库”上训练无监督机器学习方法得到的。Mol2vec模型只需预训练一次，就能产生稠密的向量表示，并克服了常见化合物特征表示的缺点，例如稀疏性和位碰撞。该方法的预测能力已在多个化合物性质和生物活性数据集上得到验证，并与作为参考化合物表示的Morgan指纹的结果进行了比较。Mol2vec可以方便地与ProtVec结合使用，后者将同样的Word2vec思想应用于蛋白质序列，由此得到一种不依赖于序列比对的蛋白质化学计量学（proteochemometric）方法，因此也可以方便地用于序列相似性较低的蛋白质。

Mol2vec基本实例

导入库

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
import matplotlib.pyplot as plt
import seaborn as sns
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg
% matplotlib inline

氨基酸的标准SMILES

# SMILES strings for the 20 amino acids used throughout this example.
# The list is positionally aligned with `aa_codes` below (entry i of one
# describes entry i of the other), so the two must be kept in the same order.
aa_smis = ['CC(N)C(=O)O', 'N=C(N)NCCCC(N)C(=O)O', 'NC(=O)CC(N)C(=O)O', 'NC(CC(=O)O)C(=O)O',
          'NC(CS)C(=O)O', 'NC(CCC(=O)O)C(=O)O', 'NC(=O)CCC(N)C(=O)O', 'NCC(=O)O',
          'NC(Cc1cnc[nH]1)C(=O)O', 'CCC(C)C(N)C(=O)O', 'CC(C)CC(N)C(=O)O', 'NCCCCC(N)C(=O)O',
          'CSCCC(N)C(=O)O', 'NC(Cc1ccccc1)C(=O)O', 'O=C(O)C1CCCN1', 'NC(CO)C(=O)O',
          'CC(O)C(N)C(=O)O', 'NC(Cc1c[nH]c2ccccc12)C(=O)O', 'NC(Cc1ccc(O)cc1)C(=O)O',
          'CC(C)C(N)C(=O)O']
# Three-letter amino acid codes, in the same order as `aa_smis`.
aa_codes = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLU', 'GLN', 'GLY', 'HIS', 'ILE', 
            'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL']

绘制分子

# Parse every SMILES string into an RDKit molecule, then render all of them
# in a grid (5 per row), using the three-letter codes as legends.
aas = list(map(Chem.MolFromSmiles, aa_smis))
Draw.MolsToGridImage(aas, legends=aa_codes, molsPerRow=5, useSVG=False)

将分子的子结构编码为单词

# Encode the first molecule (alanine, aas[0]) as a "sentence" of substructure
# identifiers. The second argument (1) is presumably the Morgan radius --
# confirm against the mol2vec documentation.
sentence = mol2alt_sentence(aas[0], 1)
# Bare expression so the notebook displays the resulting identifier list.
sentence

['2246728737',
 '3537119515',
 '2245273601',
 '2655406212',
 '847957139',
 '2599973650',
 '2246699815',
 '3855312692',
 '864942730',
 '1510328189',
 '864662311',
 '1533864325']

# Depict, on alanine, the substructure that corresponds to one identifier from
# the sentence above (third argument presumably the radius -- confirm).
depict_identifier(aas[0], 2246728737, 1)

# Build a table with one entry per identifier of the sentence; the molecule and
# the sentence are repeated so that all three lists have the same length.
# NOTE(review): the trailing 5 and 1 look like column count and radius -- confirm
# against the mol2vec.helpers.IdentifierTable signature.
it = IdentifierTable(sentence, [aas[0]]*len(sentence), [sentence]*len(sentence), 5, 1)
it

探索Mol2vec嵌入

from gensim.models import word2vec

Load a pre-trained Mol2vec model which was trained on 20 million compounds downloaded from ZINC using:

radius 1
UNK to replace all identifiers that appear less than 4 times
skip-gram and window size of 10
resulting in 300 dimensional embeddings

# Load the pre-trained Mol2vec (gensim Word2Vec) model from disk.
# NOTE: gensim's load() unpickles the file, so only load model files from a
# trusted source.
model = word2vec.Word2Vec.load('model_300dim.pkl')
# Number of identifiers in the model vocabulary.
# NOTE(review): `wv.vocab` exists in gensim < 4 only; gensim 4 replaced it with
# `wv.key_to_index` -- confirm which gensim version this notebook targets.
len(model.wv.vocab.keys())

21003

# Look up the embedding vector of a single substructure identifier (keys are strings).
# NOTE(review): `word_vec` is deprecated in gensim 4 in favour of `get_vector` -- confirm version.
model.wv.word_vec('2246728737')

array([ 0.33040667, -0.05939463, -0.07027855,  0.02016276, -0.14508724,
       -0.48153105, -0.0236442 , -0.29940802, -0.32859853, -0.10312873,
        0.2754865 , -0.44724599,  0.11692217, -0.07740143, -0.27273551,
        0.01656464, -0.38155788,  0.21124844, -0.2526589 ,  0.02486573,
       -0.18147612,  0.06457369, -0.02083556,  0.00237304,  0.49741328,
       -0.26439488, -0.3907159 , -0.08300006,  0.40189591,  0.12206856,
        0.12848134,  0.09424117,  0.39673623,  0.21401069, -0.66497356,
       -0.37454313,  0.21419369,  0.38191557,  0.02547926,  0.30835193,
        0.08150365,  0.39426079,  0.38589329,  0.36875907, -0.16996302,
       -0.35906705, -0.24990252, -0.05080098,  0.27163431,  0.25132433,
        0.50707805, -0.01323589,  0.00109474, -0.33966413,  0.0357844 ,
       -0.03588478, -0.10111088, -0.30536383, -0.16563182,  0.13768682,
       -0.07433363,  0.34545162,  0.56460357,  0.43559453, -0.27687258,
       -0.09035115, -0.04313745,  0.30894887, -0.33147016, -0.08685276,
       -0.30704996, -0.33592924,  0.50410104, -0.58678478, -0.04501853,
        0.15201668,  0.2340053 ,  0.23933883, -0.25818124,  0.05749293,
        0.36298212,  0.31270099,  0.20382218, -0.10048786, -0.43536556,
       -0.0063422 , -0.03890313,  0.39855656, -0.01665197, -0.26888278,
        0.72951472,  0.15831621,  0.10398436, -0.16989805,  0.52441496,
       -0.10802919,  0.27680042,  0.49415818, -0.46270421,  0.04257036], dtype=float32)

# One identifier sentence per amino acid, in the same order as `aas`.
aa_sentences = [mol2alt_sentence(mol, 1) for mol in aas]
# Flatten all sentences into one list, then deduplicate the identifiers.
flat_list = [identifier for sentence_ids in aa_sentences for identifier in sentence_ids]
aa_identifiers_unique = set(flat_list)

# Table with one row per unique identifier; the identifier is kept both as a
# column and as the row index so rows can be looked up by identifier.
df_vec = pd.DataFrame({'identifier': list(aa_identifiers_unique)})
df_vec.index = df_vec['identifier']

# Bare expression: the notebook displays the number of unique identifiers.
len(df_vec)

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Two-stage projection: PCA down to 30 components, then t-SNE (cosine metric)
# down to 2 dimensions.
pca_model = PCA(n_components=30)
tsne_model = TSNE(n_components=2, perplexity=10, n_iter=1000, metric='cosine')

# Embedding vectors are collected by iterating the same set that filled
# df_vec['identifier'], so row i of the projection belongs to row i of df_vec.
identifier_vectors = [model.wv.word_vec(identifier) for identifier in aa_identifiers_unique]
pca_reduced = pca_model.fit_transform(identifier_vectors)
tsne_pca = tsne_model.fit_transform(pca_reduced)

# Store the two projected coordinates as columns and preview the first rows.
df_vec['PCA-t-SNE-c1'] = tsne_pca[:, 0]
df_vec['PCA-t-SNE-c2'] = tsne_pca[:, 1]
df_vec.head(3)

# Convert the table to nested dicts so that get_values can index it as
# projections[column_name][identifier].
projections = df_vec.to_dict()

def get_values(identifier, projections):
    """Return the 2D projection of a substructure identifier.

    Args:
        identifier: Substructure identifier; it is converted with ``str()``
            before the lookup, so integers and strings are both accepted.
        projections: Nested mapping with the keys ``'PCA-t-SNE-c1'`` and
            ``'PCA-t-SNE-c2'``, each mapping identifier strings to a coordinate.

    Returns:
        A NumPy array holding the two coordinates, first component first.

    Raises:
        KeyError: If a column key or the identifier is missing from ``projections``.
    """
    key = str(identifier)
    first_component = projections['PCA-t-SNE-c1'][key]
    second_component = projections['PCA-t-SNE-c2'][key]
    return np.array((first_component, second_component))

# Example lookup: an integer identifier works because get_values converts it
# to its string key before indexing `projections`.
get_values(2246728737, projections)

array([  9.18119335,  84.72703552], dtype=float32)

绘制氨基酸（ALA）亚结构向量

# 2D coordinates of every substructure identifier in the alanine sentence.
aa_values = [get_values(identifier, projections) for identifier in aa_sentences[0]]
# One label more than there are vectors: the extra 'ALA' presumably labels the
# summed vector that plot_2D_vectors draws -- confirm against mol2vec.helpers.
# The trailing ';' suppresses the notebook's echo of the return value.
plot_2D_vectors(aa_values, vector_labels=aa_sentences[0] + ['ALA']);

# Create a 5x4 grid of axes sharing both x and y axes, one panel per amino acid.
# The nested tuple unpacking binds each axes object to the matching three-letter code.
f, ((ALA, ARG, ASN, ASP), (CYS, GLU, GLN, GLY), (HIS, ILE, LEU, LYS), (MET, PHE, PRO, SER), (THR, TRP, TYR, VAL)) = plt.subplots(5,4, 
                                                    squeeze=True, sharex=True, sharey=True, 
                                                    figsize=(6.4*2,  4.4*2))

# Walk molecules, codes and axes in lockstep; the axes tuple lists the panels
# in the same order as `aa_codes`.
for aa,name,ax in zip(aas, aa_codes, (ALA, ARG, ASN, ASP, CYS, GLU, GLN, GLY, HIS, ILE, LEU, LYS, MET, PHE, PRO, SER, THR, TRP, TYR, VAL)):
    # 2D coordinates of each substructure identifier of this amino acid.
    pca_subs = [get_values(x, projections) for x in mol2alt_sentence(aa, 1)]
    # The same fixed x/y limits are passed for every panel.
    plot_2D_vectors(pca_subs, ax=ax, min_max_x=(-1000,1000), min_max_y=(-2000, 2000))
    # Hide both axes; only the vectors and the label remain visible.
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    # Write the amino acid code in the lower-right corner (axes-relative coordinates).
    ax.text(0.95, 0.01, u"%s" % name,
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, weight='bold',
            fontsize=10)

参考：

1. Jaeger S, Fulle S, Turk S. Mol2vec: Unsupervised Machine Learning Approach with Chemical Intuition[J]. Journal of Chemical Information and Modeling, 2018, 58(1): 27-35. DOI: 10.1021/acs.jcim.7b00616.

2. https://mol2vec.readthedocs.io/en/latest/

3. https://github.com/samoturk/mol2vec