自然语言处理(Natural Language Processing,简称NLP),是为各类企业及开发者提供的用于文本分析及挖掘的核心工具,旨在帮助用户高效地处理文本,已经广泛应用在电商、文娱、司法、公安、金融、医疗、电力等行业客户的多项业务中,取得了良好的效果。








  import pandas as pd

  # Load the review dataset from the Excel workbook.
  # read_excel has no `encoding` parameter in current pandas releases (passing
  # it raises TypeError), so the keyword is not supplied here.
  data = pd.read_excel("data.xls")
  # Show the first rows as a quick sanity check.
  print(data.head())



  # Dataset dimensions.
  dataset_shape = data.shape
  print(dataset_shape)
  # Column names of the dataset.
  column_names = data.columns.values
  print(column_names)
  # Number of records in each sentiment class.
  class_counts = data['Class'].value_counts()
  print(class_counts)


  1. ( 8186, 2)
  2. array([ u'Comment', u'Class'], dtype=object)
  3. 1 3042
  4. -1 2657
  5. 0 2487
  6. Name: Class, dtype: int64




  1. # 导入中文分词库jieba
  2. import jieba
  3. import numpy as np


  # Segment every review with jieba and join the tokens with single spaces.
  cutted = []
  for row in data.values:
      comment = row[0]
      try:
          raw_words = " ".join(jieba.cut(comment))
      except AttributeError:
          # jieba could not handle this cell (presumably a non-text value such
          # as an empty cell): show it and substitute a placeholder comment.
          print(comment)
          raw_words = u"还行 一般吧"
      cutted.append(raw_words)
  cutted_array = np.array(cutted)
  # Build a new frame whose Comment column holds the segmented text.
  data_cutted = pd.DataFrame({
      'Comment': cutted_array,
      'Class': data['Class'],
  })





  1. # 导入第三方库wordcloud
  2. from wordcloud import WordCloud
  3. import matplotlib.pyplot as plt


  # Word cloud of the positive reviews (Class == 1).
  # Comments are joined with a space so the last token of one review does not
  # fuse with the first token of the next one.
  # NOTE(review): 'Courier.ttf' presumably lacks CJK glyphs - confirm the font
  # can render the Chinese tokens.
  positive_text = ' '.join(data_cutted['Comment'][data_cutted['Class'] == 1])
  wc = WordCloud(font_path='Courier.ttf')
  wc.generate(positive_text)
  plt.axis('off')
  plt.imshow(wc)
  plt.show()



  # Word cloud of the neutral reviews (Class == 0).
  # Comments are joined with a space so the last token of one review does not
  # fuse with the first token of the next one.
  # NOTE(review): 'Courier.ttf' presumably lacks CJK glyphs - confirm the font
  # can render the Chinese tokens.
  neutral_text = ' '.join(data_cutted['Comment'][data_cutted['Class'] == 0])
  wc = WordCloud(font_path='Courier.ttf')
  wc.generate(neutral_text)
  plt.axis('off')
  plt.imshow(wc)
  plt.show()



  # Word cloud of the negative reviews (Class == -1).
  # Comments are joined with a space so the last token of one review does not
  # fuse with the first token of the next one.
  # NOTE(review): 'Courier.ttf' presumably lacks CJK glyphs - confirm the font
  # can render the Chinese tokens.
  negative_text = ' '.join(data_cutted['Comment'][data_cutted['Class'] == -1])
  wc = WordCloud(font_path='Courier.ttf')
  wc.generate(negative_text)
  plt.axis('off')
  plt.imshow(wc)
  plt.show()



  # Read the stop-word file: one entry per line, surrounding whitespace removed.
  import codecs
  with codecs.open('stopwords.txt', 'r', encoding='utf-8') as f:
      stopwords = [item.strip() for item in f]
  # Preview the first 200 stop words on one space-separated line. The former
  # `print(item,)` is a leftover of the Python 2 `print item,` idiom and emits
  # one word per line under Python 3.
  print(' '.join(stopwords[0:200]))



  # Register the stop-word file so keyword extraction filters those words out.
  import jieba.analyse
  stopwords_file = 'stopwords.txt'
  jieba.analyse.set_stop_words(stopwords_file)


  # Top-20 keywords of the positive reviews (Class == 1).
  # Reviews are joined with a space so tokens at review boundaries stay apart.
  positive_text = ' '.join(data_cutted['Comment'][data_cutted['Class'] == 1])
  keywords_pos = jieba.analyse.extract_tags(positive_text, topK=20)
  # Print all keywords on one space-separated line.
  print(' '.join(keywords_pos))


不错 正品 赠品 五分 发货 东西 满意 机子 喜欢 收到 很漂亮 充电 好评 很快 卖家 速度 评价 流畅 快递 物流


  # Top-20 keywords of the neutral reviews (Class == 0).
  # Reviews are joined with a space so tokens at review boundaries stay apart.
  neutral_text = ' '.join(data_cutted['Comment'][data_cutted['Class'] == 0])
  keywords_med = jieba.analyse.extract_tags(neutral_text, topK=20)
  # Print all keywords on one space-separated line.
  print(' '.join(keywords_med))


充电 不错 发热 外观 感觉 电池 机子 问题 赠品 有点 无线 发烫 换货 软件 快递 安卓 内存 退货 知道 售后


  # Top-20 keywords of the negative reviews (Class == -1).
  # Reviews are joined with a space so tokens at review boundaries stay apart.
  negative_text = ' '.join(data_cutted['Comment'][data_cutted['Class'] == -1])
  keywords_neg = jieba.analyse.extract_tags(negative_text, topK=20)
  # Print all keywords on one space-separated line.
  print(' '.join(keywords_neg))


差评 售后 垃圾 赠品 退货 问题 换货 充电 降价 发票 充电器 东西 刚买 发热 无线 机子 死机 收到 质量 15








  • __init__为类的初始化函数,输入参数classifier_type和vector_type,分别代表分类模型的类型和向量化方法的类型。
  • fit()函数,来实现向量化与模型建立的过程。


  # Text vectorizers
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.feature_extraction.text import CountVectorizer
  # SVM-style classifiers
  from sklearn.svm import SVC
  from sklearn.svm import LinearSVC
  from sklearn.linear_model import SGDClassifier
  # Train/test splitting and cross-validation. Current scikit-learn releases
  # provide these in sklearn.model_selection; the legacy
  # sklearn.cross_validation module is only used as a fallback on old installs.
  try:
      from sklearn.model_selection import train_test_split
      from sklearn.model_selection import cross_val_score
  except ImportError:
      from sklearn.cross_validation import train_test_split
      from sklearn.cross_validation import cross_val_score
  # Evaluation metrics
  from sklearn import metrics
  # Text sentiment classification class: CommentClassifier
  class CommentClassifier:
      """Sentiment classifier that pairs a text vectorizer with an SVM-style model.

      Parameters
      ----------
      classifier_type : int
          1 selects SVC (linear kernel), 2 selects LinearSVC, any other value
          selects SGDClassifier.
      vector_type : int
          0 selects CountVectorizer, 1 selects TfidfVectorizer with
          ``use_idf=False``, any other value selects TfidfVectorizer.
      """

      def __init__(self, classifier_type, vector_type):
          self.classifier_type = classifier_type  # which classifier to train
          self.vector_type = vector_type  # which vectorization scheme to use

      def _build_vectorizer(self, max_df):
          """Return an unfitted vectorizer for ``self.vector_type``."""
          # max_df must be passed by keyword: the first positional parameter of
          # both vectorizer classes is `input`, so a positional value would
          # never reach max_df.
          shared = dict(max_df=max_df, stop_words=stopwords, ngram_range=(1, 3))
          if self.vector_type == 0:
              # NOTE(review): this mode is described as a 0/1 model, but
              # CountVectorizer counts occurrences unless binary=True - confirm.
              return CountVectorizer(**shared)
          if self.vector_type == 1:
              return TfidfVectorizer(use_idf=False, **shared)
          return TfidfVectorizer(**shared)

      def _build_model(self):
          """Return an unfitted classifier for ``self.classifier_type``."""
          if self.classifier_type == 1:
              return SVC(kernel='linear', gamma=10 ** -5, C=1)
          if self.classifier_type == 2:
              return LinearSVC()
          return SGDClassifier()

      def fit(self, train_x, train_y, max_df):
          """Fit the vectorizer on ``train_x`` and train the classifier.

          ``train_x`` is an iterable of documents, ``train_y`` the matching
          labels, and ``max_df`` is forwarded to the vectorizer's ``max_df``.
          The fitted objects are stored on ``self.vectorizer`` and ``self.model``.
          """
          list_text = list(train_x)
          self.vectorizer = self._build_vectorizer(max_df).fit(list_text)
          self.array_trainx = self.vectorizer.transform(list_text)
          self.array_trainy = train_y
          self.model = self._build_model().fit(self.array_trainx, self.array_trainy)

      def predict_value(self, test_x):
          """Return the predicted labels for the documents in ``test_x``."""
          list_text = list(test_x)
          self.array_testx = self.vectorizer.transform(list_text)
          array_predict = self.model.predict(self.array_testx)
          return array_predict

      def predict_proba(self, test_x):
          """Return the model's class probabilities for ``test_x``.

          Raises AttributeError when the fitted model does not expose
          ``predict_proba``.
          """
          if not hasattr(self.model, 'predict_proba'):
              raise AttributeError(
                  '%s does not support predict_proba'
                  % type(self.model).__name__)
          list_text = list(test_x)
          self.array_testx = self.vectorizer.transform(list_text)
          array_score = self.model.predict_proba(self.array_testx)
          return array_score
  • 使用train_test_split()函数划分训练集和测试集。训练集:80%;测试集:20%。
  • 建立classifier_type和vector_type两个参数的取值列表,来表示选择的向量化方法以及分类模型
  • 输出每种向量化方法和分类模型的组合所对应的分类评价结果,内容包括混淆矩阵以及含Precision、Recall、F1-score三个指标的评分矩阵


  # Split the data into training and test sets (test_size=0.2, fixed seed).
  # .values yields the underlying arrays; Series.ravel() is deprecated in
  # recent pandas releases.
  comments = data_cutted['Comment'].values.astype('U')
  classes = data_cutted['Class'].values
  train_x, test_x, train_y, test_y = train_test_split(
      comments, classes, test_size=0.2, random_state=4)
  classifier_list = [1, 2, 3]
  vector_list = [0, 1, 2]
  class_labels = [-1, 0, 1]
  # Evaluate every classifier / vectorizer combination on the test set.
  for classifier_type in classifier_list:
      for vector_type in vector_list:
          commentCls = CommentClassifier(classifier_type, vector_type)
          # max_df is set to 0.98
          commentCls.fit(train_x, train_y, 0.98)
          # The former `classifier_type == 0` branch could never run because
          # classifier_list holds only 1, 2 and 3, and its probability result
          # was unused, so a single reporting path remains.
          value_result = commentCls.predict_value(test_x)
          print(classifier_type, vector_type)
          print('classification report')
          print(metrics.classification_report(test_y, value_result, labels=class_labels))
          print('confusion matrix')
          print(metrics.confusion_matrix(test_y, value_result, labels=class_labels))


  1. 1 0
  2. classification report
  3. precision recall f1-score support
  4. -1 0.68 0.62 0.65 519
  5. 0 0.55 0.49 0.52 485
  6. 1 0.75 0.86 0.80 634
  7. avg / total 0.67 0.68 0.67 1638
  8. confusion matrix
  9. [[ 324 130 65]
  10. [ 131 236 118]
  11. [ 24 64 546]]
  12. 1 1
  13. classification report
  14. precision recall f1-score support
  15. -1 0.71 0.74 0.72 519
  16. 0 0.58 0.54 0.56 485
  17. 1 0.84 0.85 0.85 634
  18. avg / total 0.72 0.72 0.72 1638
  19. confusion matrix
  20. [[ 385 109 25]
  21. [ 145 263 77]
  22. [ 15 80 539]]
  23. 1 2
  24. classification report
  25. precision recall f1-score support
  26. -1 0.70 0.74 0.72 519
  27. 0 0.58 0.52 0.55 485
  28. 1 0.84 0.86 0.85 634
  29. avg / total 0.72 0.72 0.72 1638
  30. confusion matrix
  31. [[ 386 106 27]
  32. [ 151 254 80]
  33. [ 14 76 544]]
  34. 2 0
  35. classification report
  36. precision recall f1-score support
  37. -1 0.70 0.62 0.66 519
  38. 0 0.56 0.51 0.54 485
  39. 1 0.76 0.88 0.82 634
  40. avg / total 0.68 0.69 0.68 1638
  41. confusion matrix
  42. [[ 320 135 64]
  43. [ 122 248 115]
  44. [ 16 57 561]]
  45. 2 1
  46. classification report
  47. precision recall f1-score support
  48. -1 0.69 0.73 0.71 519
  49. 0 0.61 0.48 0.54 485
  50. 1 0.81 0.91 0.86 634
  51. avg / total 0.71 0.73 0.72 1638
  52. confusion matrix
  53. [[ 377 108 34]
  54. [ 154 233 98]
  55. [ 12 44 578]]
  56. 2 2
  57. classification report
  58. precision recall f1-score support
  59. -1 0.70 0.74 0.72 519
  60. 0 0.61 0.50 0.55 485
  61. 1 0.83 0.91 0.87 634
  62. avg / total 0.72 0.73 0.73 1638
  63. confusion matrix
  64. [[ 383 108 28]
  65. [ 154 241 90]
  66. [ 13 43 578]]
  67. 3 0
  68. classification report
  69. precision recall f1-score support
  70. -1 0.69 0.69 0.69 519
  71. 0 0.58 0.47 0.52 485
  72. 1 0.79 0.90 0.84 634
  73. avg / total 0.70 0.71 0.70 1638
  74. confusion matrix
  75. [[ 359 118 42]
  76. [ 148 228 109]
  77. [ 14 47 573]]
  78. 3 1
  79. classification report
  80. precision recall f1-score support
  81. -1 0.70 0.74 0.72 519
  82. 0 0.60 0.49 0.54 485
  83. 1 0.81 0.88 0.84 634
  84. avg / total 0.71 0.72 0.71 1638
  85. confusion matrix
  86. [[ 386 96 37]
  87. [ 152 240 93]
  88. [ 13 66 555]]
  89. 3 2
  90. classification report
  91. precision recall f1-score support
  92. -1 0.65 0.75 0.69 519
  93. 0 0.63 0.49 0.55 485
  94. 1 0.83 0.86 0.85 634
  95. avg / total 0.71 0.72 0.71 1638
  96. confusion matrix
  97. [[ 389 98 32]
  98. [ 169 236 80]
  99. [ 45 41 548]]



  # Keep only the rows whose Class is not 0. The explicit copy makes data_bi an
  # independent frame, so assigning new columns to it does not trigger pandas'
  # SettingWithCopyWarning.
  data_bi = data_cutted[data_cutted['Class'] != 0].copy()
  # Record count per remaining class.
  print(data_bi['Class'].value_counts())


  1. 1 3042
  2. -1 2658
  3. Name: Class, dtype: int64


  1. 1 0
  2. classification report
  3. precision recall f1-score support
  4. -1 0.90 0.79 0.84 537
  5. 1 0.83 0.92 0.87 603
  6. avg / total 0.86 0.86 0.86 1140
  7. confusion matrix
  8. [[ 425 112]
  9. [ 48 555]]
  10. 1 1
  11. classification report
  12. precision recall f1-score support
  13. -1 0.87 0.92 0.90 537
  14. 1 0.93 0.88 0.90 603
  15. avg / total 0.90 0.90 0.90 1140
  16. confusion matrix
  17. [[ 496 41]
  18. [ 71 532]]
  19. 1 2
  20. classification report
  21. precision recall f1-score support
  22. -1 0.88 0.93 0.90 537
  23. 1 0.93 0.88 0.91 603
  24. avg / total 0.90 0.90 0.90 1140
  25. confusion matrix
  26. [[ 497 40]
  27. [ 70 533]]
  28. 2 0
  29. classification report
  30. precision recall f1-score support
  31. -1 0.90 0.80 0.85 537
  32. 1 0.84 0.92 0.88 603
  33. avg / total 0.87 0.86 0.86 1140
  34. confusion matrix
  35. [[ 431 106]
  36. [ 48 555]]
  37. 2 1
  38. classification report
  39. precision recall f1-score support
  40. -1 0.92 0.91 0.91 537
  41. 1 0.92 0.93 0.92 603
  42. avg / total 0.92 0.92 0.92 1140
  43. confusion matrix
  44. [[ 486 51]
  45. [ 43 560]]
  46. 2 2
  47. classification report
  48. precision recall f1-score support
  49. -1 0.93 0.91 0.92 537
  50. 1 0.92 0.94 0.93 603
  51. avg / total 0.92 0.92 0.92 1140
  52. confusion matrix
  53. [[ 488 49]
  54. [ 39 564]]
  55. 3 0
  56. classification report
  57. precision recall f1-score support
  58. -1 0.92 0.82 0.87 537
  59. 1 0.86 0.94 0.90 603
  60. avg / total 0.89 0.88 0.88 1140
  61. confusion matrix
  62. [[ 443 94]
  63. [ 38 565]]
  64. 3 1
  65. classification report
  66. precision recall f1-score support
  67. -1 0.92 0.91 0.91 537
  68. 1 0.92 0.93 0.92 603
  69. avg / total 0.92 0.92 0.92 1140
  70. confusion matrix
  71. [[ 486 51]
  72. [ 41 562]]
  73. 3 2
  74. classification report
  75. precision recall f1-score support
  76. -1 0.88 0.93 0.90 537
  77. 1 0.93 0.89 0.91 603
  78. avg / total 0.91 0.91 0.91 1140
  79. confusion matrix
  80. [[ 497 40]
  81. [ 67 536]]








  • 好评:快 就是 手感 满意 也好 喜欢 也 流畅 很 服务态度 实用 超快 挺快 用着 速度 礼品 也不错 非常好 挺好 感觉 才来 还行 好看 也快 不错的 送了 非常不错 超级 赞 好多东西 很实用 各方面 挺好的 很多 漂亮 配件 还不错 也多 特意 慢 满分 好用 非常漂亮......
  • 差评:不多说 上当 差差 刚用 服务差 一点也不 不要 简直 还是去 实体店 大家 保证 不肯 生气 开发票 磨损 后悔 印记 网 什么破 烂烂 左边 失效 太 骗 掉价 走下坡路 不说了 彻底 三星手机 自营 几次 真心 别的 看完 简单说 机会 这是 生气了 触动 缝隙 冲动了 失望......



  import pandas as pd
  from gensim.models import Doc2Vec
  from gensim.models.doc2vec import TaggedDocument
  import logging

  # Configure root logging at INFO level with timestamped messages.
  logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  # .values yields the underlying arrays; Series.ravel() is deprecated in
  # recent pandas releases.
  train_x = data_bi['Comment'].values
  train_y = data_bi['Class'].values
  def labelizeReviews(reviews, label_type):
      """Wrap every review in a TaggedDocument tagged '<label_type>_<index>'.

      Each review string is split on single spaces to obtain its word list;
      the index is the review's position in ``reviews``. Returns a list.
      """
      return [
          TaggedDocument(review.split(" "), ['%s_%s' % (label_type, index)])
          for index, review in enumerate(reviews)
      ]
  # Tag every review as "TRAIN_<index>".
  train_x = labelizeReviews(train_x, "TRAIN")
  # Build the Doc2Vec model. `vector_size` and `epochs` are the current names
  # of the former `size` and `iter` constructor arguments.
  size = 300
  all_data = list(train_x)
  model = Doc2Vec(min_count=1, window=8, vector_size=size, sample=1e-4, negative=5, hs=0, epochs=5, workers=8)
  model.build_vocab(all_data)
  # Train for 10 epochs in one call. train() needs explicit total_examples and
  # epochs, and calling it repeatedly in a loop restarts the learning-rate
  # schedule on every call.
  model.train(train_x, total_examples=model.corpus_count, epochs=10)
  # Similarity of every review to the documents tagged TRAIN_0 (stored in pos)
  # and TRAIN_1 (stored in neg).
  # NOTE(review): this assumes the first two reviews are the extremely positive
  # and extremely negative references - confirm against the data.
  # NOTE(review): recent gensim releases expose the document vectors as
  # `model.dv`; `docvecs` is the older name - confirm for the installed version.
  pos = []
  neg = []
  for i in range(len(train_x)):
      tag = "TRAIN_{}".format(i)
      pos.append(model.docvecs.similarity("TRAIN_0", tag))
      neg.append(model.docvecs.similarity("TRAIN_1", tag))
  # Store both similarity lists on the data frame as PosSim and NegSim.
  data_bi[u'PosSim'] = pos
  data_bi[u'NegSim'] = neg


  1. 2017- 05- 27 14: 30: 28, 393 : INFO : collecting all words and their counts
  2. 2017- 05- 27 14: 30: 28, 394 : INFO : PROGRESS: at example # 0, processed 0 words ( 0/s), 0 word types, 0 tags
  3. 2017- 05- 27 14: 30: 28, 593 : INFO : collected 10545 word types and 5700 unique tags from a corpus of 5700 examples and 482148 words
  4. 2017- 05- 27 14: 30: 28, 595 : INFO : Loading a fresh vocabulary
  5. 2017- 05- 27 14: 30: 28, 649 : INFO : min_count= 1 retains 10545 unique words ( 100% of original 10545, drops 0)
  6. 2017- 05- 27 14: 30: 28, 650 : INFO : min_count= 1 leaves 482148 word corpus ( 100% of original 482148, drops 0)
  7. 2017- 05- 27 14: 30: 28, 705 : INFO : deleting the raw counts dictionary of 10545 items
  8. 2017- 05- 27 14: 30: 28, 706 : INFO : sample= 0. 0001 downsamples 217 most-common words
  9. 2017- 05- 27 14: 30: 28, 707 : INFO : downsampling leaves estimated 108356 word corpus ( 22. 5% of prior 482148)
  10. 2017- 05- 27 14: 30: 28, 709 : INFO : estimated required memory for 10545 words and 300 dimensions: 38560500 bytes
  11. 2017- 05- 27 14: 30: 28, 784 : INFO : resetting layer weights
  12. 2017- 05- 27 14: 30: 29, 120 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  13. 2017- 05- 27 14: 30: 29, 121 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  14. 2017- 05- 27 14: 30: 30, 176 : INFO : PROGRESS: at 10. 24% examples, 72316 words/s, in_qsize 15, out_qsize 0
  15. 2017- 05- 27 14: 30: 31, 211 : INFO : PROGRESS: at 29. 96% examples, 91057 words/s, in_qsize 16, out_qsize 0
  16. 2017- 05- 27 14: 30: 32, 218 : INFO : PROGRESS: at 66. 30% examples, 126742 words/s, in_qsize 15, out_qsize 0
  17. 2017- 05- 27 14: 30: 33, 231 : INFO : PROGRESS: at 86. 00% examples, 122698 words/s, in_qsize 15, out_qsize 0
  18. 2017- 05- 27 14: 30: 33, 571 : INFO : worker thread finished; awaiting finish of 7 more threads
  19. 2017- 05- 27 14: 30: 33, 573 : INFO : worker thread finished; awaiting finish of 6 more threads
  20. 2017- 05- 27 14: 30: 33, 605 : INFO : worker thread finished; awaiting finish of 5 more threads
  21. 2017- 05- 27 14: 30: 33, 647 : INFO : worker thread finished; awaiting finish of 4 more threads
  22. 2017- 05- 27 14: 30: 33, 678 : INFO : worker thread finished; awaiting finish of 3 more threads
  23. 2017- 05- 27 14: 30: 33, 696 : INFO : worker thread finished; awaiting finish of 2 more threads
  24. 2017- 05- 27 14: 30: 33, 711 : INFO : worker thread finished; awaiting finish of 1 more threads
  25. 2017- 05- 27 14: 30: 33, 722 : INFO : worker thread finished; awaiting finish of 0 more threads
  26. 2017- 05- 27 14: 30: 33, 724 : INFO : training on 2410740 raw words ( 570332 effective words) took 4. 6s, 124032 effective words/s
  27. 2017- 05- 27 14: 30: 33, 727 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  28. 2017- 05- 27 14: 30: 33, 731 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  29. 2017- 05- 27 14: 30: 34, 753 : INFO : PROGRESS: at 36. 38% examples, 212225 words/s, in_qsize 15, out_qsize 0
  30. 2017- 05- 27 14: 30: 35, 762 : INFO : PROGRESS: at 75. 24% examples, 216859 words/s, in_qsize 16, out_qsize 0
  31. 2017- 05- 27 14: 30: 36, 243 : INFO : worker thread finished; awaiting finish of 7 more threads
  32. 2017- 05- 27 14: 30: 36, 244 : INFO : worker thread finished; awaiting finish of 6 more threads
  33. 2017- 05- 27 14: 30: 36, 264 : INFO : worker thread finished; awaiting finish of 5 more threads
  34. 2017- 05- 27 14: 30: 36, 306 : INFO : worker thread finished; awaiting finish of 4 more threads
  35. 2017- 05- 27 14: 30: 36, 311 : INFO : worker thread finished; awaiting finish of 3 more threads
  36. 2017- 05- 27 14: 30: 36, 320 : INFO : worker thread finished; awaiting finish of 2 more threads
  37. 2017- 05- 27 14: 30: 36, 330 : INFO : worker thread finished; awaiting finish of 1 more threads
  38. 2017- 05- 27 14: 30: 36, 336 : INFO : worker thread finished; awaiting finish of 0 more threads
  39. 2017- 05- 27 14: 30: 36, 338 : INFO : training on 2410740 raw words ( 570008 effective words) took 2. 6s, 219523 effective words/s
  40. 2017- 05- 27 14: 30: 36, 339 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  41. 2017- 05- 27 14: 30: 36, 341 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  42. 2017- 05- 27 14: 30: 37, 353 : INFO : PROGRESS: at 28. 23% examples, 177496 words/s, in_qsize 16, out_qsize 0
  43. 2017- 05- 27 14: 30: 38, 372 : INFO : PROGRESS: at 66. 30% examples, 193880 words/s, in_qsize 16, out_qsize 0
  44. 2017- 05- 27 14: 30: 39, 061 : INFO : worker thread finished; awaiting finish of 7 more threads
  45. 2017- 05- 27 14: 30: 39, 062 : INFO : worker thread finished; awaiting finish of 6 more threads
  46. 2017- 05- 27 14: 30: 39, 074 : INFO : worker thread finished; awaiting finish of 5 more threads
  47. 2017- 05- 27 14: 30: 39, 115 : INFO : worker thread finished; awaiting finish of 4 more threads
  48. 2017- 05- 27 14: 30: 39, 122 : INFO : worker thread finished; awaiting finish of 3 more threads
  49. 2017- 05- 27 14: 30: 39, 132 : INFO : worker thread finished; awaiting finish of 2 more threads
  50. 2017- 05- 27 14: 30: 39, 147 : INFO : worker thread finished; awaiting finish of 1 more threads
  51. 2017- 05- 27 14: 30: 39, 154 : INFO : worker thread finished; awaiting finish of 0 more threads
  52. 2017- 05- 27 14: 30: 39, 155 : INFO : training on 2410740 raw words ( 570746 effective words) took 2. 8s, 203312 effective words/s
  53. 2017- 05- 27 14: 30: 39, 158 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  54. 2017- 05- 27 14: 30: 39, 159 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  55. 2017- 05- 27 14: 30: 40, 168 : INFO : PROGRESS: at 37. 74% examples, 222816 words/s, in_qsize 16, out_qsize 0
  56. 2017- 05- 27 14: 30: 41, 177 : INFO : PROGRESS: at 77. 55% examples, 223202 words/s, in_qsize 16, out_qsize 0
  57. 2017- 05- 27 14: 30: 41, 605 : INFO : worker thread finished; awaiting finish of 7 more threads
  58. 2017- 05- 27 14: 30: 41, 610 : INFO : worker thread finished; awaiting finish of 6 more threads
  59. 2017- 05- 27 14: 30: 41, 614 : INFO : worker thread finished; awaiting finish of 5 more threads
  60. 2017- 05- 27 14: 30: 41, 645 : INFO : worker thread finished; awaiting finish of 4 more threads
  61. 2017- 05- 27 14: 30: 41, 670 : INFO : worker thread finished; awaiting finish of 3 more threads
  62. 2017- 05- 27 14: 30: 41, 674 : INFO : worker thread finished; awaiting finish of 2 more threads
  63. 2017- 05- 27 14: 30: 41, 682 : INFO : worker thread finished; awaiting finish of 1 more threads
  64. 2017- 05- 27 14: 30: 41, 690 : INFO : worker thread finished; awaiting finish of 0 more threads
  65. 2017- 05- 27 14: 30: 41, 692 : INFO : training on 2410740 raw words ( 569889 effective words) took 2. 5s, 225457 effective words/s
  66. 2017- 05- 27 14: 30: 41, 694 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  67. 2017- 05- 27 14: 30: 41, 696 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  68. 2017- 05- 27 14: 30: 42, 712 : INFO : PROGRESS: at 29. 16% examples, 183182 words/s, in_qsize 15, out_qsize 0
  69. 2017- 05- 27 14: 30: 43, 754 : INFO : PROGRESS: at 69. 96% examples, 203560 words/s, in_qsize 15, out_qsize 0
  70. 2017- 05- 27 14: 30: 44, 804 : INFO : PROGRESS: at 91. 97% examples, 173787 words/s, in_qsize 14, out_qsize 0
  71. 2017- 05- 27 14: 30: 44, 973 : INFO : worker thread finished; awaiting finish of 7 more threads
  72. 2017- 05- 27 14: 30: 44, 989 : INFO : worker thread finished; awaiting finish of 6 more threads
  73. 2017- 05- 27 14: 30: 45, 028 : INFO : worker thread finished; awaiting finish of 5 more threads
  74. 2017- 05- 27 14: 30: 45, 061 : INFO : worker thread finished; awaiting finish of 4 more threads
  75. 2017- 05- 27 14: 30: 45, 097 : INFO : worker thread finished; awaiting finish of 3 more threads
  76. 2017- 05- 27 14: 30: 45, 101 : INFO : worker thread finished; awaiting finish of 2 more threads
  77. 2017- 05- 27 14: 30: 45, 121 : INFO : worker thread finished; awaiting finish of 1 more threads
  78. 2017- 05- 27 14: 30: 45, 125 : INFO : worker thread finished; awaiting finish of 0 more threads
  79. 2017- 05- 27 14: 30: 45, 128 : INFO : training on 2410740 raw words ( 569903 effective words) took 3. 4s, 166370 effective words/s
  80. 2017- 05- 27 14: 30: 45, 131 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  81. 2017- 05- 27 14: 30: 45, 132 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  82. 2017- 05- 27 14: 30: 46, 152 : INFO : PROGRESS: at 11. 26% examples, 79348 words/s, in_qsize 16, out_qsize 0
  83. 2017- 05- 27 14: 30: 47, 153 : INFO : PROGRESS: at 27. 52% examples, 85992 words/s, in_qsize 16, out_qsize 0
  84. 2017- 05- 27 14: 30: 48, 166 : INFO : PROGRESS: at 66. 47% examples, 130273 words/s, in_qsize 15, out_qsize 0
  85. 2017- 05- 27 14: 30: 49, 061 : INFO : worker thread finished; awaiting finish of 7 more threads
  86. 2017- 05- 27 14: 30: 49, 076 : INFO : worker thread finished; awaiting finish of 6 more threads
  87. 2017- 05- 27 14: 30: 49, 088 : INFO : worker thread finished; awaiting finish of 5 more threads
  88. 2017- 05- 27 14: 30: 49, 123 : INFO : worker thread finished; awaiting finish of 4 more threads
  89. 2017- 05- 27 14: 30: 49, 144 : INFO : worker thread finished; awaiting finish of 3 more threads
  90. 2017- 05- 27 14: 30: 49, 147 : INFO : worker thread finished; awaiting finish of 2 more threads
  91. 2017- 05- 27 14: 30: 49, 152 : INFO : worker thread finished; awaiting finish of 1 more threads
  92. 2017- 05- 27 14: 30: 49, 159 : INFO : worker thread finished; awaiting finish of 0 more threads
  93. 2017- 05- 27 14: 30: 49, 160 : INFO : training on 2410740 raw words ( 570333 effective words) took 4. 0s, 141860 effective words/s
  94. 2017- 05- 27 14: 30: 49, 161 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  95. 2017- 05- 27 14: 30: 49, 163 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  96. 2017- 05- 27 14: 30: 50, 185 : INFO : PROGRESS: at 31. 78% examples, 193530 words/s, in_qsize 15, out_qsize 0
  97. 2017- 05- 27 14: 30: 51, 244 : INFO : PROGRESS: at 48. 51% examples, 141817 words/s, in_qsize 15, out_qsize 0
  98. 2017- 05- 27 14: 30: 52, 278 : INFO : PROGRESS: at 69. 96% examples, 134399 words/s, in_qsize 16, out_qsize 0
  99. 2017- 05- 27 14: 30: 52, 918 : INFO : worker thread finished; awaiting finish of 7 more threads
  100. 2017- 05- 27 14: 30: 52, 936 : INFO : worker thread finished; awaiting finish of 6 more threads
  101. 2017- 05- 27 14: 30: 52, 945 : INFO : worker thread finished; awaiting finish of 5 more threads
  102. 2017- 05- 27 14: 30: 52, 976 : INFO : worker thread finished; awaiting finish of 4 more threads
  103. 2017- 05- 27 14: 30: 52, 979 : INFO : worker thread finished; awaiting finish of 3 more threads
  104. 2017- 05- 27 14: 30: 52, 984 : INFO : worker thread finished; awaiting finish of 2 more threads
  105. 2017- 05- 27 14: 30: 52, 995 : INFO : worker thread finished; awaiting finish of 1 more threads
  106. 2017- 05- 27 14: 30: 52, 998 : INFO : worker thread finished; awaiting finish of 0 more threads
  107. 2017- 05- 27 14: 30: 52, 999 : INFO : training on 2410740 raw words ( 570031 effective words) took 3. 8s, 148864 effective words/s
  108. 2017- 05- 27 14: 30: 53, 000 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  109. 2017- 05- 27 14: 30: 53, 002 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  110. 2017- 05- 27 14: 30: 54, 024 : INFO : PROGRESS: at 34. 48% examples, 202424 words/s, in_qsize 15, out_qsize 0
  111. 2017- 05- 27 14: 30: 55, 035 : INFO : PROGRESS: at 68. 58% examples, 201499 words/s, in_qsize 15, out_qsize 0
  112. 2017- 05- 27 14: 30: 56, 010 : INFO : worker thread finished; awaiting finish of 7 more threads
  113. 2017- 05- 27 14: 30: 56, 017 : INFO : worker thread finished; awaiting finish of 6 more threads
  114. 2017- 05- 27 14: 30: 56, 048 : INFO : PROGRESS: at 96. 89% examples, 183861 words/s, in_qsize 5, out_qsize 1
  115. 2017- 05- 27 14: 30: 56, 049 : INFO : worker thread finished; awaiting finish of 5 more threads
  116. 2017- 05- 27 14: 30: 56, 071 : INFO : worker thread finished; awaiting finish of 4 more threads
  117. 2017- 05- 27 14: 30: 56, 084 : INFO : worker thread finished; awaiting finish of 3 more threads
  118. 2017- 05- 27 14: 30: 56, 099 : INFO : worker thread finished; awaiting finish of 2 more threads
  119. 2017- 05- 27 14: 30: 56, 101 : INFO : worker thread finished; awaiting finish of 1 more threads
  120. 2017- 05- 27 14: 30: 56, 104 : INFO : worker thread finished; awaiting finish of 0 more threads
  121. 2017- 05- 27 14: 30: 56, 104 : INFO : training on 2410740 raw words ( 570328 effective words) took 3. 1s, 184129 effective words/s
  122. 2017- 05- 27 14: 30: 56, 105 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  123. 2017- 05- 27 14: 30: 56, 107 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  124. 2017- 05- 27 14: 30: 57, 134 : INFO : PROGRESS: at 33. 13% examples, 197730 words/s, in_qsize 15, out_qsize 0
  125. 2017- 05- 27 14: 30: 58, 140 : INFO : PROGRESS: at 69. 96% examples, 206423 words/s, in_qsize 15, out_qsize 0
  126. 2017- 05- 27 14: 30: 58, 876 : INFO : worker thread finished; awaiting finish of 7 more threads
  127. 2017- 05- 27 14: 30: 58, 883 : INFO : worker thread finished; awaiting finish of 6 more threads
  128. 2017- 05- 27 14: 30: 58, 889 : INFO : worker thread finished; awaiting finish of 5 more threads
  129. 2017- 05- 27 14: 30: 58, 937 : INFO : worker thread finished; awaiting finish of 4 more threads
  130. 2017- 05- 27 14: 30: 58, 949 : INFO : worker thread finished; awaiting finish of 3 more threads
  131. 2017- 05- 27 14: 30: 58, 953 : INFO : worker thread finished; awaiting finish of 2 more threads
  132. 2017- 05- 27 14: 30: 58, 960 : INFO : worker thread finished; awaiting finish of 1 more threads
  133. 2017- 05- 27 14: 30: 58, 967 : INFO : worker thread finished; awaiting finish of 0 more threads
  134. 2017- 05- 27 14: 30: 58, 968 : INFO : training on 2410740 raw words ( 570312 effective words) took 2. 9s, 199922 effective words/s
  135. 2017- 05- 27 14: 30: 58, 969 : INFO : training model with 8 workers on 10545 vocabulary and 300 features, using sg= 0 hs= 0 sample= 0. 0001 negative= 5 window= 8
  136. 2017- 05- 27 14: 30: 58, 970 : INFO : expecting 5700 sentences, matching count from corpus used for vocabulary survey
  137. 2017- 05- 27 14: 30: 59, 991 : INFO : PROGRESS: at 32. 86% examples, 198045 words/s, in_qsize 16, out_qsize 0
  138. 2017- 05- 27 14: 31: 00, 993 : INFO : PROGRESS: at 68. 23% examples, 201443 words/s, in_qsize 16, out_qsize 0
  139. 2017- 05- 27 14: 31: 01, 881 : INFO : worker thread finished; awaiting finish of 7 more threads
  140. 2017- 05- 27 14: 31: 01, 888 : INFO : worker thread finished; awaiting finish of 6 more threads
  141. 2017- 05- 27 14: 31: 01, 907 : INFO : worker thread finished; awaiting finish of 5 more threads
  142. 2017- 05- 27 14: 31: 01, 922 : INFO : worker thread finished; awaiting finish of 4 more threads
  143. 2017- 05- 27 14: 31: 01, 941 : INFO : worker thread finished; awaiting finish of 3 more threads
  144. 2017- 05- 27 14: 31: 01, 948 : INFO : worker thread finished; awaiting finish of 2 more threads
  145. 2017- 05- 27 14: 31: 01, 955 : INFO : worker thread finished; awaiting finish of 1 more threads
  146. 2017- 05- 27 14: 31: 01, 961 : INFO : worker thread finished; awaiting finish of 0 more threads
  147. 2017- 05- 27 14: 31: 01, 962 : INFO : training on 2410740 raw words ( 570826 effective words) took 3. 0s, 191072 effective words/s


  from matplotlib import pyplot as plt
  # Scatter every review by its two similarity scores, coloured by class label.
  label = data_bi['Class'].ravel()
  values = data_bi[['PosSim', 'NegSim']].values
  pos_sim = values[:, 0]
  neg_sim = values[:, 1]
  plt.scatter(pos_sim, neg_sim, c=label, alpha=0.4)
  plt.show()




  • 将数据集映射到了极低维度的空间,只有二维
  • 一种无监督的学习方法,不需要对原始训练数据进行标注
  • 具有普适性,在其他领域也可以用这种方法,只需要先找出该领域极其正和极其负的方法,将其与所有待识别的数据通过doc2vec转化为向量计算距离即可

