AI in Practice: From Beginner to Mastery Series - Sentiment Classification with a Perceptron (Part 2)
Sentiment Classification with a Perceptron
-
Environment
- Ubuntu 16.04 LTS
- Python 3.x
- numpy
- gensim
- jieba
-
Dataset
weibo_senti_100k, a Weibo sentiment dataset (label 1 = positive, 0 = negative)
-
Inspecting the data

import pandas as pd

pd_all = pd.read_csv('./data/weibo_senti_100k.csv')

print('Number of comments (total): %d' % pd_all.shape[0])
print('Number of comments (positive): %d' % pd_all[pd_all.label == 1].shape[0])
print('Number of comments (negative): %d' % pd_all[pd_all.label == 0].shape[0])
print(pd_all.sample(10))
-
Output
Data summary and sample rows (shown as a screenshot in the original post); the published weibo_senti_100k contains about 120,000 comments, split almost evenly between the two labels.
-
Splitting the dataset into train and test sets

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

def train_valid_test_split(x_data, y_data, test_size=0.1, shuffle=True):
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_size, shuffle=shuffle)
    return x_train, x_test, y_train, y_test

if __name__ == '__main__':
    pd_all = pd.read_csv("./data/weibo_senti_100k.csv")
    pd_all = shuffle(pd_all)
    x_data, y_data = pd_all.review, pd_all.label
    x_train, x_test, y_train, y_test = train_valid_test_split(x_data, y_data, 0.1)
    # Name the text column 'text' in both files so the later scripts can
    # read them back uniformly; note the files are tab-separated
    train = pd.DataFrame({'label': y_train, 'text': x_train})
    train.to_csv("./data/train.csv", index=False, sep='\t')
    test = pd.DataFrame({'label': y_test, 'text': x_test})
    test.to_csv("./data/test.csv", index=False, sep='\t')
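Because the files are written tab-separated, they must be read back with sep='\t'. A quick sanity check (a minimal sketch; the exact row count depends on your split):

import pandas as pd

train = pd.read_csv('./data/train.csv', sep='\t')
print(train.columns.tolist())  # ['label', 'text']
print(train.shape[0])          # roughly 90% of the full dataset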
-
The resulting files:
train.csv, test.csv
-
Generating word vectors

import multiprocessing
import pandas as pd
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

max_text_len = 50           # keep at most 50 words per text
word2vec_dimension = 100    # word-vector dimension

def train_word2vec(train_path, test_path):
    # Load the data
    pd_all = pd.read_csv(train_path, sep='\t')
    train_data_x = pd_all.text.tolist()
    pd_all = pd.read_csv(test_path, sep='\t')
    test_data_x = pd_all.text.tolist()

    # Tokenize
    train_data_x = [segment(k) for k in train_data_x]
    test_data_x = [segment(k) for k in test_data_x]
    text = test_data_x + train_data_x

    # Write one tokenized sentence per line for LineSentence
    input_file_name = './data/sentence.txt'
    with open(input_file_name, 'w') as f:
        f.write('\n'.join(text))

    model = Word2Vec(LineSentence(input_file_name),
                     size=word2vec_dimension,  # 100-dimensional vectors; the keyword is vector_size in gensim >= 4
                     window=5,
                     min_count=1,
                     workers=multiprocessing.cpu_count())
    model.save('./data/word2vec.model')

def segment(text):
    # Tokenize with jieba and truncate to max_text_len words
    words = jieba.cut(text)
    words = list(words)[:max_text_len]
    return ' '.join(words)
Run:

train_path = './data/train.csv'
test_path = './data/test.csv'
train_word2vec(train_path, test_path)
Output:
./data/word2vec.model
./data/word2vec.model.trainables.syn1neg.npy
./data/word2vec.model.wv.vectors.npy

Testing the word vectors:
from gensim.models import Word2Vec

model = Word2Vec.load("./data/word2vec.model")
vector = model.wv['好吃']
print(vector)

Output:
[-0.73283386 0.86413544 0.75687975 -0.5055297 -0.42515302 0.18348737 2.3790839 1.0248554 2.101729 -0.4618316 0.43203285 -0.5404889 -1.017284 -2.2938926 2.3901055 -0.69351804 1.6101934 -0.59710294 -0.03683157 0.57503146 -1.250644 2.980576 1.1501396 -0.81633765 0.6402967 2.3379786 -0.877263 -1.9016323 1.1057235 0.06841037 -0.05232436 -0.08345098 -0.30685595 -1.1040177 -1.7571559 -1.7910484 -0.7331836 0.1513317 -0.621015 0.8975967 2.5499363 1.1568907 0.3688762 -0.5182226 -0.30297205 0.5822141 -1.0808538 -0.01062215 -1.4400197 -2.2848194 2.1822946 0.15740396 1.0032029 -0.8410342 -1.1311824 -0.33163172 1.3151053 -0.2986618 1.9608823 -0.2602172 0.63158864 1.239699 0.10924603 -1.7023547 -1.554196 0.03117983 0.6561903 -0.4397268 -1.9914472 0.79718435 -1.4864717 -2.9809866 -0.46287113 0.4837672 -0.71872777 2.4697163 -0.53781223 0.23790799 2.0566401 1.6394123 -0.9129417 1.5896504 1.5701648 1.1339688 -1.8522842 2.0832975 -1.9120314 -0.23889321 2.8850334 0.70530176 1.6666977 -1.0355597 0.36848044 -0.02313641 -1.3314507 -0.52943283 0.29032257 -1.952622 -0.674098 -0.20572844]
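Beyond looking up a single vector, a quick way to check that the embedding is sensible is to query a word's nearest neighbours (a minimal sketch; the neighbours you get will depend on your training run):

from gensim.models import Word2Vec

model = Word2Vec.load("./data/word2vec.model")
# The five words closest to '好吃' ("tasty") by cosine similarity
print(model.wv.most_similar('好吃', topn=5))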
-
Data processing

data_helper.py:

import numpy as np
import pandas as pd
import jieba
from gensim.models import Word2Vec

max_text_len = 50           # keep at most 50 words per text
word2vec_dimension = 100    # word-vector dimension

def load_train_test_data(train_path, test_path, w2v_model_path):
    # Load the data
    pd_all = pd.read_csv(train_path, sep='\t')
    train_data_x, train_data_y = pd_all.text.tolist(), pd_all.label.tolist()
    pd_all = pd.read_csv(test_path, sep='\t')
    test_data_x, test_data_y = pd_all.text.tolist(), pd_all.label.tolist()

    # The full set is too large for my machine's memory, so only 5000
    # samples are used for training
    train_data_x, train_data_y = train_data_x[:5000], train_data_y[:5000]

    # Tokenize
    train_data_x = [segment(k) for k in train_data_x]
    test_data_x = [segment(k) for k in test_data_x]

    # Convert the texts to word-vector representations
    w2v_model = Word2Vec.load(w2v_model_path)
    train_data_x = text_to_word2vec(w2v_model, train_data_x)
    test_data_x = text_to_word2vec(w2v_model, test_data_x)

    return train_data_x, test_data_x, train_data_y, test_data_y

def text_to_word2vec(w2v_model, text_list):
    # Map each tokenized text to a (max_text_len, word2vec_dimension) matrix;
    # out-of-vocabulary words and padding positions stay all-zero
    text_array = np.zeros((len(text_list), max_text_len, word2vec_dimension))
    for i, text in enumerate(text_list):
        words = text.split(' ')
        for index, word in enumerate(words):
            if index >= max_text_len:
                break
            if word in w2v_model.wv:
                text_array[i, index] = w2v_model.wv[word]
    return text_array

def segment(text):
    # Tokenize with jieba and truncate to max_text_len words
    words = jieba.cut(text)
    words = list(words)[:max_text_len]
    return ' '.join(words)
- Note:
My machine does not have enough memory, so only part of the data is used as the training set. To train on the full dataset, simply delete this line:
train_data_x, train_data_y = train_data_x[:5000], train_data_y[:5000]
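As a quick check (a minimal sketch, assuming the paths used above), the arrays returned by data_helper should have shape (samples, max_text_len, word2vec_dimension):

import data_helper

x_train, x_test, y_train, y_test = data_helper.load_train_test_data(
    './data/train.csv', './data/test.csv', './data/word2vec.model')
print(x_train.shape)   # (5000, 50, 100): samples x words x vector dimensions
print(len(y_train))    # 5000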
-
Training the perceptron

train.py:

import data_helper
import perception

def train(train_path, test_path, w2v_model_path):
    x_train, x_test, y_train, y_test = data_helper.load_train_test_data(train_path, test_path, w2v_model_path)
    print(x_train[0], y_train[0])

    p = perception.Perceptron(data_helper.max_text_len, data_helper.word2vec_dimension, perception.f)
    # Train for 100 epochs with a learning rate of 0.05
    p.train(x_train, y_train, 100, 0.05)
    # Print the learned weights
    p.print_weights()
    # Evaluate on the test set
    p.test(x_test, y_test)

if __name__ == "__main__":
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    w2v_model_path = './data/word2vec.model'
    train(train_path, test_path, w2v_model_path)
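Note that train.py imports the perception module built in Part 1 of this series, which is not reprinted here. For readers who land on this part directly, below is a minimal sketch of the interface train.py expects: the class name, the activation function f, and the method signatures come from train.py above, while the update rule is the standard perceptron learning rule, so details may differ from the original perception.py.

import numpy as np

def f(x):
    # Step activation: maps the weighted sum to a 0/1 label
    return 1 if x > 0 else 0

class Perceptron:
    def __init__(self, max_text_len, word2vec_dimension, activator):
        self.activator = activator
        # One weight per (word position, vector dimension) cell of the input
        self.weights = np.zeros((max_text_len, word2vec_dimension))
        self.bias = 0.0

    def predict(self, x):
        # x is a (max_text_len, word2vec_dimension) matrix
        return self.activator(np.sum(self.weights * x) + self.bias)

    def train(self, x_data, y_data, epochs, learning_rate):
        for _ in range(epochs):
            for x, y in zip(x_data, y_data):
                delta = y - self.predict(x)   # 0 if correct, +1/-1 if wrong
                self.weights += learning_rate * delta * x
                self.bias += learning_rate * delta

    def print_weights(self):
        print('weights', self.weights)
        print('bias', self.bias)

    def test(self, x_data, y_data):
        correct = sum(1 for x, y in zip(x_data, y_data) if self.predict(x) == y)
        print('acc: %.2f%%' % (100.0 * correct / len(y_data)))

Because the activation is a hard step, each update only fires on misclassified samples, which is the classic perceptron behaviour.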
Run: python3 train.py
Output:
weights
[[-1.42558697 -2.48767451  1.28752376 ... -0.78376229  3.16459166 -3.28389434]
 [-3.68905224 -4.80877013 -3.13396478 ... -4.25494364 -3.01798689 -4.91744347]
 [ 1.94075086 -1.94479774  5.51378438 ... -5.19175698 -4.50725763  3.28213941]
 ...
 [-0.60414949  0.84948442  3.2864892  ... -3.96489623  0.9902426   7.86129972]
 [-0.52215719 -2.85837685 -0.89045009 ... -1.01795905 -1.21213078 -0.16342622]
 [-0.12955836 -3.43814853  0.094599   ... -2.52779952 -4.71311826 -1.97031286]]
bias 4.099999999999993
acc: 81.89%
- Results
As the output shows, a perceptron for Weibo sentiment classification trained on 5,000 samples reaches 81.89% accuracy on the test set, which is a decent result for such a simple model.
References
Training word vectors: https://radimrehurek.com/gensim/models/word2vec.html
Previous post: AI in Practice: From Beginner to Mastery Series - Sentiment Classification with a Perceptron (Part 1)
Reposted from: https://blog.csdn.net/zengNLP/article/details/101156539