
AI in Practice: From Beginner to Expert Series - Sentiment Classification with a Perceptron (Part 2)


AI in Practice: From Beginner to Expert Series - Sentiment Classification with a Perceptron (Part 1)
AI in Practice: From Beginner to Expert Series - Sentiment Classification with a Perceptron (Part 2)



Sentiment Classification with a Perceptron


  • Environment

    • Ubuntu16.04 LTS
    • python 3.x
    • numpy
    • gensim
    • jieba

  • Dataset

    weibo_senti_100k, a Weibo sentiment dataset

  • Inspect the data

    import pandas as pd
    
    
    pd_all = pd.read_csv('./data/weibo_senti_100k.csv')
    
    print('Number of comments (total): %d' % pd_all.shape[0])
    print('Number of comments (positive): %d' % pd_all[pd_all.label==1].shape[0])
    print('Number of comments (negative): %d' % pd_all[pd_all.label==0].shape[0])
    
    print(pd_all.sample(10))
    
    
  • Output
    Dataset statistics and sample rows (shown as a screenshot in the original post).

  • Split the dataset into train and test sets

    import os
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
     
    def train_valid_test_split(x_data, y_data, test_size=0.1, shuffle=True):
        x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_size, shuffle=shuffle)
        
        return x_train, x_test, y_train, y_test
     
    if __name__ == '__main__':
        pd_all = pd.read_csv("./data/weibo_senti_100k.csv")
        pd_all = shuffle(pd_all)
        x_data, y_data = pd_all.review, pd_all.label
     
        x_train, x_test, y_train, y_test = train_valid_test_split(x_data, y_data, 0.1)
     
        # Name the review column 'text' so the later scripts can read it via pd_all.text,
        # and write plain comma-separated CSV so pd.read_csv works with its defaults.
        train = pd.DataFrame({'label': y_train, 'text': x_train})
        train.to_csv("./data/train.csv", index=False)
        test = pd.DataFrame({'label': y_test, 'text': x_test})
        test.to_csv("./data/test.csv", index=False)
    
  • Resulting files

    test.csv
    train.csv
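
    To verify the split (a minimal sketch, assuming the paths above), read the two files back and check their sizes and label balance:

    import pandas as pd
    
    # read the split files back and print row counts and label balance
    for path in ('./data/train.csv', './data/test.csv'):
        df = pd.read_csv(path)
        print(path, df.shape[0], 'rows,',
              'positive:', (df.label == 1).sum(),
              'negative:', (df.label == 0).sum())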
    
  • Generate word vectors

    import pandas as pd
    import jieba
    
    max_text_len = 50  # keep at most 50 words per comment
    
    def train_word2vec(train_path, test_path):
        import multiprocessing
        from gensim.models import Word2Vec
        from gensim.models.word2vec import LineSentence
    
        # load the data
        pd_all = pd.read_csv(train_path)
        train_data_x, train_data_y = pd_all.text.tolist(), pd_all.label.tolist()
        
        pd_all = pd.read_csv(test_path)
        test_data_x, test_data_y = pd_all.text.tolist(), pd_all.label.tolist()
        
        # word segmentation
        train_data_x = [segment(k) for k in train_data_x]
        test_data_x = [segment(k) for k in test_data_x]
        
        text = test_data_x + train_data_x
        
        # write one segmented sentence per line for LineSentence
        input_file_name = './data/sentence.txt'
        open(input_file_name, 'w', encoding='utf-8').write('\n'.join(text))
    
        model = Word2Vec(LineSentence(input_file_name),
                         size = 100,       # word vector dimension (vector_size in gensim >= 4.0)
                         window = 5,
                         min_count = 1,
                         workers = multiprocessing.cpu_count())
    
        model.save('./data/word2vec.model')
    
    def segment(text):
        # segment the text with jieba and keep at most max_text_len words
        words = jieba.cut(text)
        words = list(words)[:max_text_len]
        return ' '.join(words)
    

    Run:

    train_path = './data/train.csv'
    test_path = './data/test.csv'
    train_word2vec(train_path, test_path)
    

    Output:
    ./data/word2vec.model
    ./data/word2vec.model.trainables.syn1neg.npy
    ./data/word2vec.model.wv.vectors.npy

    Test the word vectors:

    from gensim.models import Word2Vec
    model = Word2Vec.load("./data/word2vec.model")
    vector = model.wv['好吃']
    print(vector)
    

    Output:

    [-0.73283386  0.86413544  0.75687975 -0.5055297  -0.42515302  0.18348737
      2.3790839   1.0248554   2.101729   -0.4618316   0.43203285 -0.5404889
     -1.017284   -2.2938926   2.3901055  -0.69351804  1.6101934  -0.59710294
     -0.03683157  0.57503146 -1.250644    2.980576    1.1501396  -0.81633765
      0.6402967   2.3379786  -0.877263   -1.9016323   1.1057235   0.06841037
     -0.05232436 -0.08345098 -0.30685595 -1.1040177  -1.7571559  -1.7910484
     -0.7331836   0.1513317  -0.621015    0.8975967   2.5499363   1.1568907
      0.3688762  -0.5182226  -0.30297205  0.5822141  -1.0808538  -0.01062215
     -1.4400197  -2.2848194   2.1822946   0.15740396  1.0032029  -0.8410342
     -1.1311824  -0.33163172  1.3151053  -0.2986618   1.9608823  -0.2602172
      0.63158864  1.239699    0.10924603 -1.7023547  -1.554196    0.03117983
      0.6561903  -0.4397268  -1.9914472   0.79718435 -1.4864717  -2.9809866
     -0.46287113  0.4837672  -0.71872777  2.4697163  -0.53781223  0.23790799
      2.0566401   1.6394123  -0.9129417   1.5896504   1.5701648   1.1339688
     -1.8522842   2.0832975  -1.9120314  -0.23889321  2.8850334   0.70530176
      1.6666977  -1.0355597   0.36848044 -0.02313641 -1.3314507  -0.52943283
      0.29032257 -1.952622   -0.674098   -0.20572844]
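
    As an additional sanity check of the embedding space (an optional sketch; the neighbours returned depend on your trained model), you can query the nearest words:

    # words closest to '好吃' in the trained embedding space
    print(model.wv.most_similar('好吃', topn=5))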
    

  • Data processing
    data_helper.py

    import numpy as np
    import pandas as pd
    import jieba
    import multiprocessing
    from gensim.models import Word2Vec
    from gensim.models.word2vec import LineSentence
    
    max_text_len = 50  # keep at most 50 words per comment
    word2vec_dimension = 100
    
    def load_train_test_data(train_path, test_path, w2v_model_path):
        
        # load the data
        pd_all = pd.read_csv(train_path)
        train_data_x, train_data_y = pd_all.text.tolist(), pd_all.label.tolist()
        
        pd_all = pd.read_csv(test_path)
        test_data_x, test_data_y = pd_all.text.tolist(), pd_all.label.tolist()
        
        # the full dataset is large; keep only 5000 samples for training
        # (my machine does not have enough memory for the full training set)
        train_data_x = train_data_x[:5000]
        train_data_y = train_data_y[:5000]
        
        # word segmentation
        train_data_x = [segment(k) for k in train_data_x]
        test_data_x = [segment(k) for k in test_data_x]
        
        # convert the texts to word-vector representations
        w2v_model = Word2Vec.load(w2v_model_path)
        train_data_x = text_to_word2vec(w2v_model, train_data_x)
        test_data_x = text_to_word2vec(w2v_model, test_data_x)
        
        return train_data_x, test_data_x, train_data_y, test_data_y
    
    def text_to_word2vec(w2v_model, text_list):
        # convert each segmented text into a (max_text_len, word2vec_dimension) matrix of word vectors
        
        text_array = np.zeros((len(text_list), max_text_len, word2vec_dimension))
        for i, text in enumerate(text_list):
            words = text.split(' ')
            for index, word in enumerate(words):
                if index >= max_text_len:
                    break
                if word in w2v_model.wv:
                    text_array[i, index] = w2v_model.wv[word]
                # out-of-vocabulary words keep the zero vector the array was initialised with
        
        return text_array
        
    def segment(text):
        # segment the text with jieba and keep at most max_text_len words
        words = jieba.cut(text)
        words = list(words)[:max_text_len]
        
        return ' '.join(words)
    
    • Note:
      Only part of the data is used for training because my machine does not have enough memory for the full set.
      To train on the full dataset, simply delete the truncation lines:
      train_data_x = train_data_x[:5000]
      train_data_y = train_data_y[:5000]
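
    As a quick shape check (a minimal sketch, assuming the paths used throughout this post), each comment returned by load_train_test_data should become a 50 x 100 matrix of word vectors:

    import data_helper
    
    x_train, x_test, y_train, y_test = data_helper.load_train_test_data(
        './data/train.csv', './data/test.csv', './data/word2vec.model')
    
    # each comment is a (max_text_len, word2vec_dimension) = (50, 100) matrix
    print(x_train.shape)          # (5000, 50, 100) with the 5000-sample truncation
    print(x_test.shape)
    print(len(y_train), len(y_test))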

  • Train the perceptron
    train.py

    import os, sys
    import data_helper
    import perception
    
    
    def train(train_path, test_path, w2v_model_path):
        
        x_train, x_test, y_train, y_test = data_helper.load_train_test_data(train_path, test_path, w2v_model_path)
        print(x_train[0], y_train[0])
        
        p = perception.Perceptron(data_helper.max_text_len, data_helper.word2vec_dimension, perception.f)
        
        # train for 100 iterations with a learning rate of 0.05
        p.train(x_train, y_train, 100, 0.05)
        
        # print the learned weights and bias
        p.print_weights()
        
        # evaluate accuracy on the test set
        p.test(x_test, y_test)
        
    
    if __name__ == "__main__":  
        train_path = './data/train.csv'
        test_path = './data/test.csv'
        w2v_model_path = './data/word2vec.model'
        train(train_path, test_path, w2v_model_path)
    

    Run: python3 train.py

    Output:

    weights [[-1.42558697 -2.48767451  1.28752376 ... -0.78376229  3.16459166
      -3.28389434]
     [-3.68905224 -4.80877013 -3.13396478 ... -4.25494364 -3.01798689
      -4.91744347]
     [ 1.94075086 -1.94479774  5.51378438 ... -5.19175698 -4.50725763
       3.28213941]
     ...
     [-0.60414949  0.84948442  3.2864892  ... -3.96489623  0.9902426
       7.86129972]
     [-0.52215719 -2.85837685 -0.89045009 ... -1.01795905 -1.21213078
      -0.16342622]
     [-0.12955836 -3.43814853  0.094599   ... -2.52779952 -4.71311826
      -1.97031286]]
    bias 4.099999999999993
    acc: 81.89%	
    
    • Results

    Trained on 5,000 Weibo comments, this perceptron sentiment classifier reaches an accuracy of 81.89% on the held-out test set, which is a reasonable result for such a simple model. For reference, a minimal sketch of the perception module interface assumed by train.py follows below.
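
    The Perceptron class itself lives in the perception module from Part 1 of this series. The sketch below only illustrates, under assumptions, the interface that train.py calls (a constructor taking max_text_len, word2vec_dimension and an activation function f, plus train, print_weights and test methods); the implementation in Part 1 may differ in detail.

    import numpy as np
    
    def f(x):
        # step activation assumed for a classic perceptron: 1 for positive input, 0 otherwise
        return 1 if x > 0 else 0
    
    class Perceptron(object):
        def __init__(self, max_text_len, dimension, activator):
            # one weight per (word position, vector component), plus a scalar bias
            self.activator = activator
            self.weights = np.zeros((max_text_len, dimension))
            self.bias = 0.0
    
        def predict(self, x):
            # x is a (max_text_len, dimension) matrix: sum of element-wise products plus bias
            return self.activator(np.sum(x * self.weights) + self.bias)
    
        def train(self, x_list, y_list, iteration, rate):
            # classic perceptron learning rule, repeated for `iteration` passes over the data
            for _ in range(iteration):
                for x, y in zip(x_list, y_list):
                    delta = y - self.predict(x)
                    self.weights += rate * delta * x
                    self.bias += rate * delta
    
        def print_weights(self):
            print('weights', self.weights)
            print('bias', self.bias)
    
        def test(self, x_list, y_list):
            # accuracy over a labelled set
            correct = sum(1 for x, y in zip(x_list, y_list) if self.predict(x) == y)
            print('acc: %.2f%%' % (100.0 * correct / len(y_list)))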


References

Training word vectors: https://radimrehurek.com/gensim/models/word2vec.html

Previous post: AI in Practice: From Beginner to Expert Series - Sentiment Classification with a Perceptron (Part 1)


Reposted from: https://blog.csdn.net/zengNLP/article/details/101156539