小言_互联网的博客

python基于SVM的疫情评论情感数据分析

463人阅读  评论(0)

1、构建SVM情感分析模型

读取数据

使用 pandas 库读取微博数据，并进行数据打乱操作

import pandas as pd
# Load the labelled Weibo corpus. A bare relative path is used because a
# ".\\" prefix only resolves on Windows.
test = pd.read_csv("weibo.csv")
# read_csv already returns a DataFrame, so it is sliced directly.
# NOTE(review): only the first 1000 rows are kept, and this happens before
# any shuffling; if the CSV is sorted by label the sample may contain a
# single class — confirm against the data file.
test_data = test[:1000]
test_data

打乱数据 

# Shuffle all rows and renumber the index from 0. The fixed seed makes the
# shuffle repeatable, in line with the seeded train/test split used later.
re_test_data = test_data.sample(frac=1, random_state=11).reset_index(drop=True)

分词处理 

对处理后的数据进行分词处理，这里我们使用 Python 的 jieba 库（代码中导入的是其加速版 jieba_fast）

import jieba_fast as jieba
import re
# Patterns are compiled once because the function runs once per review row.
_MENTION_RE = re.compile(r'@\w+')
_ASCII_ALNUM_RE = re.compile(r'[a-zA-Z0-9]')


def chinese_word_cut(mytext):
    """Clean a review and segment it with jieba.

    ``@user`` mentions and ASCII letters/digits are removed first, then the
    remaining text is cut into tokens.

    Args:
        mytext: Raw review text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty review.

    Returns:
        The jieba tokens joined by single spaces, or ``""`` for non-string
        input.
    """
    if not isinstance(mytext, str):
        return ""
    # Drop @user mentions so usernames do not end up in the features.
    mytext = _MENTION_RE.sub('', mytext)
    # Drop ASCII letters and digits.
    mytext = _ASCII_ALNUM_RE.sub('', mytext)
    return " ".join(jieba.cut(mytext))
# Series.apply runs the segmenter on each review, one row at a time; the
# space-joined tokens are stored in a new 'cut_review' column.
re_test_data['cut_review'] = re_test_data['review'].apply(chinese_word_cut)

停用词处理 

import re
# Load the stop-word list
def get_custom_stopwords(stop_words_file):
    """Read a stop-word file and return its entries as a list.

    Args:
        stop_words_file: Path to a UTF-8 text file with one stop word per
            line. A leading byte-order mark is tolerated.

    Returns:
        The stop words in file order. Empty lines (including the one that a
        trailing newline would otherwise produce) are skipped.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but strips a BOM if present,
    # so the first stop word is not silently prefixed with "\ufeff".
    with open(stop_words_file, encoding='utf-8-sig') as f:
        lines = f.read().split('\n')
    return [line for line in lines if line]
# Load the stop words once for reuse. A bare relative path is used because a
# ".\\" prefix only resolves on Windows.
cachedStopWords = get_custom_stopwords("stopwords.txt")

数据分割 

分词并去除停用词后，我们将数据分割为训练集和测试集（下面代码使用的 `remove_strop_word` 列需由去停用词步骤生成，本文未给出该步骤的代码）

from sklearn.model_selection import train_test_split

# Targets come from the 'label' column, features from the stop-word-filtered
# text column.
# NOTE(review): nothing in the visible code creates 'remove_strop_word' —
# confirm it is produced by an earlier step.
y = re_test_data['label']
X = re_test_data['remove_strop_word']
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

使用TFIDF和朴素贝叶斯训练数据 

%%time
# Model persistence (save/load). The standalone package is imported because
# ``sklearn.externals.joblib`` was removed in scikit-learn 0.23.
import joblib
# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
# TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Pipeline chains the vectorizer and the classifier into one estimator
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a multinomial Naive Bayes classifier
TFIDF_NB_Sentiment_Model = Pipeline([
    ('TFIDF', TfidfVectorizer()),
    ('NB', MultinomialNB()),
])
# Train on at most the first 80000 training rows.
nbm = TFIDF_NB_Sentiment_Model.fit(X_train[:80000], y_train[:80000])
# Despite its name, this score is measured on the held-out test split.
nb_train_score = TFIDF_NB_Sentiment_Model.score(X_test, y_test)
joblib.dump(TFIDF_NB_Sentiment_Model, 'tfidf_nb_sentiment.model')
print(nb_train_score)

使用TFIDF和SVM训练数据 

%%time
from sklearn.svm import SVC

# TF-IDF features feeding a linear-kernel SVM. probability=True is what makes
# predict_proba available on the fitted model.
TFIDF_SVM_Sentiment_Model = Pipeline([
    ('TFIDF', TfidfVectorizer()),
    ('SVM', SVC(C=0.95, kernel="linear", probability=True)),
])
# Train on at most the first 30000 training rows.
TFIDF_SVM_Sentiment_Model.fit(X_train[:30000], y_train[:30000])
svm_test_score = TFIDF_SVM_Sentiment_Model.score(X_test, y_test)
joblib.dump(TFIDF_SVM_Sentiment_Model, 'tfidf_svm1_sentiment.model')
# Report the held-out score, as the Naive Bayes cell does.
print(svm_test_score)

 

模型预测 

# Load the persisted classifier used for prediction. The Naive Bayes pipeline
# is active; to use the SVM pipeline, load the other file instead:
# model = joblib.load('tfidf_svm1_sentiment.model')
model = joblib.load('tfidf_nb_sentiment.model')
# Load the stop-word list. A bare relative path is used because a ".\\"
# prefix only resolves on Windows.
cachedStopWords = get_custom_stopwords("stopwords.txt")
# Classify a sentence as positive or negative
def IsPoOrNeg(text, *, threshold=0.4, verbose=True):
    """Predict whether *text* expresses positive or negative sentiment.

    The text is passed through ``remove_stropwords``, segmented with jieba,
    passed through ``Jieba_Intensify`` and then scored by the module-level
    ``model``.

    Args:
        text: Raw sentence to classify.
        threshold: The sentence is reported as positive when the second
            entry of ``predict_proba`` is strictly greater than this value.
            Defaults to 0.4.
        verbose: When true (the default), print the processed text together
            with the probability of the reported label.

    Returns:
        ``"积极"`` (positive) or ``"消极"`` (negative).
    """
    # Remove stop words from the raw text.
    text = remove_stropwords(text, cachedStopWords)
    # Segment with jieba (cut_all=False) and re-join the tokens with spaces.
    text = " ".join(jieba.cut(text, cut_all=False))
    # NOTE(review): Jieba_Intensify is defined outside this snippet; its
    # effect on the text is not visible here — confirm.
    text = Jieba_Intensify(text)
    # NOTE(review): index 1 is assumed to be the positive class — confirm
    # against model.classes_.
    proba = model.predict_proba([text])[0]
    if proba[1] > threshold:
        if verbose:
            print(text, ":此话极大可能是积极情绪(概率:)" + str(proba[1]))
        return "积极"
    if verbose:
        print(text, ":此话极大可能是消极情绪(概率:)" + str(proba[0]))
    return "消极"

# Quick check on a sentence with clearly negative wording.
IsPoOrNeg("什么玩意 不好 不开心")

 预测结果如下:

 对疫情评论数据进行处理

import pandas as pd

# Strip the 🙏 emoji from a comment
def review_process(text):
    """Remove every "🙏" emoji from *text*.

    Args:
        text: A comment string. Non-string values (for example the NaN that
            pandas uses for a missing cell) are returned unchanged so that a
            later ``dropna`` can still discard them.

    Returns:
        The text without "🙏", or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return text.replace("🙏", "")
# Read the CSV (no header row) and keep only column 1, the comment text.
weibo = pd.read_csv("./Datashuju.csv", header=None)
# Drop missing comments before cleaning: a NaN cell is a float, so calling a
# string method on it would raise.
weibo = pd.DataFrame(weibo[1]).dropna()
# Strip the special characters handled by review_process.
weibo[1] = weibo[1].apply(review_process)

疫情评论词词云图

# pip install wordcloud
# Build the word cloud
import wordcloud
import matplotlib.pyplot as plt
# Use a CJK-capable font so the Chinese title renders, and keep the minus
# sign displayable with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# WordCloud splits text on non-word characters, so unsegmented Chinese would
# be counted as whole phrases. Segment every comment with jieba first and
# join the tokens with spaces.
word_show = ' '.join(
    token
    for comment in weibo[1]
    for token in jieba.cut(comment)
)
w = wordcloud.WordCloud(
    font_path="msyh.ttc",
    width=1000,
    height=700,
    background_color="white",
    max_words=100,
)
# Generate the cloud from the segmented text and save it to disk.
w.generate(word_show)
w.to_file("hot_word.jpg")

plt.figure(figsize=(8, 8.5))
plt.imshow(w, interpolation='bilinear')
plt.axis('off')
plt.title('评论内容词云图', fontsize=30)
plt.show()

情感统计  

# Label every comment with the classifier's verdict ("积极" / "消极").
# The column is assigned once; pre-filling it with None is unnecessary.
weibo[2] = weibo[1].apply(IsPoOrNeg)
weibo

疫情微博评论情感统计图

# Count comments per sentiment once and reuse the result for labels and sizes.
sentiment_counts = weibo[2].value_counts()
lable = list(sentiment_counts.index)
value = list(sentiment_counts)
# One offset per slice, so the chart also works when only one sentiment occurs
# (pie() rejects an explode list whose length differs from the data).
explode = [0.01] * len(value)
plt.figure(figsize=(6, 6))
plt.pie(value, explode=explode, labels=lable, autopct='%1.1f%%')  # draw the pie chart
plt.title('疫情微博评论情感统计')
plt.show()

 

 

 

 


转载:https://blog.csdn.net/m0_43432638/article/details/128265327
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场