
5 Text Topics and Classification: Chinese Text Classification --- Machine Learning --- Beginner


Author: Irain
QQ: 2573396010
WeChat: 18802080892
GitHub file: 5 文本主题与分类之中文文本分类
Video link: 文本主题与分类之中文文本分类

1 Multinomial Naive Bayes

1.1 Loading the Chinese text

import jieba
import pandas as pd
df_technology = pd.read_csv("./data/technology_news.csv", encoding='utf-8')
df_technology = df_technology.dropna()

df_car = pd.read_csv("./data/car_news.csv", encoding='utf-8')
df_car = df_car.dropna()

df_entertainment = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8')
df_entertainment = df_entertainment.dropna()

technology = df_technology.content.values.tolist()[1000:11000]
car = df_car.content.values.tolist()[1000:11000]
entertainment = df_entertainment.content.values.tolist()[:10000]
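
A quick sanity check (my addition, not in the original post) can confirm how many articles were kept per category and preview one entry:

print(len(technology), len(car), len(entertainment))  # article counts per category
print(technology[0][:100])  # first 100 characters of one technology article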

1.2 Loading the stopword list

stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values

1.3 Segmenting and cleaning the Chinese text

def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                            # segment the sentence with jieba
            segs = filter(lambda x: len(x) > 1, segs)          # drop single-character tokens
            segs = filter(lambda x: x not in stopwords, segs)  # drop stopwords
            sentences.append((" ".join(segs), category))       # store (segmented text, label)
        except Exception:
            print(line)
            continue

# Generate the training data
sentences = []

preprocess_text(technology, sentences, 'technology')
preprocess_text(car, sentences, 'car')
preprocess_text(entertainment, sentences, 'entertainment')
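
To see what the cleaning step produces, here is a quick peek (my addition, not in the original post) at jieba's raw segmentation versus one cleaned training sample:

print(jieba.lcut("这是有史以来最大的一次法拉利"))  # raw segmentation, before filtering
print(sentences[0])  # cleaned sample: (space-joined segmented text, category label)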

1.4 Splitting the raw data into training and test sets

import random
random.shuffle(sentences) # shuffle the data order
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)
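
A brief check (my addition, not in the original post) of the split sizes and the label counts in the training set:

from collections import Counter
print(len(x_train), len(x_test))  # default split is 75% train / 25% test
print(Counter(y_train))           # label counts per category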

1.5 Importing the classifier, training, and testing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# `vec` was undefined in the original snippet; a bag-of-words CountVectorizer fitted on the training texts is assumed here.
vec = CountVectorizer(analyzer='word', max_features=4000)
vec.fit(x_train)
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
classifier.score(vec.transform(x_test), y_test)
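
With the vectorizer and classifier fitted, a single prediction looks like this (my addition); note that the input must be segmented and space-joined, just like the training texts:

print(classifier.predict(vec.transform(['这 是 有史以来 最 大 的 一 次 法拉利'])))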

2 Text classifier

2.1 Building a Naive Bayes classifier class

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

class TextClassifier():

    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 4), max_features=20000)

    def features(self, X):  # vectorize the texts into bag-of-words features
        return self.vectorizer.transform(X)

    def fit(self, X, y):  # fit the vectorizer and train the classifier
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):  # predict which topic a single text belongs to
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):  # evaluate accuracy on a test set
        return self.classifier.score(self.features(X), y)

2.2 Predicting a text's topic and scoring on the test set

text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.predict('这 是 有史以来 最 大 的 一 次 法拉利'))
print(text_classifier.score(x_test, y_test))
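
Beyond overall accuracy, a per-class breakdown is often more informative. A sketch (not in the original post) using sklearn's classification_report through the classifier's internal vectorizer:

from sklearn.metrics import classification_report
y_pred = text_classifier.classifier.predict(text_classifier.features(x_test))
print(classification_report(y_test, y_pred))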

3 TF-IDF classifier

import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


class TextClassifier():

    def __init__(self, classifier=SVC(kernel='linear')):
        self.classifier = classifier
        self.vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=12000)

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        return self.classifier.score(self.features(X), y)

text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.predict('这 是 有史以来 最 大 的 一 次 法拉利'))
print(text_classifier.score(x_test, y_test))
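
A single train/test split can be noisy. A rough sketch (my addition, not in the original post) of 5-fold cross-validation for the same TF-IDF + linear SVM setup, using an sklearn Pipeline:

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
pipeline = make_pipeline(
    TfidfVectorizer(analyzer='word', ngram_range=(1, 3), max_features=12000),
    SVC(kernel='linear'),
)
scores = cross_val_score(pipeline, list(x), list(y), cv=5)  # x, y from section 1.4
print(scores.mean())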



Reposted from: https://blog.csdn.net/weixin_42122125/article/details/105826199