主要内容:
- 读取文件内容以及所属的类别
- 提取合适于机器学习的特征向量
- 训练一个朴素贝叶斯模型来进行文本分类
- 预测新数据得到所属类别和所属类别概率
核心:
为了在文本文件中执行机器学习算法, 我们首先要做的是将文本内容转化成数值形式的特征向量。
文本的预处理, 分词以及过滤停用词都被包含在一个可以构建特征字典和将文档转换成特征向量的高级组件中
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> count_vect = CountVectorizer()
>>> X_train_counts = count_vect.fit_transform(twenty_train.data)
>>> X_train_counts.shape
(2257, 35788)
tf–idf 方法
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> tfidf_transformer = TfidfTransformer()
>>> X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
>>> X_train_tfidf.shape
(2257, 35788)
训练分类器
>>> from sklearn.naive_bayes import MultinomialNB
>>> clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
预测新数据
为了尝试预测新文档所属的类别,我们需要使用和之前同样的步骤来抽取特征。 不同之处在于,我们在 transformer 上调用 transform 方法而不是 fit_transform 方法,因为这些特征已经在训练集上拟合过了。
>>> docs_new = ['God is love', 'OpenGL on the GPU is fast']
>>> X_new_counts = count_vect.transform(docs_new)
>>> X_new_tfidf = tfidf_transformer.transform(X_new_counts)
>>> predicted = clf.predict(X_new_tfidf)
朴素贝叶斯文本分类二分类例子:
# -*- encoding=utf-8 -*-
# Imports: sklearn for the model/metrics, jieba for Chinese word segmentation.
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import jieba
import sklearn
from sklearn.metrics import classification_report
import jieba.analyse
# Disable scientific notation in numpy array printing.
import numpy as np
np.set_printoptions(suppress=True)
# Load custom user dictionaries so domain terms segment as single tokens.
# NOTE(review): hard-coded Windows paths — adjust for your environment.
jieba.load_userdict(r'F:\文本标签\文本反垃圾\恶意推广文本分类\第一类赛事主体标签.txt')
jieba.load_userdict(r'F:\文本标签\文本反垃圾\恶意推广文本分类\第二类网络主体标签.txt')
# Load a custom stop-word list used by jieba's keyword-extraction module.
jieba.analyse.set_stop_words(r'F:\文本标签\文本反垃圾\恶意推广文本分类\stopwords.txt')
# 数据预处理
def data_preprocessing(df):
    """Tokenize the corpus with jieba and split features from labels.

    :param df: DataFrame with a ``content`` text column and a ``type``
               label column
    :return: tuple ``(news_data, news_target)`` — the space-joined
             tokenized text Series and the label Series

    Fix: the original wrote a new ``text`` column into ``df`` in place,
    mutating the caller's DataFrame as a side effect; we now derive new
    Series without touching ``df``.
    """
    # Space-join jieba tokens so TfidfVectorizer's default tokenizer
    # (whitespace/word splitting) sees one token per word.
    news_data = df['content'].apply(lambda x: " ".join(jieba.cut(x))).rename('text')
    news_target = df['type']
    return news_data, news_target
# 拆分训练数据集和测试集
def data_spilt_train_test(news_data,news_target):
    """Partition the corpus into training and test sets.

    :param news_data: tokenized text Series
    :param news_target: label Series
    :return: ``x_train, x_test, y_train, y_test``
    """
    # Hold out 25% for testing; fixed seed keeps the split reproducible.
    split = train_test_split(
        news_data,
        news_target,
        test_size=0.25,
        random_state=1028,
    )
    x_train, x_test, y_train, y_test = split
    return x_train, x_test, y_train, y_test
# 模型训练
def model_train(x_train, x_test, y_train, y_test):
    """Fit a TF-IDF + multinomial naive-Bayes pipeline and print metrics.

    :param x_train: training texts (space-joined tokens)
    :param x_test: test texts
    :param y_train: training labels
    :param y_test: test labels
    :return: tuple ``(mlb, tf)`` — fitted classifier and fitted vectorizer
    """
    # TF-IDF features: fit the vocabulary on the training split only,
    # then reuse the same vocabulary for the test split.
    tf = TfidfVectorizer()
    train_matrix = tf.fit_transform(x_train)
    test_matrix = tf.transform(x_test)
    # Multinomial naive Bayes with Laplace smoothing (alpha=1).
    mlb = MultinomialNB(alpha=1)
    mlb.fit(train_matrix, y_train)
    # Report accuracy and AUC on both splits (labels kept byte-for-byte,
    # including the original's "测试AUC值" wording).
    report_rows = (
        ("训练集正确率:%.4f%%\n", "训练集AUC值:%.6f\n", train_matrix, y_train),
        ("测试集正确率:%.4f%%\n", "测试AUC值:%.6f\n", test_matrix, y_test),
    )
    for acc_fmt, auc_fmt, matrix, labels in report_rows:
        accuracy, auc = evaluate(mlb, matrix, labels)
        print(acc_fmt % (accuracy * 100))
        print(auc_fmt % (auc))
    # Per-class precision/recall/F1 on the test split.
    y_predict = mlb.predict(test_matrix)
    print(classification_report(y_test, y_predict, target_names=['0', '1']))
    return mlb, tf
# 模型评估
def evaluate(model, X, y):
    """Score a fitted classifier on one dataset.

    :param model: fitted estimator exposing ``score`` and ``predict_proba``
    :param X: feature matrix
    :param y: true binary labels (positive class is 1)
    :return: tuple ``(accuracy, auc)``
    """
    # Probability of the positive class drives the ROC curve.
    positive_scores = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = sklearn.metrics.roc_curve(y, positive_scores, pos_label=1)
    auc_value = sklearn.metrics.auc(fpr, tpr)
    return model.score(X, y), auc_value
# 模型预测
def model_predict(text,model,tf):
    """Classify one raw text and report the confidence of that class.

    :param text: a single untokenized document (str)
    :param model: fitted naive-Bayes classifier
    :param tf: fitted TF-IDF vectorizer
    :return: tuple ``(predict_type, result_prob)`` — predicted class and
             the rounded probability of the predicted class
    """
    # Segment with jieba exactly as the training data was, then vectorize
    # with the already-fitted vocabulary.
    segmented = [" ".join(jieba.cut(text))]
    features = tf.transform(segmented)
    predict_type = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]
    # Report the probability belonging to whichever class was predicted
    # (column 1 for the positive class, column 0 otherwise).
    chosen_prob = probabilities[1] if predict_type == 1 else probabilities[0]
    result_prob = round(chosen_prob, 3)
    return predict_type, result_prob
if __name__ == '__main__':
    # Driver: load the labelled corpus, train the pipeline, then classify
    # one new text. NOTE(review): hard-coded Windows path — adjust locally.
    # Read tab-separated training data (expects `content` and `type` columns).
    df = pd.read_table('F:/1.6期反垃圾改版/1.7期反垃圾升级/恶意推广/恶意推广训练文本.txt', sep='\t')
    news_data, news_target=data_preprocessing(df)
    x_train, x_test, y_train, y_test=data_spilt_train_test(news_data,news_target)
    mlb,tf=model_train(x_train, x_test, y_train, y_test)
    # Predict a single new text with the fitted classifier and vectorizer.
    text="既然选择相信,那就等他涅槃重生吧"
    predict_type, predict_prob=model_predict(text,mlb,tf)
    print(predict_type)
    print(predict_prob)
运行效果:
E:\laidefa\python.exe F:/文本标签/文本反垃圾/恶意推广文本分类/sklearn朴素贝叶斯分类模型.py
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xiaohu\AppData\Local\Temp\jieba.cache
Loading model cost 0.864 seconds.
Prefix dict has been built succesfully.
训练集正确率:91.8158%
训练集AUC值:0.984676
测试集正确率:85.6120%
测试AUC值:0.954850
precision recall f1-score support
0 0.82 0.92 0.87 711
1 0.91 0.79 0.84 686
micro avg 0.86 0.86 0.86 1397
macro avg 0.86 0.85 0.86 1397
weighted avg 0.86 0.86 0.86 1397
0
0.501
Process finished with exit code 0