1. For hyper-parameter tuning, run a grid search over commonly used ranges to find a good value.
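A minimal sketch of that tuning step, assuming the LGBMClassifier and the x_train / y_train arrays from the demo below; the parameter grid is illustrative, not one taken from the original post:

from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

param_grid = {
    "learning_rate": [0.05, 0.1, 0.2],  # commonly tried step sizes
    "max_depth": [3, 4, 5],             # keep the trees shallow
    "n_estimators": [20, 50, 100],      # number of boosting rounds
}
search = GridSearchCV(lgb.LGBMClassifier(objective="multiclass"),
                      param_grid, cv=3, scoring="accuracy")
search.fit(x_train, y_train)  # exhaustively tries every combination with 3-fold CV
print(search.best_params_, search.best_score_)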
2. Personally, I think of lightgbm as a lightweight xgboost.
3. demo
# coding=utf-8
"""
lightgbm + tfidf for a natural-language-processing classification task.
For the machine-learning side, tune hyper-parameters with a grid search over commonly used ranges to find a good value.
note:
sparse_result = tfidf_model.transform(document)  # the tf-idf matrix, in sparse representation
print(sparse_result)
# (0, 3) 0.814802474667
# (0, 2) 0.579738671538
# (1, 2) 0.449436416524
# (1, 1) 0.631667201738
# (1, 0) 0.631667201738
print(sparse_result.todense())  # convert to the more readable dense matrix
# [[ 0.          0.          0.57973867  0.81480247]
#  [ 0.6316672   0.6316672   0.44943642  0.        ]]
print(tfidf_model.vocabulary_)  # mapping from each word to its column index
# {'have': 2, 'pen': 3, 'an': 0, 'apple': 1}
"""
import joblib  # sklearn.externals.joblib was removed from scikit-learn; import it directly
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
def load_data(csv_path):
    """
    Load the data.
    :param csv_path: path to the CSV file
    :return: train/test splits of the tf-idf features and the labels
    """
    raw_data = pd.read_csv(csv_path, error_bad_lines=True)  # raise on malformed rows (renamed on_bad_lines in pandas >= 1.3)
    documents = raw_data[""].values.tolist()  # TODO: fill in the text column name of your CSV
    labels = raw_data[""].values.tolist()  # TODO: fill in the label column name of your CSV
    # encode the string labels as integer ids
    label_ids = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    y = [label_ids[label] for label in labels]
    x, _ = words_df(documents)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=33)
    return x_train, x_test, y_train, y_test
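# The CSV layout below is a sketch (the post leaves both column names blank): a text
# column holding space-separated tokens and a label column, e.g.
#
#   text,label
#   今天 天气 很 好,weather
#   股市 大幅 上涨,finance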
def words_df(documents):
    """
    Build the tf-idf sparse matrix for the documents.
    :param documents: list of tokenised documents
    :return: sparse tf-idf matrix and the word-to-column dictionary
    """
    # tfidf_model5 = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
    #                                vocabulary={"我": 0, "呀": 1, "!": 2}).fit(documents)
    # token_pattern=r"(?u)\b\w+\b", stop_words and vocabulary can all be customised
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.05).fit(documents)
    # dictionary mapping each word to its column position
    word_dict = vectorizer.vocabulary_
    # save the vocabulary
    # feature_path = './model/feature.pkl'
    # with open(feature_path, 'wb') as fw:
    #     pickle.dump(vectorizer.vocabulary_, fw)
    # save the fitted transformer
    # tfidftransformer_path = 'model/tfidftransformer.pkl'
    # with open(tfidftransformer_path, 'wb') as fw:
    #     pickle.dump(tfidftransformer, fw)
    # print(tfidftransformer.todense())  # dense view of the vectors
    tfidftransformer = vectorizer.transform(documents)  # build the sparse matrix
    print(tfidftransformer.shape)
    # joblib.dump(vectorizer, "./model/df.model")
    return tfidftransformer, word_dict
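# A sketch (assumption, not shown in the post) of reusing the fitted vectorizer at
# prediction time instead of refitting it on new data:
#
#   joblib.dump(vectorizer, "./model/df.model")     # after fitting, inside words_df
#   vectorizer = joblib.load("./model/df.model")    # later, in the serving process
#   features = vectorizer.transform(new_documents)  # same columns as at training time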
if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data("../dataset/_0.csv")
    # feature selection
    clf_model = ExtraTreesClassifier(n_estimators=20, random_state=0)
    clf_model.fit(x_train, y_train)
    # word_list = [i[0] for i in sorted(word_dict.items(), key=lambda x: x[1], reverse=False)]
    # get the importance of each word feature and keep only the features above the mean
    importances = clf_model.feature_importances_
    # print(dict(zip(word_list, importances)))
    model = SelectFromModel(clf_model, threshold='1*mean', prefit=True)
    # model.max_features
    # joblib.dump(model, "importances.model")
    # update the training data to the selected features (see the sketch below)
    x_train, x_test = model.transform(x_train), model.transform(x_test)
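    # A sketch (assumption) of mapping the kept columns back to words, supposing
    # load_data also passed word_dict from words_df back to this scope:
    #
    #   kept = model.get_support(indices=True)       # column indices that survived
    #   index_to_word = {v: k for k, v in word_dict.items()}
    #   print([index_to_word[i] for i in kept])      # the selected vocabulary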
    # lgb.Dataset is the binned format consumed by LightGBM's native lgb.train API; the
    # sklearn-style LGBMClassifier below takes the plain arrays directly, so it goes unused here
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_val = [(x_test, y_test)]  # eval_set expects a list of (X, y) tuples
    # LGBMClassifier
    model = lgb.LGBMClassifier(learning_rate=0.1, max_depth=4, n_estimators=18,
                               num_leaves=5, objective='multiclass', n_jobs=-1,
                               bagging_fraction=0.8, feature_fraction=0.8,
                               lambda_l1=0.1, lambda_l2=0.5)
    # in lightgbm >= 4.0, pass callbacks=[lgb.early_stopping(1000)] instead of early_stopping_rounds
    lgb_model = model.fit(x_train, y_train, eval_set=lgb_val, early_stopping_rounds=1000)
    # save the model
    # joblib.dump(model, "model/lgb.model")
    # predict
    y_preds = model.predict(x_test)
    print(y_preds)
    print("test accuracy:", accuracy_score(y_test, y_preds))  # accuracy on the held-out split
    # evaluate with 5-fold cross-validation on the held-out split
    scores = cross_val_score(lgb_model, x_test, y_test, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Original post: https://www.cnblogs.com/xiennnnn/p/12155154.html