1. For hyper-parameter tuning, run a grid search over commonly used ranges to find a good value.
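A minimal sketch of that tuning step, assuming the LGBMClassifier and the x_train / y_train arrays from the demo below; the parameter grid is illustrative, not one taken from the original post:

from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

param_grid = {
    "learning_rate": [0.05, 0.1, 0.2],  # commonly tried step sizes
    "max_depth": [3, 4, 5],             # keep the trees shallow
    "n_estimators": [20, 50, 100],      # number of boosting rounds
}
search = GridSearchCV(lgb.LGBMClassifier(objective="multiclass"),
                      param_grid, cv=3, scoring="accuracy")
search.fit(x_train, y_train)  # exhaustively tries every combination with 3-fold CV
print(search.best_params_, search.best_score_)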
2. Personally, I think of lightgbm as a lightweight xgboost.
3. demo
# coding=utf-8
"""
lightgbm + tfidf for a natural-language-processing classification task.
For the machine-learning side, tune hyper-parameters with a grid search over commonly used ranges to find a good value.
note:
sparse_result = tfidf_model.transform(document)  # the tf-idf matrix, in sparse representation
print(sparse_result)
# (0, 3) 0.814802474667
# (0, 2) 0.579738671538
# (1, 2) 0.449436416524
# (1, 1) 0.631667201738
# (1, 0) 0.631667201738
print(sparse_result.todense())  # convert to the more readable dense matrix
# [[ 0.          0.          0.57973867  0.81480247]
#  [ 0.6316672   0.6316672   0.44943642  0.        ]]
print(tfidf_model.vocabulary_)  # mapping from each word to its column index
# {'have': 2, 'pen': 3, 'an': 0, 'apple': 1}
"""
import joblib  # sklearn.externals.joblib was removed from scikit-learn; import it directly
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
def load_data(csv_path):
    """
    Load the data.
    :param csv_path: path to the CSV file
    :return: train/test splits of the tf-idf features and the labels
    """
    raw_data = pd.read_csv(csv_path, error_bad_lines=True)  # raise on malformed rows (renamed on_bad_lines in pandas >= 1.3)
    documents = raw_data[""].values.tolist()  # TODO: fill in the text column name of your CSV
    labels = raw_data[""].values.tolist()  # TODO: fill in the label column name of your CSV
    # encode the string labels as integer ids
    label_ids = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    y = [label_ids[label] for label in labels]
    x, _ = words_df(documents)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=33)
    return x_train, x_test, y_train, y_test
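# The CSV layout below is a sketch (the post leaves both column names blank): a text
# column holding space-separated tokens and a label column, e.g.
#
#   text,label
#   今天 天气 很 好,weather
#   股市 大幅 上涨,finance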
def words_df(documents):
    """
    Build the tf-idf sparse matrix for the documents.
    :param documents: list of tokenised documents
    :return: sparse tf-idf matrix and the word-to-column dictionary
    """
    # tfidf_model5 = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b",
    #                                vocabulary={"我": 0, "呀": 1, "!": 2}).fit(documents)
    # token_pattern=r"(?u)\b\w+\b", stop_words and vocabulary can all be customised
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.05).fit(documents)
    # dictionary mapping each word to its column position
    word_dict = vectorizer.vocabulary_
    # save the vocabulary
    # feature_path = './model/feature.pkl'
    # with open(feature_path, 'wb') as fw:
    #     pickle.dump(vectorizer.vocabulary_, fw)
    # save the fitted transformer
    # tfidftransformer_path = 'model/tfidftransformer.pkl'
    # with open(tfidftransformer_path, 'wb') as fw:
    #     pickle.dump(tfidftransformer, fw)
    # print(tfidftransformer.todense())  # dense view of the vectors
    tfidftransformer = vectorizer.transform(documents)  # build the sparse matrix
    print(tfidftransformer.shape)
    # joblib.dump(vectorizer, "./model/df.model")
    return tfidftransformer, word_dict
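# A sketch (assumption, not shown in the post) of reusing the fitted vectorizer at
# prediction time instead of refitting it on new data:
#
#   joblib.dump(vectorizer, "./model/df.model")     # after fitting, inside words_df
#   vectorizer = joblib.load("./model/df.model")    # later, in the serving process
#   features = vectorizer.transform(new_documents)  # same columns as at training time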
if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_data("../dataset/_0.csv")
    # feature selection
    clf_model = ExtraTreesClassifier(n_estimators=20, random_state=0)
    clf_model.fit(x_train, y_train)
    # word_list = [i[0] for i in sorted(word_dict.items(), key=lambda x: x[1], reverse=False)]
    # get the importance of each word feature and keep only the features above the mean
    importances = clf_model.feature_importances_
    # print(dict(zip(word_list, importances)))
    model = SelectFromModel(clf_model, threshold='1*mean', prefit=True)
    # model.max_features
    # joblib.dump(model, "importances.model")
    # update the training data to the selected features (see the sketch below)
    x_train, x_test = model.transform(x_train), model.transform(x_test)
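    # A sketch (assumption) of mapping the kept columns back to words, supposing
    # load_data also passed word_dict from words_df back to this scope:
    #
    #   kept = model.get_support(indices=True)       # column indices that survived
    #   index_to_word = {v: k for k, v in word_dict.items()}
    #   print([index_to_word[i] for i in kept])      # the selected vocabulary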
    # lgb.Dataset is the binned format consumed by LightGBM's native lgb.train API; the
    # sklearn-style LGBMClassifier below takes the plain arrays directly, so it goes unused here
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_val = [(x_test, y_test)]  # eval_set expects a list of (X, y) tuples
    # LGBMClassifier
    model = lgb.LGBMClassifier(learning_rate=0.1, max_depth=4, n_estimators=18,
                               num_leaves=5, objective='multiclass', n_jobs=-1,
                               bagging_fraction=0.8, feature_fraction=0.8,
                               lambda_l1=0.1, lambda_l2=0.5)
    # in lightgbm >= 4.0, pass callbacks=[lgb.early_stopping(1000)] instead of early_stopping_rounds
    lgb_model = model.fit(x_train, y_train, eval_set=lgb_val, early_stopping_rounds=1000)
    # save the model
    # joblib.dump(model, "model/lgb.model")
    # predict
    y_preds = model.predict(x_test)
    print(y_preds)
    print("test accuracy:", accuracy_score(y_test, y_preds))  # accuracy on the held-out split
    # evaluate with 5-fold cross-validation on the held-out split
    scores = cross_val_score(lgb_model, x_test, y_test, cv=5)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Original post: https://www.cnblogs.com/xiennnnn/p/12155154.html