首页 > 其他 > 详细

逻辑回归

时间:2020-01-06 11:25:24      阅读:101      评论:0      收藏:0      [点我收藏+]

1. 回归就是计算回归系数,通过回归系数线性组合属性预测数值结果,回归以误差平方和最小为目的,其实是在假定误差服从高斯分布

2. 

LogisticRegression:
根据特征和对应标签 生成分类模型(下面的 demo 使用的是 LogisticRegression,而非 LinearRegression)

  

 

3. demo

#!/usr/bin/env python3
# -*- coding:utf8 -*-
"""
Bag-of-words text-classification demo: vectorize reviews with
TF-IDF / token counts and classify them with logistic regression.
"""
import warnings
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument
import pandas as pd
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import bsr_matrix
import numpy as np
# silence sklearn/gensim deprecation noise for the demo
warnings.filterwarnings('ignore')


class BagOfWords(object):
    """Bag-of-words feature extractor plus an L2 logistic-regression classifier."""

    def __init__(self, vocab=False, tfidf=False, max_feature=1000):
        """Build the vectorizer (the classifier is created lazily in train_lr).

        :param vocab: if True, restrict the vocabulary to the chi-square-selected
            features read from ../input/feature_chi.txt
        :param tfidf: use TfidfVectorizer if True, otherwise CountVectorizer
        :param max_feature: maximum vocabulary size
        """
        lab_fea = None
        if vocab:
            print("select features...")
            # NOTE(review): select_feature() keys on the first token of each
            # line; "1" is presumably the positive-class label — confirm.
            lab_fea = select_feature('../input/feature_chi.txt', max_feature)["1"]
        self.vectorizer = None
        if tfidf:
            print("USE TfidfVectorizer", max_feature)
            self.vectorizer = TfidfVectorizer(analyzer="word",
                                              tokenizer=None,
                                              preprocessor=None,
                                              stop_words=None,
                                              vocabulary=lab_fea,
                                              max_features=max_feature)
        else:
            print("USE CountVectorizer", max_feature)
            self.vectorizer = CountVectorizer(analyzer="word",
                                              tokenizer=None,
                                              preprocessor=None,
                                              stop_words=None,
                                              vocabulary=lab_fea,
                                              max_features=max_feature)
        # fitted estimator; set by train_lr()
        self.lr = None

    @staticmethod
    def _new_lr(c):
        """Create a fresh L2-regularized LogisticRegression with inverse strength c."""
        return LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=c,
                                  fit_intercept=True, intercept_scaling=1.0,
                                  class_weight=None, random_state=None)

    def train_lr(self, train_data, lab_data, c=1.0):
        """Fit the vectorizer on train_data and train the classifier.

        :param train_data: iterable of raw text documents
        :param lab_data: labels aligned with train_data
        :param c: inverse regularization strength
        :return: None (stores the fitted model on self.lr)
        """
        print("train_data", len(train_data))
        train_data_features = self.vectorizer.fit_transform(train_data)
        # convert to a block-sparse-row matrix before fitting
        train_data_features = bsr_matrix(train_data_features)
        print("Training the logistic regression...")
        self.lr = self._new_lr(c)
        self.lr = self.lr.fit(train_data_features, lab_data)

    def test_lr(self, test_data):
        """Return class probabilities for test_data.

        train_lr() must have been called first: it fits both the vectorizer
        and the classifier this method relies on.
        """
        test_data_features = self.vectorizer.transform(test_data)
        test_data_features = bsr_matrix(test_data_features)
        return self.lr.predict_proba(test_data_features)

    def validate_lr(self, train_data, lab_data, c=1.0):
        """Return the mean 10-fold cross-validated ROC-AUC for a fresh classifier."""
        train_data_features = self.vectorizer.fit_transform(train_data)
        train_data_features = bsr_matrix(train_data_features)  # sparse matrix
        lab_data = np.array(lab_data)
        print("start k-fold validate...")
        return np.mean(cross_val_score(self._new_lr(c), train_data_features,
                                       lab_data, cv=10, scoring='roc_auc'))


def tag_reviews(reviews, prefix):
    """Wrap each review in a gensim TaggedDocument for doc2vec training.

    Each document carries its whitespace-tokenized words plus a single tag
    "<prefix>_<index>" that records which review it came from.

    :param reviews: iterable of raw text reviews
    :param prefix: tag prefix, e.g. 'TRAIN' / 'TEST' / 'UNTRAIN'
    :return: list of TaggedDocument
    """
    tagged = []
    for i, review in enumerate(reviews):
        tagged.append(TaggedDocument(words=review.split(), tags=[prefix + '_%s' % i]))
    return tagged


def select_feature(filePath, k):
    """Select the top-k features per label from a chi-square feature file.

    Each line of the file looks like: "<label> <word>:<score> <word>:<score> ...".
    When a line carries at most k features, all of them are kept; otherwise a
    min-heap retains the k highest-scoring words.

    :param filePath: path to the feature file
    :param k: maximum number of features to keep per label
    :return: dict mapping label -> list of selected words (ascending by score
        when the heap path is taken)
    """
    lab_fea = {}
    with open(filePath, 'r') as read:
        for line in read:
            line_arr = line.strip().split()
            if not line_arr:
                # skip blank lines (line_arr[0] would raise IndexError)
                continue
            if len(line_arr) - 1 <= k:
                lab_fea[line_arr[0]] = [kv.split(':')[0] for kv in line_arr[1:]]
            else:
                # min-heap of (score, word); the root is the weakest kept feature
                heap = []
                for kv in line_arr[1:]:
                    if len(kv.split(":")) == 2:
                        key, val = kv.split(':')
                        if len(heap) < k:
                            heapq.heappush(heap, (float(val), key))
                        elif float(val) > heap[0][0]:
                            # evict the weakest and insert in one O(log k) step
                            heapq.heapreplace(heap, (float(val), key))
                lab_fea[line_arr[0]] = [heapq.heappop(heap)[1] for _ in range(len(heap))]
    return lab_fea


def deal_data():
    """Load and clean the data sets (placeholder).

    The real implementation should read the raw reviews, strip HTML/stop
    words, and return them cleaned. Returning a 4-tuple of placeholders keeps
    the 4-way unpacking in __main__ from raising TypeError on the stub.

    :return: (train, clean_train_reviews, clean_test_reviews,
        clean_unlabeled_reviews)
    """
    return None, [], [], []


if __name__ == '__main__':
    # Load and clean the corpora, then tag them for doc2vec.
    train, clean_train_reviews, clean_test_reviews, clean_unlabeled_reviews = deal_data()
    # Example bag-of-words + logistic-regression usage:
    # bow = BagOfWords(vocab=True, tfidf=True, max_feature=19000)
    # bow.train_lr(clean_train_reviews, list(train["sentiment"][:10]), c=1)
    # result = bow.test_lr(clean_test_reviews)

    train_tagged = tag_reviews(clean_train_reviews, 'TRAIN')
    test_tagged = tag_reviews(clean_test_reviews, 'TEST')
    unlabeled_train_tagged = tag_reviews(clean_unlabeled_reviews, 'UNTRAIN')
    print(train_tagged)

逻辑回归

原文:https://www.cnblogs.com/xiennnnn/p/12155085.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!