Backward elimination is a variable-selection method for multiple linear regression.
First a significance threshold is set, i.e. the level of significance we require of every variable that stays in the model (0.05 is used below).
At each step the variable least related to the response, namely the one with the largest p-value, is removed, and this is repeated until all remaining variables meet the threshold.
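Before the full script, the core loop can be sketched in a few lines (an illustrative sketch only, assuming a feature matrix X, a target vector y and a p-value threshold; the implementation below additionally checks adjusted R² before committing to each removal):

import numpy as np
import statsmodels.api as sm

def backward_select(X, y, threshold):
    # Fit OLS, find the largest p-value, and drop that column if it exceeds
    # the threshold; repeat until every remaining variable is significant.
    while X.shape[1] > 1:
        pvalues = np.asarray(sm.OLS(y, X).fit().pvalues)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= threshold:
            break
        X = np.delete(X, worst, axis=1)
    return X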
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
def backwardElimination(x_train, y_train, flo):
    # Backward elimination with an adjusted-R^2 rollback: repeatedly drop the
    # variable whose p-value is largest and above the threshold flo, but undo
    # the removal (and stop) if adjusted R^2 does not improve.
    n_rows, n_vars = x_train.shape
    tmp = np.zeros((n_rows, n_vars))              # stores removed columns for rollback
    for i in range(0, n_vars):
        regressor_OLS = sm.OLS(endog=y_train, exog=x_train).fit()
        maxn = max(regressor_OLS.pvalues)         # largest p-value in the current model
        adj_b = regressor_OLS.rsquared_adj        # adjusted R^2 before removal
        if maxn > flo:
            for j in range(0, n_vars - i):
                if regressor_OLS.pvalues[j] == maxn:
                    tmp[:, j] = x_train[:, j]     # remember the column about to be dropped
                    x_train = np.delete(x_train, j, axis=1)
                    regressor_OLS = sm.OLS(endog=y_train, exog=x_train).fit()
                    adj_a = regressor_OLS.rsquared_adj   # adjusted R^2 after removal
                    if adj_a <= adj_b:
                        # Removal did not improve adjusted R^2: restore the column
                        # at its original position and stop.
                        print(regressor_OLS.summary())
                        return np.insert(x_train, j, tmp[:, j], axis=1)
                    else:
                        continue
    print(regressor_OLS.summary())
    return x_train
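One detail worth noting (not addressed in the original code): statsmodels' sm.OLS does not add an intercept column automatically, so the regressions above are fit without a bias term. If an intercept is wanted, the design matrix can be augmented with sm.add_constant before running the elimination, for example (a small self-contained sketch on synthetic data):

import numpy as np
import statsmodels.api as sm

# sm.OLS has no intercept by default; add_constant prepends a column of ones
# so the fitted model also estimates a bias term.
X = np.random.rand(100, 3)
y = 2.0 + X @ np.array([1.0, -0.5, 3.0]) + 0.1 * np.random.randn(100)
model = sm.OLS(endog=y, exog=sm.add_constant(X)).fit()
print(model.params)    # first entry is the estimated intercept (about 2.0)
print(model.pvalues)   # first p-value belongs to the constant term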
data = pd.read_csv('house_data.csv')
print(data.shape)
x_data = data.iloc[:, 2:].values              # use columns from index 2 onward as features
print(x_data.shape)
y_data = data['price'].values                 # target: sale price
class_names = np.array(['sqft_living', 'grade', 'sqft_above'])
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)
clf = LinearRegression()
clf.fit(x_train, y_train)                     # baseline model fit on all features
print(x_train.shape)
x_train = backwardElimination(x_train, y_train, 0.05)   # keep only significant variables
print(x_train.shape)                          # fewer columns after elimination
y_pred = clf.predict(x_test)
print(y_pred)
print(y_test)
print(clf.score(x_test, y_test))              # R^2 of the baseline model on the test set
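A final remark on the script: LinearRegression is fit on all features before the elimination, so the reduced matrix returned by backwardElimination is only inspected through its shape and never used for prediction. One way to actually evaluate the selected subset on the test set is to track which column indices survive and apply the same selection to the test matrix. The sketch below is a hypothetical variant, not part of the original post; it uses a fresh train/test split because x_train has already been overwritten at this point:

def backward_indices(X, y, threshold):
    # Same elimination idea, but return the indices of the surviving columns
    # so the identical selection can be applied to the test matrix.
    keep = list(range(X.shape[1]))
    while len(keep) > 1:
        pvalues = np.asarray(sm.OLS(y, X[:, keep]).fit().pvalues)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= threshold:
            break
        del keep[worst]
    return keep

x_tr, x_te, y_tr, y_te = train_test_split(x_data, y_data, test_size=0.2)
keep = backward_indices(x_tr, y_tr, 0.05)
clf_sel = LinearRegression().fit(x_tr[:, keep], y_tr)
print(clf_sel.score(x_te[:, keep], y_te))     # R^2 using only the selected variables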
Source: https://www.cnblogs.com/gcyyzf/p/13584751.html