Backward elimination is a variable selection method for multiple linear regression problems.
First, we set a threshold: the significance level that each predictor in the model must satisfy.
At each step, the variable contributing least to the current model (the one with the largest p-value) is removed and the model is refitted, until every remaining variable meets the threshold.
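To make that loop concrete, here is a minimal sketch of p-value-based backward elimination with statsmodels. The names X, y and the 0.05 default are placeholders rather than anything from the original post; the full script below additionally checks adjusted R-squared before committing to a removal.

import numpy as np
import statsmodels.api as sm

def eliminate_by_pvalue(X, y, threshold=0.05):
    # Refit an OLS model and drop the least significant predictor until every
    # remaining predictor has a p-value at or below the threshold.
    X = np.asarray(X, dtype=float)
    while X.shape[1] > 0:
        model = sm.OLS(y, X).fit()
        worst = int(np.argmax(model.pvalues))  # column with the largest p-value
        if model.pvalues[worst] <= threshold:
            break                              # all predictors are significant
        X = np.delete(X, worst, axis=1)        # remove it and refit
    return X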
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

def backwardElimination(x_train, y_train, flo):
    # Drop the predictor with the largest p-value, refit, and repeat until every
    # remaining p-value is below the significance threshold `flo`. If dropping a
    # predictor lowers adjusted R-squared, restore it and stop.
    num_vars = x_train.shape[1]
    for i in range(0, num_vars):
        regressor_OLS = sm.OLS(endog=y_train, exog=x_train).fit()
        maxn = max(regressor_OLS.pvalues)
        adj_before = regressor_OLS.rsquared_adj
        if maxn > flo:
            for j in range(0, num_vars - i):
                if regressor_OLS.pvalues[j] == maxn:
                    removed = x_train[:, j].copy()  # keep a copy for rollback
                    x_train = np.delete(x_train, j, axis=1)
                    regressor_OLS = sm.OLS(endog=y_train, exog=x_train).fit()
                    adj_after = regressor_OLS.rsquared_adj
                    if adj_after <= adj_before:
                        # Removing the column hurt adjusted R-squared: restore it and stop.
                        x_train = np.insert(x_train, j, removed, axis=1)
                        regressor_OLS = sm.OLS(endog=y_train, exog=x_train).fit()
                        print(regressor_OLS.summary())
                        return x_train
                    break
        else:
            break
    print(regressor_OLS.summary())
    return x_train

data = pd.read_csv('house_data.csv')
print(data.shape)
x_data = data.iloc[:, 2:].values
print(x_data.shape)
y_data = data['price'].values
class_names = np.array(['sqft_living', 'grade', 'sqft_above'])  # feature names of interest (not used below)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

clf = LinearRegression()
clf.fit(x_train, y_train)  # fitted on all predictors

print(x_train.shape)
x_train = backwardElimination(x_train, y_train, 0.05)
print(x_train.shape)       # shape after backward elimination

y_pred = clf.predict(x_test)
print(y_pred)
print(y_test)
print(clf.score(x_test, y_test))
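Note that in the script above, LinearRegression is fitted on all of the original predictors, so the eliminated columns never affect the final score; the reduced training matrix is only printed. A small variation that returns the surviving column indices would let the same subset be applied to x_test and the model be refitted on it. The function name selected_columns and the usage shown in the comments are illustrative, not part of the original post.

import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

def selected_columns(X, y, threshold=0.05):
    # Same p-value-based backward elimination, but return the surviving column
    # indices so the identical subset can be applied to the test matrix.
    keep = list(range(X.shape[1]))
    while keep:
        model = sm.OLS(y, X[:, keep]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= threshold:
            break
        del keep[worst]
    return keep

# Hypothetical usage with the arrays from the script above:
# cols = selected_columns(x_train, y_train, 0.05)
# clf = LinearRegression().fit(x_train[:, cols], y_train)
# print(clf.score(x_test[:, cols], y_test))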
Source: https://www.cnblogs.com/gcyyzf/p/13584751.html