# encoding: utf-8
from numpy import *


def loadDataSet():  # load the test data set
    dataMat = []
    labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        # prepend a constant 1.0 so weights[0] acts as the intercept
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):  # sigmoid function
    return 1.0 / (1 + exp(-inX))


def gradAscent(dataMatIn, classLabels):  # batch gradient ascent
    dataMatrix = mat(dataMatIn)              # convert to a NumPy matrix
    labelMat = mat(classLabels).transpose()  # convert to a NumPy matrix, transposed into a column vector
    m, n = shape(dataMatrix)
    alpha = 0.001    # step size
    maxCycles = 500  # number of iterations
    weights = ones((n, 1))
    for k in range(maxCycles):  # run gradient ascent for maxCycles iterations
        h = sigmoid(dataMatrix * weights)
        error = labelMat - h
        # Why this update? It is the gradient of the log-likelihood; see the
        # derivation note after the code, the attachment below, or
        # http://download.csdn.net/detail/lewsn2008/6547463, which sums it up very well.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def plotBestFit(weights):  # plot the data set and the fitted decision boundary
    import matplotlib.pyplot as plt
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    # weights = weights.getA()
    n = shape(dataArr)[0]
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1])
            ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1])
            ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # the boundary is where 0 = w0 + w1*x1 + w2*x2, i.e. sigmoid(...) = 0.5
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


# Gradient ascent traverses the entire data set every time the regression
# coefficients are updated. That is acceptable for a data set of about 100
# samples, but the computational cost grows far too high on larger data. One
# improvement is to update the coefficients with only one sample at a time;
# this is called stochastic gradient ascent. Because the classifier can be
# updated incrementally as new samples arrive, stochastic gradient ascent is
# an online learning algorithm.
def stocGradAscent0(dataMatrix, classLabels):  # stochastic gradient ascent
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)  # initialize to all ones
    for i in range(m):
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


def useStocGradAscent0():  # test stochastic gradient ascent
    dataMat, labelMat = loadDataSet()
    weights = stocGradAscent0(array(dataMat), labelMat)
    plotBestFit(weights)


def useStocGradAscent1():  # test the improved stochastic gradient ascent
    dataMat, labelMat = loadDataSet()
    weights = stocGradAscent1(array(dataMat), labelMat)
    plotBestFit(weights)


def stocGradAscent1(dataMatrix, classLabels, numIter=150):  # improved stochastic gradient ascent
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))  # sample indices not yet used in this pass
        for i in range(m):
            alpha = 4 / (1.0 + j + i) + 0.0001  # alpha shrinks on every update but never reaches 0
            randIndex = int(random.uniform(0, len(dataIndex)))  # pick a random remaining sample
            # index through dataIndex so each pass samples without replacement
            # (the original indexed dataMatrix[randIndex] directly and never
            # used the values in dataIndex, a well-known erratum)
            h = sigmoid(sum(dataMatrix[dataIndex[randIndex]] * weights))
            error = classLabels[dataIndex[randIndex]] - h
            weights = weights + alpha * error * dataMatrix[dataIndex[randIndex]]
            del dataIndex[randIndex]
    return weights


def classifyVector(inX, weights):  # classify: probability above 0.5 means class 1
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0
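A quick usage sketch may help tie these pieces together. The helper below is hypothetical and not part of the original post: it trains on testSet.txt with the improved stochastic gradient ascent and classifies one made-up point. Note that a new sample must carry the leading 1.0 bias term, exactly as loadDataSet prepends it.

def demoClassify():  # hypothetical helper, not in the original code
    dataMat, labelMat = loadDataSet()
    weights = stocGradAscent1(array(dataMat), labelMat, numIter=150)
    sample = array([1.0, 0.5, 0.5])  # made-up point; the leading 1.0 is the bias term
    print("predicted class: %d" % classifyVector(sample, weights))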
def colicTest():
    frTrain = open('horseColicTraining.txt')  # training file
    frTest = open('horseColicTest.txt')       # test file
    trainingSet = []
    trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):  # 21 features per sample
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))  # column 21 holds the label
    # train with the improved stochastic gradient ascent
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0
    numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        # classify the test sample and check it against the true label
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def multiTest():  # run colicTest numTests times and report the average error rate
    numTests = 10
    errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))


if __name__ == '__main__':
    # dataMat, labelMat = loadDataSet()
    # plotBestFit(gradAscent(dataMat, labelMat).getA())
    # useStocGradAscent1()
    multiTest()
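For reference, the update in gradAscent is plain gradient ascent on the log-likelihood of the logistic model; the attachment below works this out in full. Writing $h_i = \sigma(w^\top x_i)$ for the predicted probability of sample $i$:

$$\ell(w) = \sum_{i=1}^{m} \big[\, y_i \log h_i + (1 - y_i) \log(1 - h_i) \,\big], \qquad \nabla_w \ell(w) = \sum_{i=1}^{m} (y_i - h_i)\, x_i = X^\top (y - h),$$

so one ascent step with step size $\alpha$ is $w \leftarrow w + \alpha X^\top (y - h)$, which is exactly the line weights = weights + alpha * dataMatrix.transpose() * error.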
Attachment: http://files.cnblogs.com/files/yzwhykd/Logistic%E5%9B%9E%E5%BD%92%E6%80%BB%E7%BB%93.pdf
Original post: http://www.cnblogs.com/yzwhykd/p/6260350.html