Python的requests模块是个神器,这里用requests模块实现模拟登录:
# coding: utf-8
"""Log in to the USTC teaching system, download the grade page and print it.

The script targets Python 2 (it relies on ``raw_input`` and ``pytesser``).
Flow: fetch the login page, OCR the captcha image, post the credentials,
request the grade report, save the HTML, then parse and tabulate it.
"""
import getpass
import io
import sys
import requests
from bs4 import BeautifulSoup
import re
from pylsy import pylsytable
# Captcha recognition
import os
# NOTE(review): the working directory is switched before importing pytesser,
# presumably so that pytesser can locate its bundled tesseract files -- confirm.
# Side effect: relative paths used below ('c.png', 'test.txt') resolve here.
os.chdir(r"C:\Python27\Lib\site-packages")
from pytesser import *  # captcha OCR library; also provides ``Image``

login_url = 'http://mis.teach.ustc.edu.cn/userinit.do'
a_url = 'http://mis.teach.ustc.edu.cn/login.do'
pre_url = 'http://mis.teach.ustc.edu.cn/'
grades_url = 'http://mis.teach.ustc.edu.cn/querycjxx.do'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36',
    'Referer': 'http://mis.teach.ustc.edu.cn/userinit.do',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
}

# Form payloads for the POST requests.
pre_data = {'userbz': 's'}
login_data = {
    'userbz': 's',
    'hidjym': '',
}
# NOTE(review): 'chaxun' is already percent-encoded here; requests will
# percent-encode form values again -- confirm the server accepts this.
grades_data = {
    'xuenian': '',
    'chaxun': '+%B2%E9++%D1%AF+',
    'px': '1',
    'zd': '0',
}


def judging(name):
    """Return the text recognised by OCR in the captcha image at *name*.

    The image is converted to greyscale and binarised with a fixed threshold
    before being handed to ``image_to_string``.
    """
    divide = 125  # threshold, tuned empirically
    # Lookup table for Image.point: grey levels below the threshold map to 0,
    # all others to 1.
    lookup = [0 if level < divide else 1 for level in range(256)]
    image = Image.open(name)
    grey = image.convert('L')
    # Binarise using the threshold table (the original passed an undefined
    # name here instead of the table it had just built).
    image_text = grey.point(lookup, '1')
    # The recognition rate still needs improvement.
    # Surrounding whitespace/newlines in the OCR output are not part of the
    # captcha, so they are removed before the value is submitted.
    return image_to_string(image_text).strip()


def getGrades(filename):
    """Log in interactively and save the grade-report HTML to *filename*.

    Prompts for the user id and password, downloads the captcha to 'c.png',
    recognises it with :func:`judging`, posts the login form and finally
    writes the grade page as UTF-8 text.

    Raises:
        RuntimeError: if the login page contains no captcha image.
    """
    userid = raw_input("name:")
    password = getpass.getpass("password:")  # do not echo the password

    session = requests.Session()
    login_page = session.post(login_url, headers=headers, data=pre_data)
    soup = BeautifulSoup(login_page.text, "html.parser")
    captcha_tag = soup.find('img', id='random')
    if captcha_tag is None:
        raise RuntimeError("captcha image (img#random) not found on login page")
    img_src = pre_url + captcha_tag['src']

    captcha = session.get(img_src)
    with open('c.png', 'wb') as f:
        f.write(captcha.content)
    code = judging('c.png')

    # Build a per-call payload so credentials are not left in the
    # module-level ``login_data`` dict.
    payload = dict(login_data, userCode=userid, passWord=password, check=code)
    session.post(a_url, headers=headers, data=payload)
    grades = session.post(grades_url, headers=headers, data=grades_data)

    # Explicit UTF-8 output replaces the reload(sys)/setdefaultencoding hack.
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(grades.text)


def sousa(filename):
    """Parse the saved grade page and return three parallel lists.

    Returns:
        tuple: ``(courseName, courseGrades, courseGPA)`` taken from cells
        2, 4 and 6 of every ``<tr class="bg">`` row except the first
        (presumably the header row -- verify against the page).
    """
    # Read raw bytes and let BeautifulSoup detect the encoding.
    with open(filename, 'rb') as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    rows = soup.find_all('tr', class_='bg')
    courseName = []
    courseGrades = []
    courseGPA = []
    # Slicing (instead of ``del rows[0]``) also copes with a page that has
    # no matching rows, e.g. after a failed login.
    for row in rows[1:]:
        cells = row.find_all('td', class_='bg')
        courseName.append(cells[2].string)
        courseGrades.append(cells[4].string)
        courseGPA.append(cells[6].string)
    return (courseName, courseGrades, courseGPA)


def writeGrades(filename, source='test.txt'):
    """Write one "name grade gpa" line per course to *filename*.

    Args:
        filename: output text file (written as UTF-8).
        source: saved grade-page HTML to parse; defaults to the file the
            script's entry point downloads. (The original called ``sousa()``
            without any argument, which raised ``TypeError``.)
    """
    courseName, courseGrades, courseGPA = sousa(source)
    with io.open(filename, 'w', encoding='utf-8') as f:
        for name, grade, gpa in zip(courseName, courseGrades, courseGPA):
            f.write(u'%s %s %s \n' % (name, grade, gpa))


def main():
    """Download the grades to 'test.txt' and print them as a table."""
    getGrades('test.txt')
    courseName, courseGrades, courseGPA = sousa('test.txt')
    attributes = ['courseName', 'courseGrades', 'coursePoints']
    table = pylsytable(attributes)
    table.add_data('courseName', courseName)
    table.add_data('courseGrades', courseGrades)
    table.add_data('coursePoints', courseGPA)
    print(table)


if __name__ == '__main__':
    main()
利用requests.Session()保持会话(cookie),并构造POST请求;具体需要提交哪些表单字段,要根据目标网站具体分析。
图像处理用到了PIL和pytesser库。pytesser调用的tesseract是谷歌维护的开源OCR(光学字符识别)引擎,可用于数字、字母、汉字的识别(识别率还需要优化)。
相关主要代码:
def judging(name):
    """Binarise the captcha image at *name* and return the OCR result."""
    # Threshold lookup table: grey levels below 125 become 0, the rest 1.
    table = [0 if level < 125 else 1 for level in range(256)]
    image = Image.open(name)
    image2 = image.convert('L')  # greyscale first, then binarise
    image_text = image2.point(table, '1')
    return image_to_string(image_text)
原文:http://www.cnblogs.com/fuzzer/p/5222848.html