from urllib import request, error, response, parse from bs4 import * import http.cookiejar import html,json import hashlib def mydecode(data): ‘‘‘将二进制数据转码成字符 ,万一出错 返回None‘‘‘ types = [‘utf-8‘,‘gb2312‘,‘gbk‘,‘iso-8859-1‘] #可以添加其他字符编码 for type in types: try: return data.decode(type) except: pass return None def openRequest(url, method=‘GET‘, data=None): global cj, opener, init if not init: init = True cj = http.cookiejar.CookieJar() opener = request.build_opener(request.HTTPCookieProcessor(cj)) req = request.Request(url) req.add_header(‘User-agent‘, ‘Mozilla/27.0‘) request.install_opener(opener) if data: return request.urlopen(req,data) else: return request.urlopen(req) def GET(url): ‘‘‘get请求页面 并返回已经解码的url对应的页面‘‘‘ r = openRequest(url) return mydecode(r.read()) def POST(url, data, head=[]): ‘‘‘post数据 并返回已经解码的url对应的页面‘‘‘ r = request.Request(url) for k, v in head: r.add_header(k, v) resp = openRequest(url, ‘POST‘, data) return mydecode(resp.read()) def out(data): ‘‘‘输出调试信息‘‘‘ if debug: print(‘-‘*20) print(data) def login(username, password, school): ‘‘‘登录获取session以发送数据,并得到课程的结果页‘‘‘ url = ‘http://passport2.chaoxing.com/login‘ data = { ‘pid‘: -1, ‘pidName‘: ‘‘, ‘fid‘: str(school), ‘fidName‘: ‘‘, ‘uname‘: str(username), ‘password‘: str(password) } h = [("Content-Type", "application/x-www-form-urlencoded;charset=utf-8")] data = parse.urlencode(data).encode(‘utf-8‘) return POST(url, data, head=h) def urltoMap(url): #url = input(‘please input:‘) p = parse.urlparse(url).query map = {} for key_value in p.split(‘&‘): k,v = key_value.split(‘=‘) map[k] = v return map def getChapterList(doc): ‘‘‘获取一门课程对应的所有章节列表‘‘‘ soup = BeautifulSoup(doc) l = [] for h3 in soup.find_all(‘h3‘, class_=‘clearfix‘): url = h3.a[‘href‘] name = h3.find(‘span‘, class_=‘articlename‘).a[‘title‘] passed = ‘100%‘ in h3.a.span.text l.append( (url, name, passed)) return l def getCourseList(doc): ‘‘‘获取用户选择的课程列表‘‘‘ soup = BeautifulSoup(doc) childsrc = soup.find(‘iframe‘)[‘src‘] #因为课程在iframe里面 显示的 childsrc = ‘http://ptr.chaoxing.com/visit/courses?template=1‘ #debug doc = GET(childsrc) soup = BeautifulSoup(doc) courses = [] for div in soup.find_all(‘div‘, class_=‘Mconright‘): url = div.h3.a[‘href‘] url = parse.urljoin(childsrc, url) name = div.h3.a.string courses.append(url) return courses def passAChapter(url): ‘‘‘跳过一个章节‘‘‘ map = urltoMap(url) url = ‘http://mooc.chaoxing.com/knowledge/cards‘ data = { ‘clazzid‘: map[‘classId‘], ‘courseid‘: map[‘courseId‘], ‘knowledgeid‘: map[‘chapterId‘], ‘num‘: 0, ‘v‘: ‘20140815‘, } url += "?"+parse.urlencode(data) doc = GET(url) #在页面的js中抽取有用的信息 soup = BeautifulSoup(doc) script = str(soup.find_all(‘script‘)[4]) start = script.find(‘try{‘) end = script.find(‘};‘) substr = script[start+13:end+1] d = jsonTodict( substr) objid = d[‘attachments‘][0][‘objectId‘] t = getDuration(objid) s1 = ‘>.MY[Or/s<?OJC]‘ s2 = str((t-1)*1000) m = hashlib.md5() m.update((s1+s2).encode(‘utf-8‘)) enc = m.hexdigest() data = { ‘clazzId‘: d[‘defaults‘][‘clazzId‘], ‘jobid‘: d[‘attachments‘][0][‘jobid‘], ‘objectId‘: objid, ‘otherInfo‘: d[‘attachments‘][0][‘otherInfo‘], ‘rt‘: 0.9, ‘dtype‘: ‘Video‘, ‘enc‘: enc, # md5(solt+time*1000) ‘clipTime‘: (‘0_%d‘ %t), #0_maxsec ‘duration‘: t, #sec ‘playingTime‘: t-1, #secnow ‘isdrag‘: 3, } p = parse.urlencode(data) url = ‘http://ptr.chaoxing.com/multimedia/log?‘+p doc = GET(url) return ‘true‘ in doc def getDuration(objid): ‘‘‘获取objid对应的视频有多少秒 返回int‘‘‘ url = "http://ptr.chaoxing.com/ananas/status/"+str(objid) doc = GET(url) i1 = doc.find(‘"duration"‘) i1 = doc.find(‘:‘, i1) i2 = doc.find(‘,‘, i1) t = int(doc[i1+1:i2]) return t def jsonTodict(jsontext): d=json.JSONDecoder().decode(jsontext) return d def main(): user = ‘账号‘ pswd = ‘密码‘ school = ‘首页获取的学校id‘ doc = login(user, pswd, school) courselist = getCourseList(doc) allclassurl = [] #保存所有的小节 for course in courselist: #遍历每一个课程 doc = GET(course) chptlist = getChapterList(doc) allclassurl+=chptlist print(‘get classurl finished‘) for url,name,passed in allclassurl: if not passed: #如果没有完成 passed = passAChapter(url) print( ‘%s:\t\t%s‘ %(name, passed)) if __name__=="__main__": global debug, init debug=True init = False main()
原文:http://my.oschina.net/cuilili/blog/382278