在编写微博爬虫的过程中,免不了要进行模拟登录,因为新浪微博不登录只能访问少量的微博信息。
然而,由于新浪微博的反爬虫功能在不断更新,例如改变了密码的加密算法(RSA),以前的一些模拟登录方式已经不适用了。所以一开始试了好几种方法,均不能成功。后来受 http://www.jb51.net/article/46053.htm 启发,终于成功实现了模拟登录。
目前,亲测可用的步骤是:①预登录:使用 GET 方法获得登录所需的 servertime、nonce、pubkey、rsakv;②使用 Base64 编码用户名,使用 RSA 算法加密密码;③登录。
步骤一:
response格式为(换行是我自己加上去的):
sinaSSOController.preloginCallBack({"retcode":0,
"servertime":1430743809,
"pcid":"gz-9e4ab7356f6545655af2a97e1cad7aa86446",
"nonce":"VA2IUS",
"pubkey":"EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443",
"rsakv":"1330428213",
"exectime":3})
python代码为:
import json  # imported here so the snippet is self-contained

# Python 2 snippet: ask the SSO prelogin endpoint for the values needed to
# log in (servertime, nonce, pubkey, rsakv). `username`, `urllib2` and `re`
# are expected to be in scope already.
prelogin_url = (
    'http://login.sina.com.cn/sso/prelogin.php?entry=sso'
    '&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod'
    '&client=ssologin.js(v1.4.4)'
) % username
response = urllib2.urlopen(prelogin_url)
p = re.compile(r'\((.*?)\)')  # captures the JSON text inside the callback's parentheses
strurl = p.search(response.read()).group(1)
# Parse the payload with json.loads instead of eval(): the text comes from
# the network and must never be executed as Python code.
dic = json.loads(strurl)
# Index directly so that a response lacking a field fails right here
# (KeyError) instead of silently producing the string 'None'.
pubkey = str(dic['pubkey'])
servertime = str(dic['servertime'])
nonce = str(dic['nonce'])
rsakv = str(dic['rsakv'])
步骤二:
编码用户名(先做 URL 编码,再做 Base64 编码):
# Python 2 snippet: URL-quote the user name, then Base64-encode it.
username_ = urllib.quote(username)
# b64encode emits a single line. encodestring() inserts a newline after
# every 76 output characters, so stripping only the last character would
# leave embedded newlines in the value for long user names.
username = base64.b64encode(username_)
加密密码:
# Python 2 snippet: RSA-encrypt the password with the values returned by the
# prelogin step (`pubkey`, `servertime`, `nonce`); `rsa` is the third-party
# python-rsa package.
rsaPublickey = int(pubkey, 16)  # pubkey is parsed as a hexadecimal number
key = rsa.PublicKey(rsaPublickey, 65537)  # build the public key
# Plaintext layout, taken (per the author) from Sina's JS encryption script.
message = servertime + '\t' + nonce + '\n' + password
passwd = rsa.encrypt(message, key)  # encrypt
passwd = binascii.b2a_hex(passwd)  # convert the ciphertext to hexadecimal
完整代码:
#! /usr/bin/env python
#coding=utf8
"""Simulated login to Sina Weibo through its SSO endpoints (Python 2).

Steps: fetch servertime/nonce/pubkey/rsakv from the prelogin endpoint,
encode the user name, RSA-encrypt the password, POST the login form and
open the redirect URL found in the response.
"""

import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
import rsa
import binascii

# Install a cookie-aware opener as urllib2's default opener, so every
# urllib2.urlopen() call below goes through the same cookie jar.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# Template of the login form. The empty fields ('su', 'servertime', 'nonce',
# 'sp', 'rsakv') are filled in per login attempt on a copy of this dict.
postdata = {
    'entry': 'weibo',
    'gateway': '1',
    'from': '',
    'savestate': '7',
    'userticket': '1',
    'ssosimplelogin': '1',
    'vsnf': '1',
    'vsnval': '',
    'su': '',
    'service': 'miniblog',
    'servertime': '',
    'nonce': '',
    'pwencode': 'rsa2',  # encryption algorithm
    'sp': '',
    'encoding': 'UTF-8',
    'prelt': '401',
    'rsakv': '',
    'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
    'returntype': 'META'
}


class WeiboLogin:
    """Log a user in to Sina Weibo using the module-level cookie opener."""

    def __init__(self, username, password):
        """Store the plain-text credentials used by login()."""
        self.username = username
        self.password = password

    def __get_spwd(self):
        """Return the hex-encoded, RSA-encrypted password (the 'sp' field).

        Requires __prelogin() to have set pubkey, servertime and nonce.
        """
        rsaPublickey = int(self.pubkey, 16)
        key = rsa.PublicKey(rsaPublickey, 65537)  # build the public key
        # Plaintext layout, taken (per the author) from Sina's JS
        # encryption script.
        message = self.servertime + '\t' + self.nonce + '\n' + self.password
        passwd = rsa.encrypt(message, key)  # encrypt
        return binascii.b2a_hex(passwd)  # convert the ciphertext to hexadecimal

    def __get_suser(self):
        """Return the URL-quoted, Base64-encoded user name (the 'su' field)."""
        username_ = urllib.quote(self.username)
        # b64encode emits a single line; encodestring()[:-1] would leave
        # embedded newlines in the value for long user names.
        return base64.b64encode(username_)

    def __prelogin(self):
        """Fetch pubkey, servertime, nonce and rsakv from the prelogin endpoint.

        Raises whatever urllib2 raises on a network failure, AttributeError
        if the response is not wrapped in parentheses, ValueError if the
        payload is not valid JSON, and KeyError if a field is missing.
        """
        prelogin_url = (
            'http://login.sina.com.cn/sso/prelogin.php?entry=sso'
            '&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod'
            '&client=ssologin.js(v1.4.4)'
        ) % self.username
        response = urllib2.urlopen(prelogin_url)
        try:
            body = response.read()
        finally:
            response.close()
        p = re.compile(r'\((.*?)\)')  # JSON text inside the callback's parentheses
        strurl = p.search(body).group(1)
        # json.loads instead of eval(): never execute text received from
        # the network as Python code.
        dic = json.loads(strurl)
        # Index directly so a missing field is reported here, inside the
        # caller's try block, rather than becoming the string 'None'.
        self.pubkey = str(dic['pubkey'])
        self.servertime = str(dic['servertime'])
        self.nonce = str(dic['nonce'])
        self.rsakv = str(dic['rsakv'])

    def login(self):
        """Run prelogin, POST the login form and open the redirect URL.

        Prints 'Prelogin Error', 'Login Succeed!' or 'Login Error!' and
        returns None in every case.
        """
        url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
        try:
            self.__prelogin()  # prelogin
        except Exception:
            print('Prelogin Error')
            return
        # Fill in a copy: rebinding the module-level `postdata` to the
        # urlencoded string would break every later login() call.
        form = dict(postdata)
        form['servertime'] = self.servertime
        form['nonce'] = self.nonce
        form['su'] = self.__get_suser()
        form['sp'] = self.__get_spwd()
        form['rsakv'] = self.rsakv
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:37.0) Gecko/20100101 Firefox/37.0'}
        req = urllib2.Request(
            url=url,
            data=urllib.urlencode(form),
            headers=headers
        )
        result = urllib2.urlopen(req)
        text = result.read()
        p = re.compile(r"location\.replace\('(.*?)'\)")
        try:
            login_url = p.search(text).group(1)
            urllib2.urlopen(login_url)
            # NOTE(review): success is reported as soon as the redirect URL
            # opens; the response is not inspected for a result code.
            print("Login Succeed!")
        except Exception:
            print('Login Error!')


if __name__ == '__main__':
    uid = 'your username'
    psw = 'your password'
    simLogin = WeiboLogin(uid, psw)
    simLogin.login()
原文:http://www.cnblogs.com/xulf/p/4477691.html