因为工作的关系,我写过许多个抓取网站信息的程序。
最简单的,只要用Python的urllib2.urlopen()函数就可以了;
然后,有个网站喜欢封人,所以,得找一批代理,轮流抓它的信息;
有的网站不允许程序抓取,所以,就得加入一些头信息;
有的网站需要登录,这时就要用到Cookies;
最后,为了提高效率,最好是使用多线程。(PS,有个地方要注意:urlopen 这个函数使用的是一个全局的 opener 对象(通过 urllib2.install_opener 设定),所有线程共用它。所以如果你使用了多个线程,每个线程使用一个不同的代理,那么就不能使用 urlopen 这个函数,而应该为每个线程用 build_opener 创建各自的 opener,并调用 opener.open)
下面是我用Python写的一个抓代理的脚本,虽然现在已经不在教育网内部了,不过有时候还是需要用一下代理的:)
# -*- coding: cp936 -*-
import urllib2,re,thread,time
import socket
# Give every socket created from here on a 10-second default timeout, so
# requests that do not pass their own timeout cannot block forever.
socket.setdefaulttimeout(10)
#-----------------------定义抓取代理的函数-------------------------------#
def getcnproxy(name):
    """Scrape the paginated HTTP proxy list on www.proxycn.com.

    Pages are fetched in order starting at 1.  A failed page is retried;
    scraping stops once the end-of-list marker has been seen or after
    seven failures in total.

    Args:
        name: label printed in front of every proxy found, so that output
            from concurrently running scrapers can be told apart.

    Returns:
        The list of distinct "host:port" strings found.  The same list is
        also stored in the module-level ``proxylist[0]`` so the main
        thread can collect it.
    """
    result = []
    seen = set()  # a retried page must not add the same proxy twice
    pagenum = 1
    trycount = 0
    getallpages = False
    while not getallpages and trycount <= 6:
        url = 'http://www.proxycn.com/html_proxy/http-' + str(pagenum) + '.html'
        try:
            html = urllib2.urlopen(url)
            try:
                for line in html:
                    if '''onDblClick="clip''' in line:
                        # The proxy is the argument of clip('host:port').
                        start = line.find("clip('")
                        end = line.find("')", start + 6)
                        if start >= 0 and end >= 0:
                            proxy = line[start + 6:end]
                            if proxy not in seen:
                                seen.add(proxy)
                                # Serialise printing across scraper threads.
                                lock.acquire()
                                try:
                                    print('%s %s' % (name, proxy))
                                finally:
                                    lock.release()
                                result.append(proxy)
                    # "next page | last page" as plain text; presumably
                    # only the final page shows it unlinked - TODO confirm.
                    if '下一页|尾页' in line:
                        getallpages = True
            finally:
                html.close()
        except Exception:
            # Network or HTTP failure: count it and retry the same page.
            trycount = trycount + 1
        else:
            pagenum = pagenum + 1
    proxylist[0] = result
    return result
def getproxycn(name):
    """Scrape pages 1-10 of the proxy list on www.cnproxy.com.

    A failed page is retried; scraping stops after page 10 or after
    three failures in total.

    Args:
        name: label printed in front of every proxy found, so that output
            from concurrently running scrapers can be told apart.

    Returns:
        The list of distinct proxy strings found.  The same list is also
        stored in the module-level ``proxylist[1]`` so the main thread
        can collect it.
    """
    result = []
    seen = set()  # a retried page must not add the same proxy twice
    pagenum = 1
    trycount = 0
    while pagenum <= 10 and trycount <= 2:
        url = 'http://www.cnproxy.com/proxy' + str(pagenum) + '.html'
        try:
            html = urllib2.urlopen(url)
            try:
                for line in html:
                    if "HTTP" not in line:
                        continue
                    # Host sits between '<td>' and the '&#820' entity, the
                    # port between ':' and the end of the table cell.
                    # NOTE(review): '&#820' was restored from a character
                    # garbled by HTML rendering - confirm against the page.
                    host_start = line.find('<td>')
                    host_end = line.find('&#820')
                    port_start = line.find(':')
                    port_end = line.find('</td><td>')
                    if min(host_start, host_end, port_start, port_end) < 0:
                        # Not a proxy row; slicing with -1 would give junk.
                        continue
                    proxy = line[host_start + 4:host_end] + line[port_start:port_end]
                    if proxy in seen:
                        continue
                    seen.add(proxy)
                    # Serialise printing across scraper threads.
                    lock.acquire()
                    try:
                        print('%s %s' % (name, proxy))
                    finally:
                        lock.release()
                    result.append(proxy)
            finally:
                html.close()
        except Exception:
            # Network or HTTP failure: count it and retry the same page.
            trycount = trycount + 1
        else:
            pagenum = pagenum + 1
    proxylist[1] = result
    return result
#------------------------- --------------- 结束代理抓取函数定义 --------------------------------------------------#
#------------------------------------------ 验证代理的函数定义 ---------------------------------------------------#
def proxycheckone(proxy):
    """Check whether one proxy can fetch the test page.

    The page is requested through the proxy with a private opener, so
    concurrent checks never share proxy settings.  Up to two attempts
    are made, pausing three seconds between them.

    Args:
        proxy: proxy address as "host:port".

    Returns:
        ``proxy + '$' + attempts + '#' + seconds`` when the expected page
        came back, where ``attempts`` is the attempt number that
        succeeded and ``seconds`` the time that attempt took.  An empty
        list when the proxy answered with a different page or every
        attempt failed.
    """
    url = 'http://www.facebook.com'
    proxy_url = 'http://' + proxy
    proxy_support = urllib2.ProxyHandler({'http': proxy_url})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    request = urllib2.Request(url)
    # Browser-like headers; these avoid a 403 from the server.
    request.add_header("Accept-Language", "zh-cn")
    request.add_header("Content-Type", "text/html; charset=gb2312")
    request.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.1.4322)")
    max_tries = 2
    for trycount in range(1, max_tries + 1):
        try:
            started = time.time()
            response = opener.open(request)
            try:
                data = response.read()
            finally:
                response.close()
        except Exception:
            # Only pause when another attempt will actually follow.
            if trycount < max_tries:
                time.sleep(3)
            continue
        if 'Welcome to Facebook!' not in data:
            # The proxy answered, but not with the page that was asked for.
            return []
        elapsed = time.time() - started
        return proxy + '$' + str(trycount) + '#' + str(elapsed)
    return []
def proxycheck(idnum):
    """Worker loop: validate proxies from the shared queue until it is empty.

    Proxies are taken one at a time from the front of the module-level
    ``proxylist`` (guarded by lock ``r``) and checked with
    ``proxycheckone``.  Every non-empty result is appended to the
    module-level ``y`` (guarded by lock ``a``).

    Args:
        idnum: index of this worker's slot in the module-level ``x``.
            The slot is set to 1 when the worker stops, which is how the
            main thread learns that it has finished.
    """
    try:
        while 1:
            r.acquire()
            try:
                if not proxylist:
                    break
                proxy = proxylist.pop(0)
            finally:
                r.release()
            checked = proxycheckone(proxy)
            if len(checked) > 0:
                a.acquire()
                try:
                    y.append(checked)
                finally:
                    a.release()
    finally:
        # Always report completion, even after an unexpected error,
        # otherwise the main thread would wait for this slot forever.
        x[idnum] = 1
#---------------------------------------- 验证代理的函数定义结束 -------------------------------------------------#
#----------------------------- 抓取代理,抓取到的代理放在proxies.txt中,以\n分隔 --------------------------------#
# Stage 1: scrape both proxy sites in parallel and append the results to
# proxies.txt, one proxy per line.
# To skip this stage, delete the '#' in front of "x=" on the next line:
# the whole stage then becomes an unused string literal.
#x='''
lock = thread.allocate_lock()
# One slot per scraper.  None means "still running"; an empty list is a
# valid result and must not be mistaken for an unfinished scraper.
proxylist = [None, None]
thread.start_new(getcnproxy, ('cnproxy',))
thread.start_new(getproxycn, ('proxycn',))
while None in proxylist:
    time.sleep(30)
proxylist = proxylist[0] + proxylist[1]
w = open('proxies.txt', 'a')
try:
    if proxylist:
        # The trailing newline keeps the next append from fusing its
        # first entry onto the last line written here.
        w.write('\n'.join(proxylist) + '\n')
finally:
    w.close()
del proxylist
print('get all proxies!\n\n')
#'''
#----------------------------- 抓取代理完毕,抓取到的代理放在proxies.txt中,以\n分隔 -------------------------------#
#--------------------------------------------------- 验证代理 -----------------------------------------------------#
# Stage 2: validate every proxy listed in proxies.txt with a pool of
# worker threads, then overwrite the file with the working ones.
w = open('proxies.txt')
try:
    raw_lines = w.read().split('\n')
finally:
    w.close()
# Keep only the address part of each line.  Lines written by an earlier
# validation run look like "host:port$tries#seconds"; anything after a
# tab is dropped as well.  The set removes duplicates.
proxylist = list(set(
    [entry.split('\t')[0].split('$')[0].strip() for entry in raw_lines]))
proxylist = [entry for entry in proxylist if entry]

lock = thread.allocate_lock()
r = thread.allocate_lock()  # guards proxylist, the queue of pending proxies
a = thread.allocate_lock()  # guards y, the list of validated results
y = []
WORKER_COUNT = 120
x = [0] * WORKER_COUNT      # x[i] becomes 1 once worker i has finished
for idnum in range(0, WORKER_COUNT):
    thread.start_new(proxycheck, (idnum,))
while 0 in x:
    # Progress: proxies still queued, workers finished, proxies validated.
    print('%s %s left %s' % (len(proxylist), sum(x), len(y)))
    time.sleep(10)

w = open('proxies.txt', 'w')
try:
    # One result per line; nothing at all is written when none passed.
    w.write(''.join([entry + '\n' for entry in y]))
finally:
    w.close()
#-------------------------------------------------- 验证代理完毕 --------------------------------------------------#
原文:http://my.oschina.net/netmouse/blog/477642