
Batch-fetching IP proxies with a Python crawler

Posted: 2019-03-14 20:22:55

When scraping data with a crawler, it is common to rotate through multiple IP proxies so that no single IP hits the target site so often that it gets banned.
IP proxies can be obtained from this site: http://www.xicidaili.com/nn/.
The Python program below therefore fetches proxies from that page, tests them, and saves the working ones to a local file.
Python version: 3.6.3
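
For context, the liveness check in the script below hinges on the proxies argument of requests.get, which maps a URL scheme to a proxy URL. Here is a minimal sketch of that mechanism on its own; the placeholder proxy address and the httpbin.org/ip test endpoint are assumptions purely for illustration:

import requests

# Hypothetical proxy address, purely for illustration; replace with a real one.
proxy = "http://1.2.3.4:9999"

# requests uses the proxy whose key matches the scheme of the requested URL,
# so registering both schemes routes any request through the proxy.
proxies = {"http": proxy, "https": proxy}

try:
    # httpbin.org/ip echoes the IP the request arrived from,
    # which should be the proxy's address if the proxy works.
    r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3)
    print(r.status_code, r.text)
except requests.RequestException as e:
    print("proxy failed:", e)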

# grab ip proxies from xicidaili
import sys, time, requests
from multiprocessing.dummy import Pool as ThreadPool
from lxml import etree

IP_POOL = 'ip_pool.py'                 # default output file
URL = 'http://www.xicidaili.com/nn/'   # high-anonymity proxies
#URL = 'http://www.xicidaili.com/wt/'  # plain http proxies
RUN_TIME = time.strftime("%Y-%m-%d %H:%M", time.localtime())  # run timestamp

# working proxies, keyed by scheme
alive_ip = {'http': [], 'https': []}
# thread pool for the liveness checks
pool = ThreadPool(20)

# fetch a page and return its html text
def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Referer": "https://www.xicidaili.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    r = requests.get(url, headers=headers)
    r.encoding = 'utf-8'
    return r.text

# check whether a proxy is alive
def test_alive(proxy):
    global alive_ip
    # register the proxy for both schemes so it is actually used for the https test URL
    proxies = {'http': proxy, 'https': proxy}
    try:
        r = requests.get('https://www.baidu.com', proxies=proxies, timeout=3)
        if r.status_code == 200:
            if proxy.startswith('https'):
                alive_ip['https'].append(proxy)
            else:
                alive_ip['http'].append(proxy)
    except requests.RequestException:
        print("%s is invalid!" % proxy)

# parse the html and collect candidate proxies
def get_alive_ip_address():
    iplist = []
    html = get_html(URL)
    selector = etree.HTML(html)
    table = selector.xpath('//table[@id="ip_list"]')[0]
    lines = table.xpath('./tr')[1:]
    for line in lines:
        speed, connect_time = line.xpath('.//div/@title')
        data = line.xpath('./td')
        ip = data[1].xpath('./text()')[0]
        port = data[2].xpath('./text()')[0]
        anonymous = data[4].xpath('./text()')[0]
        ip_type = data[5].xpath('./text()')[0]
        # skip slow proxies and proxies that are not high-anonymity ("高匿")
        if float(speed[:-1]) > 1 or float(connect_time[:-1]) > 1 or anonymous != '高匿':
            continue
        iplist.append(ip_type.lower() + '://' + ip + ':' + port)
    pool.map(test_alive, iplist)

# write the working proxies to a local file
def write_txt(output_file):
    with open(output_file, 'w') as f:
        f.write('#create time: %s\n\n' % RUN_TIME)
        f.write('http_ip_pool = \\\n')
        f.write(str(alive_ip['http']).replace(',', ',\n'))
        f.write('\n\n')
    with open(output_file, 'a') as f:
        f.write('https_ip_pool = \\\n')
        f.write(str(alive_ip['https']).replace(',', ',\n'))
    print('write successful: %s' % output_file)

def main():
    get_alive_ip_address()
    write_txt(output_file)

if __name__ == '__main__':
    try:
        output_file = sys.argv[1]  # first argument is the output file name
    except IndexError:
        output_file = IP_POOL
    main()

Run the program:

root@c:test$ python get_ip_proxies.py
write successful: ip_pool.py
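
Since the script reads an optional first argument (sys.argv[1]) as the output file name, you can also write the pool to a file of your choosing; my_pool.py below is just a hypothetical name:

root@c:test$ python get_ip_proxies.py my_pool.py
write successful: my_pool.py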

View the file:

root@c:test$ vim ip_pool.py
#create time: 2019-03-14 19:53

http_ip_pool = \
['http://183.148.152.1:9999',
 'http://112.85.165.234:9999',
 'http://112.87.69.162:9999',
 'http://111.77.197.10:9999',
 'http://113.64.94.80:8118',
 'http://61.184.109.33:61320',
 'http://125.126.204.82:9999',
 'http://125.126.218.8:9999',
 'http://36.26.224.56:9999',
 'http://123.162.168.192:40274',
 'http://116.209.54.125:9999',
 'http://183.148.148.211:9999',
 'http://111.177.161.111:9999',
 'http://116.209.58.245:9999',
 'http://183.148.143.38:9999',
 'http://116.209.55.218:9999',
 'http://114.239.250.15:9999',
 'http://116.209.54.109:9999',
 'http://125.123.143.98:9999',
 'http://183.6.130.6:8118',
 'http://183.148.143.166:9999',
 'http://125.126.203.228:9999',
 'http://111.79.198.74:9999',
 'http://116.209.53.215:9999',
 'http://112.87.69.124:9999',
 'http://112.80.198.13:8123',
 'http://182.88.160.16:8123',
 'http://116.209.56.24:9999',
 'http://112.85.131.25:9999',
 'http://116.209.52.234:9999',
 'http://175.165.128.223:1133',
 'http://122.4.47.199:8010',
 'http://112.85.170.204:9999',
 'http://49.86.178.206:9999',
 'http://125.126.215.187:9999']

https_ip_pool = \
['https://183.148.156.98:9999',
 'https://111.79.199.167:808',
 'https://61.142.72.150:39894',
 'https://119.254.94.71:42788',
 'https://221.218.102.146:33323',
 'https://122.193.246.29:9999',
 'https://183.148.139.173:9999',
 'https://60.184.194.157:3128',
 'https://118.89.138.129:52699',
 'https://112.87.71.67:9999',
 'https://58.56.108.226:43296',
 'https://182.207.232.135:50465',
 'https://111.177.186.32:9999',
 'https://58.210.133.98:32741',
 'https://115.221.116.71:9999',
 'https://183.148.140.191:9999',
 'https://183.148.130.143:9999',
 'https://116.209.54.84:9999',
 'https://125.126.219.125:9999',
 'https://112.85.167.158:9999',
 'https://112.85.173.76:9999',
 'https://60.173.244.133:41306',
 'https://183.148.147.223:9999',
 'https://116.209.53.68:9999',
 'https://111.79.198.102:9999',
 'https://123.188.5.11:1133',
 'https://60.190.66.131:56882',
 'https://112.85.168.140:9999',
 'https://110.250.65.108:8118',
 'https://221.208.39.160:8118',
 'https://116.209.53.77:9999',
 'https://116.209.58.29:9999',
 'https://183.148.141.129:9999',
 'https://124.89.33.59:53281',
 'https://116.209.57.149:9999',
 'https://58.62.238.150:32431',
 'https://218.76.253.201:61408']

After that, the pools can be imported and used directly:

from ip_pool import http_ip_pool, https_ip_pool
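
For example, a minimal sketch of rotating through the saved pool; the httpbin.org/ip test URL is just an assumed endpoint, and random.choice is one simple way to pick a proxy per request:

import random
import requests
from ip_pool import http_ip_pool, https_ip_pool

# pick a random proxy from the saved pool for this request
proxy = random.choice(http_ip_pool)

try:
    # assumed test URL; it simply echoes the IP the request arrived from
    r = requests.get('http://httpbin.org/ip',
                     proxies={'http': proxy, 'https': proxy},
                     timeout=3)
    print(r.text)
except requests.RequestException:
    print('%s no longer works, pick another proxy' % proxy)

Note that free proxies go stale quickly, so it is worth regenerating the pool regularly.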

 


Original article: https://www.cnblogs.com/albireo/p/10533079.html
