
Scraping Jandan's XXOO girl-pic pages

Posted: 2018-05-15 00:52:21

Today, while going back over Liao's multithreading tutorial, I noticed that someone in the comments had written a multithreaded crawler. I clicked through and took a look: the analysis was very thorough, close to 200 lines of code.

Then I studied the site myself and thought, emmmm, selenium + PhantomJS should handle this directly, so I wrote some code.

Only to find that, wow, selenium no longer supports PhantomJS, since Chrome and Firefox now ship with their own headless modes. So I dug through various blogs, and in the end scraped the site with this:

import os
import unittest
import requests
from random import randint
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class ooxx_spider(unittest.TestCase):

    def setUp(self):
        # Headless Chrome stands in for the dropped PhantomJS support
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome('E:/chromedriver.exe',
                                       chrome_options=chrome_options)
        os.makedirs('picture', exist_ok=True)  # output dir for the images

    def test_spider(self):
        for i in range(1, 80):
            url = 'http://jandan.net/ooxx/page-' + str(i)
            self.driver.get(url)
            print(url)
            # the comment list holds one <img> per posted picture
            elems = self.driver.find_elements_by_xpath(
                '//*[@class="commentlist"]/li/div/div/div/p/img')
            for elem in elems:
                self.save_img(elem.get_attribute('src'))
            print('Page {} scraped successfully'.format(i))

    def save_img(self, res):
        suffix = res.split('.')[-1]
        # two random ints as the file name; collisions are possible but rare
        destination = ('picture/' + str(randint(1, 1000))
                       + str(randint(1, 1000)) + '.' + suffix)
        r = requests.get(res)
        with open(destination, 'wb') as f:
            f.write(r.content)

    def tearDown(self):
        self.driver.quit()  # quit() also shuts down the chromedriver process


if __name__ == '__main__':
    unittest.main()
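
(For anyone running a newer Selenium: the 4.x releases removed the find_elements_by_* helpers and the chrome_options= keyword used above. A minimal sketch of the same page fetch against the Selenium 4 API, assuming Selenium 4.6+ so that Selenium Manager locates chromedriver automatically:)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')      # Chrome's newer headless flag
driver = webdriver.Chrome(options=options)  # no executable_path needed on 4.6+
driver.get('http://jandan.net/ooxx/page-1')
# same XPath as above, via the By locator API
imgs = driver.find_elements(
    By.XPATH, '//*[@class="commentlist"]/li/div/div/div/p/img')
print([img.get_attribute('src') for img in imgs])
driver.quit()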

I'll add the multithreaded code tomorrow.
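
(The multithreaded follow-up never appears in this post. As a rough sketch of one direction it could take, an assumption on my part rather than the author's promised code: keep the single headless driver for collecting the src URLs, and hand only the downloads to a thread pool, since the requests.get calls are independent and I/O-bound:)

import os
from concurrent.futures import ThreadPoolExecutor
import requests

def download(url, index):
    # deterministic names avoid the random-name collisions in save_img above
    suffix = url.split('.')[-1]
    r = requests.get(url, timeout=10)
    with open('picture/{}.{}'.format(index, suffix), 'wb') as f:
        f.write(r.content)

def download_all(urls):
    os.makedirs('picture', exist_ok=True)
    with ThreadPoolExecutor(max_workers=8) as pool:
        for i, url in enumerate(urls):
            pool.submit(download, url, i)
    # leaving the with-block waits for every queued download to finish

Calling download_all on the list of src attributes gathered by the driver would then replace the per-image save_img calls.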


Original post: https://www.cnblogs.com/ducklu/p/9038770.html
