```python
# middlewares.py
from scrapy import signals
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time


class ChromedriverMiddleware(object):
    def process_request(self, request, spider):
        # the start page downloads normally; everything else is rendered in the browser
        if request.url == 'https://www.aqistudy.cn/historydata/':
            return None
        chrome_options = Options()
        chrome_options.add_argument('--headless')      # run headless Chrome
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        # point Selenium at the chromedriver binary
        driver = webdriver.Chrome(chrome_options=chrome_options,
                                  executable_path='/root/zx/spider/driver/chromedriver')
        driver.get(request.url)
        time.sleep(1)  # give the page time to render
        html = driver.page_source
        driver.quit()
        return scrapy.http.HtmlResponse(url=request.url, body=html.encode('utf-8'),
                                        encoding='utf-8', request=request)
```

```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'driver.middlewares.ChromedriverMiddleware': 543,
}
```
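Starting and quitting a browser for every request is expensive. A common refinement, sketched below under the same chromedriver path as above, is to open one driver per spider run and close it from a `spider_closed` signal handler (the class name `SharedChromedriverMiddleware` is just illustrative):

```python
# middlewares.py -- sketch: reuse one headless browser for the whole crawl
import time

import scrapy
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SharedChromedriverMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # quit the browser when the spider finishes
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        self.driver = webdriver.Chrome(chrome_options=chrome_options,
                                       executable_path='/root/zx/spider/driver/chromedriver')

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(1)  # give the page time to render
        return scrapy.http.HtmlResponse(url=request.url,
                                        body=self.driver.page_source.encode('utf-8'),
                                        encoding='utf-8', request=request)

    def spider_closed(self, spider):
        self.driver.quit()
```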
```python
# middlewares.py
import base64
import random

from scrapy.utils.python import to_bytes

PROXIES = [
    {'ip_port': '111.11.228.75:80', 'user_pass': 'root@123456'},
    {'ip_port': '120.198.243.22:80', 'user_pass': 'root@123456'},
    {'ip_port': '111.8.60.9:8123', 'user_pass': 'root@123456'},
    {'ip_port': '101.71.27.120:80', 'user_pass': 'root@123456'},
    {'ip_port': '122.96.59.104:80', 'user_pass': 'root@123456'},
    {'ip_port': '122.224.249.122:8088', 'user_pass': 'root@123456'},
]


def _attach_proxy(request):
    """Pick a random proxy and set it (plus Basic auth) on the request."""
    proxy = random.choice(PROXIES)
    request.meta['proxy'] = "http://%s" % proxy['ip_port']
    if proxy['user_pass'] is not None:
        encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
        # b64encode returns bytes, so the 'Basic ' prefix must be bytes too
        request.headers['Proxy-Authorization'] = b'Basic ' + encoded_user_pass


class ProxyMiddleware(object):
    # Strategy 1: route every request through a proxy.
    def process_request(self, request, spider):
        _attach_proxy(request)

    # Strategy 2: switch to a proxy only after a request fails.
    # In practice you would keep only one of the two strategies.
    def process_exception(self, request, exception, spider):
        _attach_proxy(request)
        # returning the request tells Scrapy to re-schedule it
        return request
```

```python
# settings.py
DOWNLOADER_MIDDLEWARES = {
    'driver.middlewares.ProxyMiddleware': 543,
    # take RetryMiddleware out of Scrapy's automatic retry chain so the
    # proxy is applied (via process_exception) only when a request errors
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}
# disable retries
RETRY_ENABLED = False
# download timeout
DOWNLOAD_TIMEOUT = 10
# number of download retries (inert while RETRY_ENABLED is False)
RETRY_TIMES = 5
```

```python
# spider.py -- set a proxy on a single request
yield scrapy.FormRequest(self.start_url, method="POST", formdata=self.form_data,
                         meta={'proxy': "http://59.44.247.194:9797"})
```
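A detail worth checking: on Python 3, `base64.b64encode` returns bytes, so the original `'Basic ' + encoded_user_pass` would raise a `TypeError`; the prefix has to be bytes as well. A quick standalone check, using the placeholder credentials from the list above:

```python
import base64
from scrapy.utils.python import to_bytes

user_pass = 'root@123456'  # placeholder credentials from the PROXIES list
encoded = base64.b64encode(to_bytes(user_pass))
header = b'Basic ' + encoded  # bytes + bytes; 'Basic ' + encoded would raise TypeError
print(header)                 # b'Basic cm9vdEAxMjM0NTY='
```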
Source: https://www.cnblogs.com/yoyo1216/p/11492646.html