Scrapy

# 创建project
scrapy startproject wyb

wyb
    wyb/
        spiders             # 爬虫文件
            chouti.py
        items.py            # 持久化
        middlewares.py      # 中间件
        pipelines.py        # 持久化
        settings.py         # 项目配置文件
    scrapy.cfg              # 配置文件(部署)

cd wyb
# 创建爬虫
scrapy genspider chouti chouti.com

# 启动爬虫
scrapy crawl chouti
scrapy crawl chouti --nolog   # 不打印日志
简单实例:用scrapy爬取抽屉热榜
# -*- coding: utf-8 -*-
import sys
import os
import io
from urllib.parse import urljoin

import scrapy
from scrapy.http import Request

# Console-encoding workaround: if printing non-ASCII titles raises
# UnicodeEncodeError (e.g. on a GBK Windows console), re-wrap stdout:
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')


class ChoutiSpider(scrapy.Spider):
    """Crawl the chouti.com hot list and log the link of every entry."""

    name = 'chouti'
    # Restrict the crawl to this domain; requests to other domains are
    # dropped by Scrapy's offsite filtering.
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    # Host that the pagination hrefs are resolved against.
    base_url = 'https://dig.chouti.com'

    def parse(self, response):
        """Record the entries of one list page, then follow the pager.

        For every entry, the first link href and the first link text found
        inside it are printed, and the href is appended to ``new.log``.
        Each pagination link is then requested with this same method as
        its callback.

        Args:
            response: The downloaded list page.

        Yields:
            scrapy.http.Request: One request per pagination link.
        """
        item_list = response.xpath(
            '//div[@id="content-list"]/div[@class="item"]'
        )
        # The context manager closes the log even if an entry raises.
        with open('new.log', 'a+', encoding='utf-8') as log_file:
            for item in item_list:
                href = item.xpath('.//a/@href').extract_first()
                if href is None:
                    # extract_first() returns None when nothing matches;
                    # an entry without a link has nothing to record.
                    continue
                # The text node may be missing as well; fall back to ''.
                text = item.xpath('.//a/text()').extract_first() or ''
                print(href, text.strip())
                log_file.write(href + '\n')

        page_list = response.xpath(
            '//div[@id="dig_lcpage"]//a/@href'
        ).extract()
        for page in page_list:
            # urljoin handles relative and absolute hrefs alike, unlike
            # plain string concatenation.
            next_url = urljoin(self.base_url, page)
            # Issue a follow-up request; parse() handles that page too.
            yield Request(url=next_url, callback=self.parse)
原文:https://www.cnblogs.com/Alexephor/p/11432195.html