pip3 install Scrapy
scrapy startproject tutorial
tutorial/
    scrapy.cfg
    tutorial/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ...
scrapy genspider quotes quotes.toscrape.com
import scrapy


class QuoteItem(scrapy.Item):
    """Container for one scraped quote."""

    # A scrapy.Item only accepts keys that are declared as fields, so these
    # names must match the keys the spider assigns:
    # item['text'], item['author'] and item['tags'].
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
scrapy crawl quotes
def parse(self, response):
    """Yield one QuoteItem for every quote block on the page.

    Spider callback: ``response`` is the downloaded page. Each element
    matching the ``.quote`` CSS class is treated as one quote.
    """
    for quote in response.css('.quote'):  # select every quote block
        item = QuoteItem()
        # extract_first() gives the first match (or None); extract() gives
        # the list of all matches, which is what we want for the tags.
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
scrapy shell quotes.toscrape.com #可以在命令行交互
def parse(self, response):
    """Follow the pager's "next" link, if there is one.

    In the full spider this logic runs at the end of ``parse``, after the
    items of the current page have been yielded. The next page is handled
    by this same callback.
    """
    # `next_page` rather than `next`, to avoid shadowing the builtin.
    next_page = response.css('.pager .next a::attr(href)').extract_first()
    # The last page has no "next" link; extract_first() then returns None
    # and there is nothing more to request.
    if next_page is not None:
        url = response.urljoin(next_page)  # href is relative to the page URL
        yield scrapy.Request(url=url, callback=self.parse)
scrapy crawl quotes -o quotes.json #也可以保存成csv,xml等不同文件格式
from scrapy.exceptions import DropItem


class TextPipeline:
    """Item pipeline that shortens long quote text and drops items without text."""

    def __init__(self):
        # Maximum number of characters of item['text'] that are kept.
        self.limit = 50

    def process_item(self, item, spider):
        """Return the item with its text truncated to ``self.limit`` characters.

        Text longer than the limit is cut, stripped of trailing whitespace
        and suffixed with '...'.

        Raises:
            DropItem: if the item has no (or empty) text. The exception must
                be raised, not returned; a returned DropItem instance would
                be passed on as if it were the processed item.
        """
        text = item.get('text')
        if not text:
            raise DropItem('Missing Text')
        if len(text) > self.limit:
            item['text'] = text[0:self.limit].rstrip() + '...'
        return item
##本系列内容为《python3爬虫开发实战》学习笔记。本系列博客列表如下:
持续更新...
原文:https://www.cnblogs.com/geo-will/p/9727020.html