Create a Scrapy Project
# https://github.com/My-Sun-Shine/Python/tree/master/Python3/Scrapy_Learn/Scrapy_A
scrapy startproject Scrapy_A
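For orientation, the layout generated by scrapy startproject looks roughly like this (names from the default Scrapy project template):

Scrapy_A/
    scrapy.cfg            # deployment configuration
    Scrapy_A/
        __init__.py
        items.py          # Item definitions
        middlewares.py    # spider / downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py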
# Create the quotes Spider (generates quotes.py)
scrapy genspider quotes quotes.toscrape.com

# quotes.py
# -*- coding: utf-8 -*-
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'  # unique name within the project, distinguishes this Spider from others
    allowed_domains = ['quotes.toscrape.com']  # domains allowed to be crawled; initial or follow-up requests outside them are filtered out
    start_urls = ['http://quotes.toscrape.com/']  # URLs crawled when the spider starts; they define the initial requests

    def parse(self, response):
        # parses the returned response, extracts data, or generates further requests to process
        """By default, once the requests built from the links in start_urls have been downloaded,
        the returned response is passed to this method as its only argument."""
        pass
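Before filling in parse(), selectors can be tried out interactively with scrapy shell; a quick sketch (the CSS classes are the same ones used in the finished spider below):

scrapy shell 'http://quotes.toscrape.com/'
# inside the interactive shell:
response.css('.quote')                                         # list of quote blocks
response.css('.quote .text::text').extract_first()             # text of the first quote
response.css('.pager .next a::attr("href")').extract_first()   # relative link to the next page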
# items.py
import scrapy

class QuoteItem(scrapy.Item):
    """An Item must inherit from scrapy.Item and declare its fields as scrapy.Field"""
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
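An Item behaves like a dict restricted to its declared fields; a small illustration:

item = QuoteItem(text='...', author='...', tags=['love'])
item['text']              # read a field
dict(item)                # convert to a plain dict (used later when inserting into MongoDB)
# item['foo'] = 1         # would raise KeyError: only declared Field names are accepted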
# quotes.py
import scrapy
from Scrapy_A.items import QuoteItem

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """response is the result of crawling the links in start_urls;
        parse it here with CSS selectors or XPath selectors"""
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()  # instantiate the data Item
            # extract_first() returns the first match; extract() returns a list of all matches
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item
        next_page = response.css('.pager .next a::attr("href")').extract_first()  # link to the next page
        url = response.urljoin(next_page)
        # url: the request URL; callback: called with the response of that URL, here parse() again
        yield scrapy.Request(url=url, callback=self.parse)
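The same extraction can also be written with XPath selectors instead of CSS; a sketch assuming the usual quotes.toscrape.com markup (span.text, small.author, div.tags a.tag, li.next):

quotes = response.xpath('//div[@class="quote"]')
for quote in quotes:
    text = quote.xpath('.//span[@class="text"]/text()').extract_first()
    author = quote.xpath('.//small[@class="author"]/text()').extract_first()
    tags = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract()
next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()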
scrapy crawl quotes                      # print the scraped results to the console
scrapy crawl quotes -o quotes.json       # save the results as JSON
scrapy crawl quotes -o quotes.jl         # one JSON object per Item per line; jl is short for JSON Lines
scrapy crawl quotes -o quotes.jsonlines  # one JSON object per Item per line
scrapy crawl quotes -o quotes.csv        # save the results as CSV
scrapy crawl quotes -o quotes.xml        # save the results as XML
scrapy crawl quotes -o quotes.pickle     # save the results as a Pickle file
scrapy crawl quotes -o quotes.marshal    # save the results in marshal format
# FTP remote export; requires user name, password, host address and output path
scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/to/quotes.csv
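Since Scrapy 2.1 the same exports can also be configured once in settings.py through the FEEDS setting instead of passing -o on every run; a minimal sketch:

# settings.py
FEEDS = {
    'quotes.json': {'format': 'json', 'encoding': 'utf8'},
    'quotes.csv': {'format': 'csv'},
}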
# pipelines.py
# -*- coding: utf-8 -*-
from scrapy.exceptions import DropItem
import pymongo

class TextPipeline(object):
    def __init__(self):
        self.limit = 50

    def process_item(self, item, spider):
        """Must return a dict or Item containing the data, or raise an exception;
        item: the Item produced by the spider; spider: the spider instance"""
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].strip() + '...'
            return item
        else:
            raise DropItem("Missing Text")

class MongoPipeline(object):  # store items in MongoDB
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri  # connection URI
        self.mongo_db = mongo_db    # database name

    @classmethod  # marks this as a dependency-injection style constructor
    def from_crawler(cls, crawler):
        """:param crawler: gives access to the global configuration defined in settings.py"""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):  # called when the spider is opened
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):  # insert the data
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # insert_one(): the old insert() was removed in PyMongo 4
        return item

    def close_spider(self, spider):  # called when the spider is closed
        self.client.close()
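To check the truncation logic in TextPipeline, it can be exercised directly on an overlong item (an illustrative snippet, not part of the project; spider is unused here, so None is passed):

from Scrapy_A.pipelines import TextPipeline
from Scrapy_A.items import QuoteItem

pipeline = TextPipeline()
item = QuoteItem(text='x' * 80, author='someone', tags=[])
processed = pipeline.process_item(item, spider=None)
print(len(processed['text']))  # 53 = 50 kept characters + '...'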
# settings.py
# the key is the Pipeline class path, the value is its priority; the smaller the number, the earlier that Pipeline is called
ITEM_PIPELINES = {
    'Scrapy_A.pipelines.TextPipeline': 300,
    'Scrapy_A.pipelines.MongoPipeline': 400,
}
MONGO_URI = 'localhost'
MONGO_DB = 'Scrapy_A'
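After a crawl with both pipelines enabled, the stored documents can be spot-checked with a short script (a sketch assuming MongoDB is running locally; the collection is named after the Item class, as MongoPipeline does above):

import pymongo

client = pymongo.MongoClient('localhost')
db = client['Scrapy_A']
for doc in db['QuoteItem'].find().limit(3):
    print(doc['author'], doc['text'][:30])
client.close()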
From: https://www.cnblogs.com/My-Sun-Shine/p/13551128.html
Source: https://www.cnblogs.com/xuan52rock/p/14386018.html