Concept: detect when a website has published new data and crawl only the newly updated entries (incremental crawling).
Core idea: deduplication.
Record table: must be persisted; a Redis set serves as the record table here.
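A minimal sketch of the dedup check, assuming a local Redis instance, the redis-py client, and a hypothetical detail URL:

from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)
# sadd returns the number of members actually added to the set:
# 1 -> the URL is new and should be crawled; 0 -> it was seen before
detail_url = 'https://example.com/detail/1'  # hypothetical URL for illustration
if conn.sadd('movie_record', detail_url) == 1:
    print('new URL, crawl it')
else:
    print('already recorded, skip it')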
Code implementation (the spider):
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from moviePro.items import MovieproItem


class MovieSpider(CrawlSpider):
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    conn = Redis(host='127.0.0.1', port=6379)
    # Extract the pagination links
    link = LinkExtractor(allow=r'frim/index1-\d+\.html')
    rules = (
        Rule(link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Parse the movie name and the detail-page URL from each list entry
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            item = MovieproItem()
            item['name'] = name
            detail_url = 'https://www.4567tv.tv' + li.xpath('./div/a/@href').extract_first()
            # sadd returns 1 only if the URL was not already in the record set
            ex = self.conn.sadd('movie_record', detail_url)
            if ex == 1:  # this movie was not in the record table before
                print('New data detected!')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('No new data to crawl...')

    def parse_detail(self, response):
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
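The spider imports MovieproItem from moviePro.items. The original post does not show items.py, but from the two fields assigned above it would presumably look like this sketch:

# moviePro/items.py -- fields inferred from the spider code, not shown in the original post
import scrapy

class MovieproItem(scrapy.Item):
    name = scrapy.Field()  # movie title from the list page
    desc = scrapy.Field()  # movie description from the detail page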
Code in pipelines.py:
import json


class MovieproPipeline(object):
    def process_item(self, item, spider):
        conn = spider.conn  # reuse the Redis connection created on the spider
        # redis-py cannot store a scrapy Item directly, so serialize it to JSON first
        conn.lpush('movieData', json.dumps(dict(item)))
        return item
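For the pipeline to run, it must be enabled in settings.py. A minimal sketch, assuming the default Scrapy project layout (the priority value 300 is an arbitrary choice):

# moviePro/settings.py (excerpt)
ITEM_PIPELINES = {
    'moviePro.pipelines.MovieproPipeline': 300,
}

The stored results can then be inspected from the command line, e.g. redis-cli lrange movieData 0 -1.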
Source: https://www.cnblogs.com/zhufanyu/p/12020539.html