首页 > 其他 > 详细

Scrapy 框架 增量式

时间:2019-04-22 13:19:32      阅读:113      评论:0      收藏:0      [点我收藏+]

增量式爬虫:

  • 用来检测网站中数据的更新情况,只爬取新增或更新的数据,已经爬取过的数据不再重复爬取

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from redis import Redis


class DianyingSpider(CrawlSpider):
    """Incremental spider for the movie listing pages of www.4567tv.tv.

    Each detail-page URL found on a listing page is added to the Redis set
    ``dianying``. Only URLs that were not already in the set are requested,
    so URLs recorded earlier are skipped.
    """
    name = 'dianying'
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/8/page/1.html']
    # Matches the pagination links of the listing that start_urls points at.
    link = LinkExtractor(allow=r'/index.php/vod/show/id/8/page/\d+\.html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )
    # Shared Redis client; the set of already-seen detail URLs lives there.
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        """Request the detail page of every movie whose URL is new.

        Args:
            response: A listing-page response matched by ``rules``.

        Yields:
            One request per detail URL newly added to the Redis set
            ``dianying``; its response is handled by ``detail_callback``.
        """
        li_list = response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]')

        for li in li_list:
            href = li.xpath('./div/a/@href').extract_first()
            if href is None:
                # Entry without a link: concatenating None would raise
                # TypeError and abort the rest of the page.
                continue
            detail_url = 'https://www.4567tv.tv' + href

            # sadd returns how many members were actually added:
            # 1 for a URL not seen before, 0 for one already in the set.
            if_num = self.conn.sadd('dianying', detail_url)
            print(if_num)
            if if_num:
                print('有最新数据的更新......')
                # response.follow builds the Request, so no scrapy import is needed.
                yield response.follow(detail_url, callback=self.detail_callback)
            else:
                print('暂无最新数据可爬取......')

    def detail_callback(self, response):
        """Print the title and the text of the second detail paragraph.

        Args:
            response: A detail-page response requested by ``parse_item``.
        """
        title = response.xpath('//h1/text()').extract_first()
        # The variable name suggests this paragraph lists the lead actors;
        # NOTE(review): confirm against the page markup.
        zhuyan = response.xpath('//div[@class="stui-content__detail"]/p[2]//text()').extract()
        print(title, zhuyan)

对于文本内容,没有唯一的 URL 可用来去重,可以对数据本身生成数据指纹(哈希值),再存入 Redis 的 set 中判断是否已经爬取过:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from qiubaiPro.items import QiubaiproItem
import hashlib
class QiubaiSpider(CrawlSpider):
    """Incremental spider for the paginated ``/text/`` listing.

    The entries have no per-item URL to deduplicate on, so each one is
    identified by a SHA-256 fingerprint of its author and content. The
    fingerprints are kept in the Redis set ``if_data``, and only entries
    with a new fingerprint are yielded.
    """
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.******.com/text/']
    # Shared Redis client; the set of already-seen fingerprints lives there.
    conn = Redis(host='127.0.0.1', port=6379)
    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield an item for every entry whose fingerprint is new.

        Args:
            response: A listing-page response matched by ``rules``.

        Yields:
            A ``QiubaiproItem`` with ``author`` and ``content`` set, for each
            entry whose fingerprint was newly added to the Redis set.
        """
        div_list = response.xpath('//div[@id="content-left"]/div')

        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = ''.join(
                div.xpath('.//div[@class="content"]/span//text()').extract()
            )

            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content

            # extract_first() gives None when the author node is missing, and
            # None + str would raise TypeError. Use '' in the fingerprint
            # only; the item keeps the value that was extracted.
            data = (author or '') + content
            # Build a fingerprint that identifies this entry.
            data_hash = hashlib.sha256(data.encode()).hexdigest()
            # sadd returns 1 when the fingerprint was not in the set yet.
            ex = self.conn.sadd('if_data', data_hash)
            if ex == 1:
                print('数据更新,可爬......')
                yield item
            else:
                print('暂无更新数据......')

Scrapy 框架 增量式

原文:https://www.cnblogs.com/zhang-zi-yi/p/10749454.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!