
Scrapy: deleting old data from a previous crawl before re-crawling


Question: After starting Scrapy I want to re-crawl the data for a given day, but the old data from the previous run has to be deleted before the crawl starts. Where should the deletion be implemented?

The deletion can be done in the pipeline's open_spider(self, spider) method: Scrapy calls it once when the spider starts, so the old rows are removed before any new items are scraped.
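For open_spider() to be called at all, the pipeline has to be registered in the project's settings.py. A minimal sketch; the module path and the priority value are assumptions based on the package name used in the imports of the pipelines file below:

# settings.py -- register the MySQL pipeline (module path assumed)
ITEM_PIPELINES = {
    "riskspiders.pipelines.RiskspidersMySQLPipeline": 300,
}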

The pipelines.py file is shown below.

 

# -*- coding: utf-8 -*-
import sys
sys.path.append("/apps/jr_python/riskspiders")
from riskspiders.utils import DButil
from riskspiders.settings import DATABASE_PRM
import logging
import hashlib
logger = logging.getLogger(__name__)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class RiskspidersPipeline(object):
    # Connect to the database; the connection is opened as soon as the class is initialised
    db = DButil(DATABASE_PRM)
    def process_item(self, item, spider):
        return item

class RiskspidersMySQLPipeline(object):

    # The database connection for this pipeline is opened in open_spider() when the spider starts
    # def __init__(self):
    #     self.md = hashlib.md5()
    def open_spider(self, spider):
        print("open_spider, %s" % spider.name)
        self.db = DButil(DATABASE_PRM)
        for day in spider.day_list:
            sql_del = """delete from riskinfo where spider = '{}' and release_time = '{}';""".format(spider.name, day)
            try:
                self.db.execute(sql_del)
            except Exception as e:
                print(e)

    def close_spider(self,spider):
        self.db.close()
        # Most of the collected stats can be printed here, but values such as finish_time are not
        # available yet, because the crawl has not fully finished at this point
        print(spider.crawler.stats.get_stats())

    def process_item(self,item,spider):
        db = DButil(DATABASE_PRM)

        # Insert row by row; on a duplicate key, update the existing row instead
        if spider.name == "hexun_bankdata":
            # print("***** item_bank insert MySQL")
            logger.info("***** item_bank insert MySQL")

            pa = (
                item["source"], item["spider"], item["website_menu"], item["disclosure_period"],
                item["bank_abbreviation"], item["total_assets"],
                item["capital_adequancy_ratio"], item["core_capital_adequancy_ratio"], item["bad_loan_ratio"],
                item["provision_coverage"], item["url"], item["cra_time"], item["cra_time"])
            sql_data = """insert into hexun_bankdata(source,spider,website_menu,disclosure_period,bank_abbreviation,total_assets,capital_adequancy_ratio,core_capital_adequancy_ratio,bad_loan_ratio,provision_coverage,url,cra_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update cra_time = %s;"""

            try:
                db.execute(sql_data, pa)
            except Exception as e:
                print(e)
                logger.error(e)
            finally:
                db.close()
        else:
            md = hashlib.md5()
            str1 = "%s%s" % (item["title"], item["content"])
            md.update(str1.encode("utf-8"))
            md_value = md.hexdigest()
            # print("str1 is %s, md_value is %s" % (str1, md_value))
            logger.info("***** item_bank insert MySQL")
            params = (
                item["source"], item["spider"], item["website_menu"], item["release_time"], item["key_words"],
                item["neg_key_words"], item["title"].strip(),
                item["source_type"], item["f_name"], item["is_include_tbl"], item["content"].strip(),
                item["content_web"], item["url"],
                item["father_url"], item["cra_time"], md_value, item["cra_time"]
            )
            try:
                db.execute(
                """
                insert into riskinfo
                (source, spider,website_menu, release_time, key_words,neg_key_words, title, source_type,f_name, is_include_tbl,content,content_web, url,father_url,cra_time,content_id)
                values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update  cra_time = %s;
                """, params
                )
            except Exception as e:
                print(e)
                logger.error(e)
            finally:
                db.close()
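The pipeline above leans on two things that are outside pipelines.py: the spider must expose a day_list attribute for open_spider() to iterate over, and DButil / DATABASE_PRM must provide the database access. Neither is shown in the original post, so the following are only rough sketches.

A hypothetical spider-side snippet; the class name, the days argument and the date format are assumptions, and only name and day_list are actually required by the pipeline:

# hypothetical spider -- only name and day_list are used by the pipeline above
import scrapy

class RiskInfoSpider(scrapy.Spider):
    name = "risk_info_example"  # assumed name; it is what gets written into the spider column

    def __init__(self, days="2020-03-18,2020-03-19", *args, **kwargs):
        super().__init__(*args, **kwargs)
        # open_spider() deletes every riskinfo row whose release_time is in this list
        self.day_list = days.split(",")

With a spider like this, the days to re-crawl could be passed on the command line, e.g. scrapy crawl risk_info_example -a days=2020-03-18.

DButil lives in riskspiders.utils and is project-specific. A minimal sketch of what it would have to provide to satisfy the execute(sql, params) and close() calls above, assuming a pymysql backend and that DATABASE_PRM is a dict of connection parameters:

# hypothetical stand-in for riskspiders.utils.DButil, assuming pymysql
import pymysql

class DButil(object):
    def __init__(self, prm):
        # prm is assumed to be a dict such as {"host": ..., "user": ..., "password": ..., "db": ...}
        self.conn = pymysql.connect(charset="utf8mb4", **prm)

    def execute(self, sql, params=None):
        # run a single statement and commit, which is all the pipeline needs
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
        self.conn.commit()

    def close(self):
        self.conn.close()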


Original post: https://www.cnblogs.com/yoyowin/p/12521207.html
