# settings.py
DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'  # default request dedup class
DUPEFILTER_DEBUG = False  # log every duplicate, not just the first one
JOBDIR = "directory where the seen-request log is stored, e.g. /root/"  # final path will be /root/requests.seen
DUPEFILTER_CLASS = 'path.to.your.own.dedup.class'  # replace with your own dedup class
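As an illustration of what such a custom class could look like (this sketch is my own, not from the original post; the module path and the utm_source parameter are hypothetical), you can subclass RFPDupeFilter and only change how the fingerprint is computed, e.g. strip a tracking query parameter so URLs differing only in it are treated as duplicates:

# myproject/mydupefilters.py  (hypothetical module; enable it with
# DUPEFILTER_CLASS = 'myproject.mydupefilters.QueryStrippingDupeFilter')
from w3lib.url import url_query_cleaner

from scrapy.dupefilters import RFPDupeFilter
from scrapy.utils.request import request_fingerprint


class QueryStrippingDupeFilter(RFPDupeFilter):
    """Treat requests as duplicates even if they differ only in a tracking parameter."""

    def request_fingerprint(self, request):
        # Remove the (hypothetical) 'utm_source' parameter before fingerprinting,
        # so URLs that differ only in that parameter collapse to one fingerprint.
        cleaned = request.replace(
            url=url_query_cleaner(request.url, ['utm_source'], remove=True))
        return request_fingerprint(cleaned)

Everything else (persistence to requests.seen, logging, stats) is inherited from RFPDupeFilter unchanged.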
# scrapy/dupefilters.py (excerpt -- BaseDupeFilter is defined earlier in the same module)
import os
import logging

from scrapy.utils.job import job_dir
from scrapy.utils.request import referer_str, request_fingerprint


class RFPDupeFilter(BaseDupeFilter):
    """Request Fingerprint duplicates filter"""

    def __init__(self, path=None, debug=False):
        self.file = None
        self.fingerprints = set()  # fingerprints are stored in a set by default; can be overridden
        self.logdupes = True
        self.debug = debug
        self.logger = logging.getLogger(__name__)
        if path:
            # With JOBDIR set, fingerprints are persisted to <path>/requests.seen
            self.file = open(os.path.join(path, 'requests.seen'), 'a+')
            self.file.seek(0)
            self.fingerprints.update(x.rstrip() for x in self.file)

    @classmethod
    def from_settings(cls, settings):
        # Unlike from_crawler, which receives the crawler, this receives the settings directly
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(job_dir(settings), debug)

    def request_seen(self, request):
        # Called for every Request entering the scheduler: return True if it has
        # already been seen; otherwise record its fingerprint and fall through
        # (implicitly returning None, which is falsy)
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + '\n')

    def request_fingerprint(self, request):
        # The fingerprint itself is computed by scrapy.utils.request.request_fingerprint,
        # reproduced here for reference:
        """
        def request_fingerprint(request, include_headers=None, keep_fragments=False):
            if include_headers:  # optionally include selected request headers
                include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
            cache = _fingerprint_cache.setdefault(request, {})
            cache_key = (include_headers, keep_fragments)
            if cache_key not in cache:
                fp = hashlib.sha1()  # SHA-1 over method + canonical URL + body (+ headers)
                fp.update(to_bytes(request.method))
                fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
                fp.update(request.body or b'')
                if include_headers:
                    for hdr in include_headers:
                        if hdr in request.headers:
                            fp.update(hdr)
                            for v in request.headers.getlist(hdr):
                                fp.update(v)
                cache[cache_key] = fp.hexdigest()
            return cache[cache_key]
        """
        return request_fingerprint(request)

    def close(self, reason):
        if self.file:
            self.file.close()

    def log(self, request, spider):
        if self.debug:
            msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
            args = {'request': request, 'referer': referer_str(request)}
            self.logger.debug(msg, args, extra={'spider': spider})
        elif self.logdupes:
            msg = ("Filtered duplicate request: %(request)s"
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            self.logger.debug(msg, {'request': request}, extra={'spider': spider})
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
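To see what the fingerprint actually captures, the short snippet below (my own illustration, not part of the original post; example.com URLs are placeholders) fingerprints two requests whose URLs differ only in query-parameter order and fragment; both canonicalize to the same URL, so their fingerprints match, while changing the method or body changes the fingerprint:

from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Parameter order and the fragment are normalized away by canonicalize_url,
# so these two requests produce the same SHA-1 fingerprint ...
r1 = Request('http://example.com/page?a=1&b=2')
r2 = Request('http://example.com/page?b=2&a=1#section')
print(request_fingerprint(r1) == request_fingerprint(r2))  # True

# ... while a different method/body gives a different fingerprint
r3 = Request('http://example.com/page?a=1&b=2', method='POST', body='x=1')
print(request_fingerprint(r1) == request_fingerprint(r3))  # False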
Source: https://www.cnblogs.com/nuochengze/p/13377112.html