scrapy的增量更新(超级简单)
准备工作
需要安装
· scrapy
· redis(Python 客户端库,可通过 pip install redis 安装;另需一个正在运行的 Redis 服务,下面的代码默认连接 127.0.0.1:6379)
代码
在settings.py所在的目录下创建filters.py,内容如下:
import redis
from scrapy.dupefilters import BaseDupeFilter
from scrapy.utils.request import request_fingerprint
class Myfilter(BaseDupeFilter):
    """Duplicate filter that stores request fingerprints in a Redis set.

    Because the fingerprints live in Redis rather than in process memory,
    they survive between crawler runs, so a request recorded by an earlier
    run is reported as already seen by a later one.

    Optional settings read by ``from_settings`` (defaults reproduce the
    previously hard-coded values):

    * ``DUP_REDIS_KEY``  -- name of the Redis set (default ``'DUP_REDIS_KEY'``)
    * ``DUP_REDIS_HOST`` -- Redis host (default ``'127.0.0.1'``)
    * ``DUP_REDIS_PORT`` -- Redis port (default ``6379``)
    """

    def __init__(self, key, host='127.0.0.1', port=6379):
        """Remember the connection parameters; no connection is made yet.

        Args:
            key: Name of the Redis set that holds the fingerprints.
            host: Redis server host.
            port: Redis server port.
        """
        # The client is created in open(), not here.
        self.conn = None
        self.key = key
        self.host = host
        self.port = port

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from the project settings.

        Every setting is optional; a project that defines none of them gets
        the same key, host and port the filter used before they became
        configurable.
        """
        key = settings.get('DUP_REDIS_KEY', 'DUP_REDIS_KEY')
        host = settings.get('DUP_REDIS_HOST', '127.0.0.1')
        port = settings.getint('DUP_REDIS_PORT', 6379)
        return cls(key, host=host, port=port)

    def open(self):
        """Create the Redis client used by ``request_seen``."""
        self.conn = redis.Redis(host=self.host, port=self.port)

    def request_seen(self, request):
        """Record the request's fingerprint and report whether it is a repeat.

        Returns:
            True if the fingerprint was already in the Redis set, False if
            this call added it.
        """
        # Guard against being called before open(): connect on first use
        # instead of failing with an AttributeError on None.
        if self.conn is None:
            self.open()
        # NOTE(review): recent Scrapy releases deprecate
        # scrapy.utils.request.request_fingerprint -- confirm against the
        # installed Scrapy version before upgrading.
        fingerprint = request_fingerprint(request)
        # SADD returns the number of members that were newly added, so 0
        # means the fingerprint was already present, i.e. a duplicate.
        newly_added = self.conn.sadd(self.key, fingerprint)
        return newly_added == 0
在settings.py中新增一行配置代码(其中 baike 是项目包名,请替换为你自己的项目包名)
# Tell Scrapy to use the Myfilter class from baike/filters.py as its
# duplicate-request filter.
DUPEFILTER_CLASS = 'baike.filters.Myfilter'