How to filter duplicate requests based on URL in Scrapy
You can write a custom dupe filter for duplicate removal and point to it in settings:
import os

from scrapy.dupefilter import RFPDupeFilter


class CustomFilter(RFPDupeFilter):
    """A dupe filter that considers specific ids in the url"""

    def __getid(self, url):
        mm = url.split("&refer")[0]  # or something like that
        return mm

    def request_seen(self, request):
        fp = self.__getid(request.url)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
Then you need to set the correct DUPEFILTER_CLASS in settings.py:
DUPEFILTER_CLASS = 'scraper.duplicate_filter.CustomFilter'
It should work after that.
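To see what that fingerprint actually does, here is a tiny illustration (the URLs are made up): two URLs that differ only in their &refer tail collapse to the same fingerprint, so the second request gets dropped:

url_a = "http://www.example.com/page?id=1&refer=home"
url_b = "http://www.example.com/page?id=1&refer=sidebar"

# both reduce to "http://www.example.com/page?id=1"
assert url_a.split("&refer")[0] == url_b.split("&refer")[0]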
Following ytomar's lead, I wrote this filter that filters based purely on URLs that have already been seen by checking an in-memory set. I'm a Python noob so let me know if I screwed something up, but it seems to work all right:
from scrapy.dupefilter import RFPDupeFilter


class SeenURLFilter(RFPDupeFilter):
    """A dupe filter that considers the URL"""

    def __init__(self, path=None):
        self.urls_seen = set()
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        if request.url in self.urls_seen:
            return True
        else:
            self.urls_seen.add(request.url)
As ytomar mentioned, be sure to add the DUPEFILTER_CLASS constant to settings.py:
DUPEFILTER_CLASS = 'scraper.custom_filters.SeenURLFilter'
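One thing to note: the set above lives only in memory, so it starts empty on every crawl. If you also want the seen URLs to survive restarts, a possible variant (just a sketch, assuming a JOBDIR is configured so Scrapy passes a path to the filter, as the stock RFPDupeFilter expects) would be:

import os

from scrapy.dupefilter import RFPDupeFilter


class PersistentSeenURLFilter(RFPDupeFilter):
    """Hypothetical variant: keeps the in-memory URL set, but also reuses the
    requests.seen file that RFPDupeFilter opens when a path is given."""

    def __init__(self, path=None):
        self.urls_seen = set()
        RFPDupeFilter.__init__(self, path)
        # RFPDupeFilter has already read the file's lines into
        # self.fingerprints; treat them as previously seen URLs.
        self.urls_seen.update(self.fingerprints)

    def request_seen(self, request):
        if request.url in self.urls_seen:
            return True
        self.urls_seen.add(request.url)
        if self.file:
            self.file.write(request.url + os.linesep)
        return False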
https://github.com/scrapinghub/scrapylib/blob/master/scrapylib/deltafetch.py
This file might help you. It builds a store of unique delta-fetch keys for URLs that the spider passes in via scrapy.Request(meta={'deltafetch_key': unique_url_key}), which lets you skip requests for pages you have already visited in the past.
A sample MongoDB implementation based on deltafetch.py (sketched as the body of process_spider_output, with self.db assumed to be a pymongo database):
def process_spider_output(self, response, result, spider):
    for r in result:
        if isinstance(r, Request):
            key = self._get_key(r)
            key = key + spider.name
            if self.db['your_collection_to_store_deltafetch_key'].find_one({"_id": key}):
                spider.log("Ignoring already visited: %s" % r, level=log.INFO)
                continue
        elif isinstance(r, BaseItem):
            key = self._get_key(response.request)
            key = key + spider.name
            try:
                self.db['your_collection_to_store_deltafetch_key'].insert({"_id": key, "time": datetime.now()})
            except Exception:
                # a duplicate _id means the key was already stored
                spider.log("Ignoring already visited: %s" % key, level=log.ERROR)
        yield r
E.g. for id = 345: scrapy.Request(url, meta={'deltafetch_key': 345}, callback=self.parse)
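For completeness, here is a minimal sketch of how a spider might attach that key (the spider, selector and key derivation are made up; the middleware path and DELTAFETCH_ENABLED setting follow scrapylib's deltafetch.py, so check them against the version you use):

# settings.py
SPIDER_MIDDLEWARES = {
    'scrapylib.deltafetch.DeltaFetch': 100,
}
DELTAFETCH_ENABLED = True

# spider
import scrapy

class ProductSpider(scrapy.Spider):
    name = 'products'
    start_urls = ['http://www.example.com/products/']

    def parse(self, response):
        for href in response.css('a.product::attr(href)').extract():
            # e.g. ".../cat1/345.html" -> "345"
            product_id = href.split('/')[-1].split('.')[0]
            yield scrapy.Request(
                response.urljoin(href),
                meta={'deltafetch_key': product_id},
                callback=self.parse_product,
            )

    def parse_product(self, response):
        yield {'url': response.url}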
Here is my custom filter, based on Scrapy 0.24.6.
This filter only cares about the numeric id in the URL. For example
http://www.example.com/products/cat1/1000.html?p=1
http://www.example.com/products/cat2/1000.html?p=2
are treated as the same URL. But
http://www.example.com/products/cat2/all.html
will not be.
import re

from scrapy.dupefilter import RFPDupeFilter


class MyCustomURLFilter(RFPDupeFilter):

    def _get_id(self, url):
        m = re.search(r'(\d+)\.html', url)
        return None if m is None else m.group(1)

    def request_fingerprint(self, request):
        style_id = self._get_id(request.url)
        if style_id is None:
            # no id in the URL: fall back to the default fingerprint
            return RFPDupeFilter.request_fingerprint(self, request)
        return style_id
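As with the other answers, the filter only takes effect once settings.py points at it (the module path below is just a placeholder for wherever you save the class):

DUPEFILTER_CLASS = 'yourproject.custom_filters.MyCustomURLFilter'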