Scraping all URLs of a website with Scrapy does not retrieve the complete set of URLs associated with that domain
spider.py:
import re

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner


def process_links(links):
    for link in links:
        link.url = url_query_cleaner(link.url)
        yield link


class myCrawler(CrawlSpider):
    name = 'symphony'
    allowed_domains = ['laphil.com']
    start_urls = ['https://www.laphil.com']
    base_url = 'https://www.laphil.com'

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'tempbuffer.middlewares.RotateUserAgentMiddleware': 400},
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,

        # These settings are a must:
        # Duplicates pipeline
        'ITEM_PIPELINES': {'tempbuffer.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}},
    }

    rules = (
        Rule(
            LinkExtractor(allow_domains='laphil.com'),
            process_links=process_links,
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        yield {
            'url': response.url
        }

        # see if you really need this loop (since you're crawling every url in the
        # domain anyway, it mostly produces duplicates that have to be filtered out):
        all_urls = response.css('a::attr(href)').getall()

        # in order to change from relative to absolute url in the pipeline:
        self.base_url = response.url

        for url in all_urls:
            yield {
                'url': url
            }
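
Side note: if you would rather resolve relative links inside the spider instead of in the pipeline, Scrapy's response.urljoin() can do that at the point of extraction and removes the need for the base_url bookkeeping. A minimal sketch of just the parse_item method (the rest of the class stays the same):

    def parse_item(self, response):
        # the page itself
        yield {'url': response.url}

        # resolve each href against the url of the page it was found on,
        # so the pipeline only ever sees absolute urls
        for href in response.css('a::attr(href)').getall():
            yield {'url': response.urljoin(href)}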
pipelines.py:
from scrapy.exceptions import DropItem


class DuplicatesPipeline:
    def __init__(self):
        self.scraped_urls = set()

    def process_item(self, item, spider):
        url = item['url'].strip()

        # if it's a relative url then convert it to an absolute url
        if 'http' not in url:
            url = spider.base_url + url
            item['url'] = url

        if url in self.scraped_urls:
            raise DropItem(f'Duplicate url: "{url}"')
        else:
            self.scraped_urls.add(url)
            return item
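
If you keep the conversion in the pipeline, be aware that base_url + url concatenation can misbehave once base_url points at a page with its own path, or when the href looks like ../about or //host/path. The standard library's urllib.parse.urljoin covers those cases and leaves absolute URLs untouched. A minimal sketch of the same pipeline using it (base_url is still assumed to hold the URL of the page the link came from):

from urllib.parse import urljoin

from scrapy.exceptions import DropItem


class DuplicatesPipeline:
    def __init__(self):
        self.scraped_urls = set()

    def process_item(self, item, spider):
        # urljoin resolves relative urls against the page url and
        # returns absolute urls unchanged
        url = urljoin(spider.base_url, item['url'].strip())
        item['url'] = url

        if url in self.scraped_urls:
            raise DropItem(f'Duplicate url: "{url}"')

        self.scraped_urls.add(url)
        return item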
- I didn't add the middlewares.py file since it's not necessary and it has a lot of lines (a minimal sketch of what it could look like is shown after this list).
- In your code, instead of tempbuffer it should be your project's name (you'll need to replace it in custom_settings for both the middleware and the pipeline entries).
- I added the domain to the rules as the only allowed domain, so you won't scrape anything you don't want.
- I verified that the duplicates pipeline works and that it actually creates a CSV file with one column.
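
For reference, since middlewares.py wasn't included, below is a minimal sketch of what a RotateUserAgentMiddleware could look like; the class body and the user agent strings are illustrative assumptions, not the original file:

# middlewares.py - illustrative sketch only, not the original file
import random


class RotateUserAgentMiddleware:
    # assumption: a small hard-coded pool of user agent strings to rotate through
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
    ]

    def process_request(self, request, spider):
        # set a random user agent on every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None  # returning None lets the request continue through the downloader

With tempbuffer replaced by your project's name, running scrapy crawl symphony should then produce csv_file.csv as configured in FEEDS.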