Scraping all URLs in a website using Scrapy not retrieving complete URLs associated with that domain

spider.py:

import re
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner


def process_links(links):
    for link in links:
        # strip query strings so the same page reached with different
        # query parameters is treated as a single URL
        link.url = url_query_cleaner(link.url)
        yield link


class myCrawler(CrawlSpider):
    name = 'symphony'
    allowed_domains = ['laphil.com']
    start_urls = ['https://www.laphil.com']
    base_url = 'https://www.laphil.com'

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'tempbuffer.middlewares.RotateUserAgentMiddleware': 400, },
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 2,

        # These settings are a must:
        
        # Duplicates pipeline
        'ITEM_PIPELINES': {'tempbuffer.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}}
    }

    rules = (
        Rule(
            LinkExtractor(allow_domains='laphil.com'),
            process_links=process_links,
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        yield {
            'url': response.url
        }

        # Consider whether you really need this loop: the CrawlSpider rule already
        # visits every URL in the domain, so most of these links end up as
        # duplicates that the pipeline has to drop.

        all_urls = response.css('a::attr(href)').getall()

        # Store this page's URL so the pipeline can turn relative URLs into absolute ones:
        self.base_url = response.url

        for url in all_urls:
            yield {
                'url': url
            }
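
Note that the pipeline's string concatenation (spider.base_url + url) can produce a wrong URL when the page being parsed is not at the site root (e.g. an href of /tickets found on a page under /about/). As an alternative sketch, not part of the original answer, you could resolve the links in the spider itself with response.urljoin, which handles that case and leaves already-absolute URLs untouched:

        for url in all_urls:
            yield {
                # urljoin returns the href unchanged if it is already absolute
                'url': response.urljoin(url)
            }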

pipeline.py:

from scrapy.exceptions import DropItem


class DuplicatesPipeline:
    def __init__(self):
        self.scraped_urls = set()

    def process_item(self, item, spider):
        url = item['url'].strip()

        # if it's a relative url then convert it to an absolute url
        if not url.startswith(('http://', 'https://')):
            url = spider.base_url + url
            item['url'] = url

        if url in self.scraped_urls:
            raise DropItem(f'Duplicate url: "{url}"')
        else:
            self.scraped_urls.add(url)
            return item

  1. I didn't include the middlewares.py file since it isn't strictly necessary and it is fairly long; a minimal sketch of what it could look like follows this list.
  2. In your code, replace tempbuffer with your own project's name (it appears in custom_settings in both the DOWNLOADER_MIDDLEWARES and ITEM_PIPELINES entries).
  3. In the rule's LinkExtractor I set laphil.com as the only allowed domain, so you won't follow links to sites you don't want to scrape.
  4. I verified that the duplicates pipeline works and that the spider actually creates a CSV file with a single url column.
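
Since the original middlewares.py is not shown, the following is only a minimal sketch of what a user-agent rotating downloader middleware could look like; the user-agent strings and the class body are illustrative assumptions, not the original implementation:

middlewares.py:

import random


class RotateUserAgentMiddleware:
    # example user agents only -- replace with your own list
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:115.0) Gecko/20100101 Firefox/115.0',
    ]

    def process_request(self, request, spider):
        # set a random user agent on every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)
        return None

With the spider, pipeline and middleware in place, running scrapy crawl symphony from the project directory writes the scraped URLs to csv_file.csv, as configured in the FEEDS setting.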