Scraping all URLs in a website using Scrapy is not retrieving all the URLs associated with that domain

import re
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner

def process_links(links):
    """Lazily pass every extracted link's URL through ``url_query_cleaner``.

    Each link object is modified in place (its ``url`` attribute is
    overwritten with the cleaned value) and then yielded, so the caller
    receives the same objects it handed in, in the same order.
    """
    for extracted in links:
        cleaned_url = url_query_cleaner(extracted.url)
        extracted.url = cleaned_url
        yield extracted

class myCrawler(CrawlSpider):
    """Crawl spider that emits one ``{'url': ...}`` item per URL it sees.

    ``allowed_domains``, ``start_urls`` and ``base_url`` are empty
    placeholders and must be filled in with the target site before running.
    ``tempbuffer`` in ``custom_settings`` is the project package name and
    has to be replaced with the name of the project this spider lives in.
    """

    name = 'symphony'
    allowed_domains = ['']
    start_urls = ['']
    # URL of the most recently parsed response; the duplicates pipeline
    # reads it to resolve relative URLs.
    base_url = ''

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {
            'tempbuffer.middlewares.RotateUserAgentMiddleware': 400,
        },
        'COOKIES_ENABLED': False,
        'DOWNLOAD_DELAY': 2,

        # These settings are a must:
        # Duplicates pipeline
        'ITEM_PIPELINES': {'tempbuffer.pipelines.DuplicatesPipeline': 300},

        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}},
    }

    rules = (
        # Follow links only within the allowed domain(s), clean each link
        # with process_links, and hand every fetched page to parse_item.
        # follow=True is required because a Rule with a callback does not
        # follow links by default.
        Rule(
            LinkExtractor(allow_domains=allowed_domains),
            process_links=process_links,
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield the page's own URL, then one item per ``<a href>`` on it.

        Args:
            response: the downloaded page handed over by the crawl rule.

        Yields:
            dict: ``{'url': <absolute url>}`` items; duplicates are expected
            and are meant to be filtered by the item pipeline.
        """
        yield {'url': response.url}

        # see if you really need this loop (since you're parsing all the
        # urls in the domain anyway, and you'll need to filter all those
        # duplicates):
        all_urls = response.css('a::attr(href)').getall()

        # Kept so the pipeline can still resolve relative urls itself.
        self.base_url = response.url

        for href in all_urls:
            # Resolve against this response so the item does not depend on
            # whatever base_url holds when the pipeline runs.
            yield {'url': response.urljoin(href)}

from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    """Item pipeline that makes item URLs absolute and drops repeated ones."""

    def __init__(self):
        # Absolute URLs that have already been let through the pipeline.
        self.scraped_urls = set()

    def process_item(self, item, spider):
        """Normalise ``item['url']`` and reject it if it was seen before.

        Args:
            item: mapping with a ``'url'`` key holding a string.
            spider: the running spider; its ``base_url`` attribute is used
                as the base for resolving relative URLs.

        Returns:
            The same item, with ``item['url']`` replaced by the stripped,
            absolute URL.

        Raises:
            DropItem: if the resulting URL has already been returned once.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import urljoin

        # urljoin leaves absolute urls untouched and resolves relative ones
        # properly (plain concatenation breaks on e.g. page url + '/about').
        url = urljoin(spider.base_url, item['url'].strip())
        item['url'] = url

        if url in self.scraped_urls:
            raise DropItem(f'Duplicate url: \"{url}\"')

        # Remember the url so later occurrences are dropped.
        self.scraped_urls.add(url)
        return item
  1. I didn't add the file since it's not necessary, and it has a lot of lines.
  2. In your code instead of tempbuffer it should be your project's name (you'll need to replace it in the custom_settings for the pipeline).
  3. I added to the rules the domain as the only allowed domain so you won't scrape anything you don't want.
  4. I verified that the duplicates pipeline works, and that it actually creates a csv file with one column.