Scrapy: Debug Redirecting (301)

The 301 just means the request was redirected from http to https, which Scrapy follows automatically, so that is not the problem.

Your XPath expressions are wrong. I fixed the ones in parse, and I fixed 3 XPath expressions in parse_detail as an example, but you need to fix the rest of them yourself.

import scrapy


class OlxSpider(scrapy.Spider):
    """Crawl the rental listing on pe.olx.com.br and yield one dict per ad.

    ``parse`` walks the paginated listing and schedules a request for every
    ad link it finds; ``parse_detail`` extracts the fields of a single ad
    page into a plain dict.
    """

    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    # NOTE(review): this is an http URL; the "Redirecting (301)" debug line
    # seen when running the spider is presumably the site redirecting it to
    # https -- confirm, and consider switching the scheme to skip the hop.
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        """Schedule a detail request per listed ad, then follow pagination.

        Args:
            response: the listing-page response handed over by Scrapy.

        Yields:
            scrapy.Request: one per ad link (handled by ``parse_detail``)
            and, when a "next page" link is present, one for the next
            listing page (handled by ``parse`` again).
        """
        # Debugging aid: uncomment to inspect the response interactively.
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        for ad in response.xpath('//ul[@id="ad-list"]/li'):
            href = ad.xpath('.//a/@href').get()
            if href:
                # scrapy.Request rejects URLs without a scheme; urljoin
                # resolves relative hrefs against the page URL and leaves
                # absolute ones unchanged.
                yield scrapy.Request(
                    url=response.urljoin(href),
                    callback=self.parse_detail,
                )

        next_page = response.xpath(
            '//a[@data-lurker-detail="next_page"]/@href'
        ).get()
        if next_page:
            next_page = response.urljoin(next_page)
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        """Extract the fields of a single ad page.

        Args:
            response: the ad-page response handed over by Scrapy.

        Yields:
            dict: keys ``photos``, ``url``, ``address``, ``title``,
            ``price``, ``details``, ``source_id`` and ``date``. Values
            taken with ``.get()`` are ``None`` when the selector matches
            nothing; ``date`` falls back to ``''``.
        """
        self.log(u'Imóvel URL: {0}'.format(response.url))

        # NOTE(review): the "OLXad-*" class selectors below may not match
        # the current page markup -- verify each one against a live page.
        # Only the first matching image is kept, despite the plural key.
        item = {
            'photos': response.xpath('//img[@class="image "]/@src').get(),
            'url': response.url,
            'address': response.xpath(
                'normalize-space(//div[contains(@class,"OLXad-location")]'
                '//.)'
            ).get(),
            'title': response.xpath('//h1/text()').get(),
            'price': response.xpath('//h2/text()').get(),
            'details': response.xpath(
                'normalize-space(//div[contains(@class,"OLXad-description")]'
                '//.)'
            ).get(),
            'source_id': response.xpath(
                'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
            ).get(),
        }

        # The final unescaped "." matches any one character, so the captured
        # group is the text after the prefix minus its last character.
        date_matches = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re(r"Inserido em: (.*).")
        item['date'] = date_matches[0] if date_matches else ''

        yield item

Scrapy: Debug Redirecting (301)

Related

Recent Posts