Scrapy: Debug Redirecting (301)
It's just redirected from http
to https
so there's no problem there.
Your xpath is completely wrong. I fixed it in parse
, and I fixed 3 xpaths in parse_detail
as an example, but you need to fix the rest of them.
import scrapy
class OlxSpider(scrapy.Spider):
name = "olx"
allowed_domains = ["pe.olx.com.br"]
start_urls = (
'http://pe.olx.com.br/imoveis/aluguel',
)
def parse(self, response):
# from scrapy.shell import inspect_response
# inspect_response(response, self)
items = response.xpath('//ul[@id="ad-list"]/li')
for item in items:
url = item.xpath('.//a/@href').get()
if url:
yield scrapy.Request(url=url, callback=self.parse_detail)
next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
if next_page:
self.log('Next Page: {0}'.format(next_page))
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_detail(self, response):
self.log(u'Imóvel URL: {0}'.format(response.url))
item = {}
item['photos'] = response.xpath('//img[@class="image "]/@src').get()
item['url'] = response.url
item['address'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-location")]'
'//.)'
).extract_first()
item['title'] = response.xpath('//h1/text()').get()
item['price'] = response.xpath('//h2/text()').get()
item['details'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-description")]'
'//.)'
).extract_first()
item['source_id'] = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
).extract_first()
date = response.xpath(
'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
).re("Inserido em: (.*).")
item['date'] = (date and date[0]) or ''
yield item