Problem description:

I have successfully merged Django and Scrapy, and want to persist my item objects into the database.

Saving works fine, but not all of the elements are saved.

I'm pretty new to Python, Scrapy, and Django, and I figure I'm missing something, but I can't solve it.
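
For context, since item.save() works, DatacrowdItem is presumably a DjangoItem backed by a Django model, roughly like this simplified sketch (the model name Investment and the field types are assumptions for illustration; the field names match the spider's add_value() calls):

# items.py -- simplified sketch, not the actual definitions
from django.db import models
from scrapy.contrib.djangoitem import DjangoItem

class Investment(models.Model):
    url = models.URLField()
    someVal = models.CharField(max_length=255)   # assumed type
    name = models.CharField(max_length=255)      # assumed type
    nbInvestors = models.IntegerField(default=0)

class DatacrowdItem(DjangoItem):
    # save() on a loaded DatacrowdItem persists an Investment row
    django_model = Investment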

Here is my spider code:

from scrapy.http import FormRequest, Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy import log
from scrapy.contrib.loader import XPathItemLoader

from datacrowdscrapy.items import DatacrowdItem


class DatacrowdSpider(BaseSpider):
    name = 'datacrowd'
    start_urls = ['https://www.exemple.com/login']

    def parse(self, response):
        # Submit the login form
        parsed = [FormRequest.from_response(
            response,
            formdata={
                'login': '[email protected]',
                'password': 'password'
            },
            callback=self.after_login)]
        return parsed

    def after_login(self, response):
        # Check login succeeded before going on
        if "authentication failed" in response.body:
            self.log("Login failed", level=log.ERROR)
            return

        selector = HtmlXPathSelector(response)
        investmentsLinks = selector.select('//a[contains(@class, "myClass")]/@href').extract()
        for link in investmentsLinks:
            # One loader per link, passed along through the request meta
            curDatacrowdItem = XPathItemLoader(item=DatacrowdItem(), response=response)
            curDatacrowdItem.add_value('url', link)
            curRequest = Request(url=link, callback=self.parse_investments,
                                 meta={'item': curDatacrowdItem})
            yield curRequest

    def parse_investments(self, response):
        selector = HtmlXPathSelector(response)
        curDatacrowdItem = response.meta['item']

        # Details
        details = selector.select('//td/div[contains(@class, "myClass")]/text()').extract()
        curDatacrowdItem.add_value('someVal', details[0].strip())
        # ...

        # Get nbInvestors
        investorLink = selector.select('//ul[contains(@id, "myId")]/li/@onclick').re(r'window.location.href=\'(http.+/data.+)\'')
        curRequest = Request(url=investorLink[0], callback=self.parse_investors,
                             meta={'item': curDatacrowdItem})
        yield curRequest

        # Get last company details
        detailsLink = selector.select('//ul[contains(@id, "myData")]/li/@onclick').re(r'window.location.href=\'(http.+/company-details.+)\'')
        curRequest = Request(url=detailsLink[0], callback=self.parse_details,
                             meta={'item': curDatacrowdItem})
        yield curRequest

    def parse_investors(self, response):
        selector = HtmlXPathSelector(response)
        curDatacrowdItem = response.meta['item']
        nbInvestors = len(selector.select('//ul/li[contains(@class, "myClass")]'))
        curDatacrowdItem.add_value('nbInvestors', nbInvestors)
        return curDatacrowdItem

    def parse_details(self, response):
        selector = HtmlXPathSelector(response)
        curDatacrowdItem = response.meta['item']

        # Company name
        name = selector.select('//div[contains(@class, "myClass")]/h2/text()').extract()
        curDatacrowdItem.add_value('name', name[0].strip())

        item = curDatacrowdItem.load_item()
        item.save()  # Here I'm persisting the data
        return item

I get an error log like this:

[datacrowd] ERROR: Spider must return Request, BaseItem or None, got 'XPathItemLoader' in <GET http://www.exemple.com/url/slug>

Any idea what I'm doing wrong?

Cheers,

Snite

Answer:

Simply because you are yielding an XPathItemLoader and not an Item.

In your after_login method, you're putting an XPathItemLoader object into the meta, which you then try to yield later.

Use the load_item() method to pass the actual item instead:

meta={'item': curDatacrowdItem.load_item()}
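
For example, here is a minimal sketch of that change (one caveat: once meta carries a loaded DatacrowdItem instead of a loader, the later callbacks can no longer call add_value(); items are dict-like, so they assign fields directly):

def after_login(self, response):
    # ... login check as before ...
    selector = HtmlXPathSelector(response)
    links = selector.select('//a[contains(@class, "myClass")]/@href').extract()
    for link in links:
        loader = XPathItemLoader(item=DatacrowdItem(), response=response)
        loader.add_value('url', link)
        # Pass the loaded item, not the loader, through meta
        yield Request(url=link, callback=self.parse_investments,
                      meta={'item': loader.load_item()})

def parse_investors(self, response):
    selector = HtmlXPathSelector(response)
    item = response.meta['item']  # now a DatacrowdItem, not a loader
    # Items are dict-like, so set the field directly
    item['nbInvestors'] = len(selector.select('//ul/li[contains(@class, "myClass")]'))
    return item  # a real item, so Scrapy accepts it

parse_investments and parse_details would be adapted the same way; parse_details can then drop its own load_item() call and just call item.save() before returning the item.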

You should also rename your variables (curDatacrowdItem is actually a loader, not an item) to avoid mistakes like this :)
