python - tool - los elementos de scrapy no son serializables JSON mientras los guardan en couchdb
scrapy python documentation (1)
items.py classes
import scrapy
from scrapy.item import Item, Field
import json
class Attributes(scrapy.Item):
description = Field()
pages=Field()
author=Field()
class Vendor(scrapy.Item):
title=Field()
order_url=Field()
class bookItem(scrapy.Item):
title = Field()
url = Field()
marketprice=Field()
images=Field()
price=Field()
attributes=Field()
vendor=Field()
time_scraped=Field()
mi scrapper
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from scrapy.spider import BaseSpider
from scrapy import log
from scrapper.items import bookItem,Attributes,Vendor
import couchdb
import logging
import json
import time
from couchdb import Server
class libertySpider(CrawlSpider):
couch = couchdb.Server()
db = couch[''python-tests'']
name = "libertybooks"
allowed_domains = ["libertybooks.com"]
unvisited_urls = []
visited_urls = []
start_urls = [
"http://www.libertybooks.com"
]
url=["http://www.kaymu.pk"]
rules = [Rule(SgmlLinkExtractor(), callback=''parse_item'', follow=True)]
total=0
productpages=0
exceptionnum=0
def parse_item(self,response):
if response.url.find("pid")!=-1:
with open("number.html","w") as w:
self.total=self.total+1
w.write(str(self.total)+","+str(self.productpages))
itm=bookItem()
attrib=Attributes()
ven=Vendor()
images=[]
try:
name=response.xpath(''//span[@id="pagecontent_lblbookName"]/text()'').extract()[0]
name=name.encode(''utf-8'')
except:
name="name not found"
try:
price=response.xpath(''//span[@id="pagecontent_lblPrice"]/text()'').extract()[0]
price=price.encode(''utf-8'')
except:
price=-1
try:
marketprice=response.xpath(''//span[@id="pagecontent_lblmarketprice"]/text()'').extract()[0]
marketprice=marketprice.encode(''utf-8'')
except:
marketprice=-1
try:
pages=response.xpath(''//span[@id="pagecontent_spanpages"]/text()'').extract()[0]
pages=pages.encode(''utf-8'')
except:
pages=-1
try:
author=response.xpath(''//span[@id="pagecontent_lblAuthor"]/text()'').extract()[0]
author=author.encode(''utf-8'')
except:
author="author not found"
try:
description=response.xpath(''//span[@id="pagecontent_lblbookdetail"]/text()'').extract()[0]
description=description.encode(''utf-8'')
except:
description="des: not found"
try:
image=response.xpath(''//img[@id="pagecontent_imgProduct"]/@src'').extract()[0]
image=image.encode(''utf-8'')
except:
image="#"
ven[''title'']=''libertybooks''
ven[''order_url'']=response.url
itm[''vendor'']=ven
itm[''time_scraped'']=time.ctime()
itm[''title'']=name
itm[''url'']=response.url
itm[''price'']=price
itm[''marketprice'']=marketprice
itm[''images'']=images
attrib[''pages'']=pages
attrib[''author'']=author
attrib[''description'']=description
itm[''attributes'']=attrib
self.saveindb(itm)
return itm
def saveindb(self,obj):
logging.debug(obj)
self.db.save(obj)
Seguimiento de pila
2014-12-09 13:57:37-0800 [libertybooks] ERROR: Spider error processing <GET http://www.libertybooks.com/bookdetail.aspx?pid=16532>
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spiders/crawl.py", line 67, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
File "/home/asad/Desktop/scrapper/scrapper/spiders/liberty_spider.py", line 107, in parse_item
self.saveindb(itm)
File "/home/asad/Desktop/scrapper/scrapper/spiders/liberty_spider.py", line 112, in saveindb
self.db.save(obj)
File "/usr/local/lib/python2.7/dist-packages/couchdb/client.py", line 431, in save
_, _, data = func(body=doc, **options)
File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 514, in post_json
**params)
File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 533, in _request_json
headers=headers, **params)
File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 529, in _request
credentials=self.credentials)
File "/usr/local/lib/python2.7/dist-packages/couchdb/http.py", line 244, in request
body = json.encode(body).encode(''utf-8'')
File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 69, in encode
return _encode(obj)
File "/usr/local/lib/python2.7/dist-packages/couchdb/json.py", line 135, in <lambda>
dumps(obj, allow_nan=False, ensure_ascii=False)
File "/usr/lib/python2.7/json/__init__.py", line 250, in dumps
sort_keys=sort_keys, **kw).encode(obj)
File "/usr/lib/python2.7/json/encoder.py", line 207, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/lib/python2.7/json/encoder.py", line 270, in iterencode
return _iterencode(o, 0)
File "/usr/lib/python2.7/json/encoder.py", line 184, in default
raise TypeError(repr(o) + " is not JSON serializable")
exceptions.TypeError: {''attributes'': {''author'': ''Tina Fey'',
''description'': "Once in a generation a woman comes along who changes everything. Tina Fey is not that woman, but she met that woman once and acted weird around her./r/n/r/nBefore 30 Rock, Mean Girls and ''Sarah Palin'', Tina Fey was just a young girl with a dream: a recurring stress dream that she was being chased through a local airport by her middle-school gym teacher./r/n/r/nShe also had a dream that one day she would be a comedian on TV. She has seen both these dreams come true./r/n/r/nAt last, Tina Fey''s story can be told. From her youthful days as a vicious nerd to her tour of duty on Saturday Night Live; from her passionately halfhearted pursuit of physical beauty to her life as a mother eating things off the floor; from her one-sided college romance to her nearly fatal honeymoon - from the beginning of this paragraph to this final sentence./r/n/r/nTina Fey reveals all, and proves what we''ve all suspected: you''re no one until someone calls you bossy.",
''pages'': ''304 Pages''},
''images'': [],
''marketprice'': ''1,095'',
''price'': ''986'',
''time_scraped'': ''Tue Dec 9 13:57:37 2014'',
''title'': ''Bossypants'',
''url'': ''http://www.libertybooks.com/bookdetail.aspx?pid=16532'',
''vendor'': {''order_url'': ''http://www.libertybooks.com/bookdetail.aspx?pid=16532'',
''title'': ''libertybooks''}} is not JSON serializable
Soy un principiante de scrapy y couchdb, también he intentado convertir el objeto de objeto al objeto json utilizando "json.dumps (itm, default = lambda o: o. dict , sort_keys = True, indent = 4)" pero obtuve la misma respuesta, así que díganme, por favor, ¿hay alguna manera de hacer que mi clase sea serializable para que puedan almacenarse en el sofá?
Bueno, la respuesta más corta es solo usar ScrapyJSONEncoder :
from scrapy.utils.serialize import ScrapyJSONEncoder
_encoder = ScrapyJSONEncoder()
...
def saveindb(self,obj):
logging.debug(obj)
self.db.save(_encoder.encode(obj))
La versión más larga es: si tu intención es que esta araña crezca (si no se supone que sea una cosa única), puedes usar una tubería para almacenar los artículos en CouchDB y mantener las preocupaciones separadas (gatear / raspar en araña) código, almacenar en la base de datos en el código de la tubería).
Esto puede parecer una sobreingeniería al principio, pero realmente ayuda cuando un proyecto comienza a crecer y facilita las pruebas.