scrapy - selectors - Why doesn't my CrawlerProcess have the "crawl" function?
There is (or was?) a compatibility problem between Scrapy and Scrapyd: I needed to run Scrapy 0.24 and Scrapyd 1.0.1. Here is the issue on GitHub: https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880
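When chasing a version mismatch like this, it helps to print the Scrapy release that the script is actually importing, since it may differ from the one a `scrapy` command on your PATH uses. A minimal check (the printed value is illustrative):

import scrapy
print(scrapy.__version__)  # e.g. '0.24.6' - whichever release the import resolved to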
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import ExampleItem
from scrapy.shell import inspect_response
import re
import time
import sys

class MySpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['www.example.com']
    # Set last_page to decide how many pages are crawled
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page
                  for page in xrange(1, last_page)]
    rules = (
        # Follow all links inside <div class="cat"> and call parse_item on each one
        Rule(LinkExtractor(restrict_xpaths=('//a[@name="listing_link"]')),
             callback='parse_item'),
    )

    # Extract the relevant text from the page into an ExampleItem
    def parse_item(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()  # .re("(?<=Posted/s*).*")
        item['crawled_date'] = time.strftime("%c")
        # not sure how to get the other image urls right now
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # I can't find this section on any pages right now
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        # uncomment for debugging
        # inspect_response(response, self)
        return item

# process1 = CrawlerProcess({
#     'ITEM_PIPELINES': {
#         # 'scrapy.contrib.pipeline.images.ImagesPipeline': 1,
#         'backpage.pipelines.GeolocationPipeline': 4,
#         'backpage.pipelines.LocationExtractionPipeline': 3,
#         'backpage.pipelines.BackpagePipeline': 5,
#     }
# })

process1 = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process1.crawl(MySpider)
process1.start()
My spider works perfectly when I run it from the command line with

scrapy crawl example

but I will need to run several spiders, so I want to put them all in one script and use CrawlerProcess. When I try to run this I get the error:

AttributeError: 'CrawlerProcess' object has no attribute 'crawl'

This is Scrapy version 0.24.6. All the items and pipelines are correct, because the spider works from the command line.
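For context, the process.crawl(MySpider) pattern used in the script above matches the API documented for Scrapy 1.0+. Under a release where CrawlerProcess does have crawl, queuing several spiders in one script looks roughly like the sketch below (SpiderOne and SpiderTwo are hypothetical placeholders, not classes from the question):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
})
# Each crawl() call only queues a spider class; nothing runs yet.
process.crawl(SpiderOne)  # hypothetical spider class
process.crawl(SpiderTwo)  # hypothetical spider class
process.start()           # starts the reactor; blocks until every queued spider finishes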