我是 Python 编程和 Scrapy 的新手。我已经设置好了我的爬虫,在我想弄清楚如何下载图像之前,它一直工作正常。我得到的错误是"无法导入名称 NsiscrapePipeline"(cannot import name NsiscrapePipeline)。我不知道自己哪里做错了;因为我刚入门,有些文件我还不太理解。请帮忙。
Items 文件(items.py)
from scrapy.item import Item, Field
class NsiscrapeItem(Item):
    """One inventory-table row scraped by the Nsiscrape spider.

    Each attribute below is a Scrapy ``Field``; the spider fills them with
    the lists returned by ``extract()``.
    """

    location = Field()
    stock_number = Field()
    year = Field()
    manufacturer = Field()
    model = Field()
    length = Field()
    price = Field()
    status = Field()
    url = Field()
    # The spider assigns item['image_urls'] and item['images']; a Scrapy
    # Item raises KeyError for any key that is not declared as a Field.
    # Scrapy's images pipeline reads ``image_urls`` and stores its download
    # results in ``images``.
    image_urls = Field()
    images = Field()
爬虫(Spider)文件
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from NSIscrape.items import NsiscrapeItem
from scrapy.http import Request
from scrapy.contrib.pipeline.images import NsiscrapePipeline
import Image
class NsiscrapeSpider(BaseSpider):
    """Spider that walks the yachtauctions.com inventory table row by row.

    ``parse`` builds one ``NsiscrapeItem`` per table row that links to a
    detail page and schedules a request for that page;
    ``product_detail_page`` receives the item back through the request
    ``meta`` and yields it.
    """

    name = "Nsiscrape"
    # Scrapy reads the plural attribute name; the singular spelling is
    # silently ignored, which leaves off-site filtering disabled.
    allowed_domains = ["yachtauctions.com"]
    start_urls = [
        "http://www.yachtauctions.com/inventory/"
    ]

    def parse(self, response):
        """Yield one detail-page Request per inventory row.

        The partially filled item travels with the request in
        ``meta['item']``.
        """
        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//tr'):
            detail_urls = site.select('td[1]/a/@href').extract()
            if not detail_urls:
                # Rows without a link in the first cell (e.g. a header row)
                # have no detail page; indexing [0] would raise IndexError.
                continue

            item = NsiscrapeItem()
            item['location'] = site.select('td[2]/text()').extract()
            item['stock_number'] = site.select('td[3]/a/text()').extract()
            item['year'] = site.select('td[4]/text()').extract()
            item['manufacturer'] = site.select('td[5]/text()').extract()
            item['model'] = site.select('td[6]/text()').extract()
            item['length'] = site.select('td[7]/text()').extract()
            item['price'] = site.select('td[8]/text()').extract()
            item['status'] = site.select('td[10]/img/@src').extract()
            item['url'] = detail_urls
            # Only the URLs are collected here. The ``images`` field is left
            # for the images pipeline to fill with its download results.
            item['image_urls'] = site.select('td/a[3]/img/@data-original').extract()

            # NOTE(review): assumes the href is an absolute URL; Request
            # rejects URLs without a scheme -- TODO confirm against the page.
            yield Request(
                detail_urls[0],
                meta={'item': item},
                callback=self.product_detail_page,
            )

    def product_detail_page(self, response):
        """Yield the item that ``parse`` attached to this request."""
        item = response.request.meta['item']
        # TODO: add all image URLs found on the detail page to
        # item['image_urls'] before yielding.
        yield item
设置
# Enable Scrapy's built-in images pipeline. The module is ``images`` (plural)
# and the class is ``ImagesPipeline``; there is no ``NsiscrapePipeline`` in
# Scrapy itself.
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
# Raw string so the backslashes of the Windows path are never interpreted as
# escape sequences (``\N`` is a syntax error in Python 3 string literals).
IMAGES_STORE = r'c:\Python27\NSIscrape\IMG'
# Per the Scrapy documentation this is an expiry period in days.
IMAGES_EXPIRES = 90
管道(Pipeline)文件——这是我不确定自己是否遗漏了某些东西的地方
from scrapy.item import Item
class NsiscrapePipeline(object):
    """Pass-through item pipeline.

    An item pipeline is a plain class exposing ``process_item``; it is not
    an ``Item`` and declares no fields. The original body used ``Field``
    without importing it, which fails with NameError when the class is
    created. Field declarations such as ``image_urls`` and ``images`` belong
    on the item class.
    """

    def process_item(self, item, spider):
        """Return ``item`` unchanged so later pipelines still receive it."""
        return item
错误
File "NSIscrape\spiders\NSI_Spider.py", line 9, in <module>
from scrapy.contrib.pipeline.images import NsiscrapePipeline
ImportError: cannot import name NsiscrapePipeline
答案 0 :(得分:0)
这个类并不是该库的一部分 :) ——至少从他们当前的 master 分支来看是这样
我认为你正在寻找ImagesPipeline
他们的例子可能有所帮助! example
P.S. 我不认为你需要自定义这个类的名字——至少 Scrapy 的设计并不要求这样做;我有理由相信你应该直接使用他们提供的类 ;)
答案 1 :(得分:0)
您尝试传递列表,但此函数仅接受字符串。仅从列表中传递一个元素(例如list [0])。
答案 2 :(得分:0)
这是我的最终代码。有两个问题
1:我在 XPath 查询中漏掉了开头的第二个斜杠 -> //td[1]/a[3]/img/@data-original
2:我必须检查图像显示的完整URL并将它们连接在一起,这是主URL或允许的URL和图像URL。
def parse(self, response):
    """Return one NsiscrapeItem per table row with absolute image URLs.

    Each item's ``image_urls`` holds the ``data-original`` attribute values
    found in that row, prefixed with the site root.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    for row in hxs.select('//tr'):
        item = NsiscrapeItem()
        # The leading "." keeps the query relative to the current row. An
        # XPath starting with "//" is evaluated against the whole document,
        # so every row would receive every image on the page.
        item['image_urls'] = [
            "http://www.yachtauctions.com" + src
            for src in row.select('.//td[1]/a[3]/img/@data-original').extract()
        ]
        items.append(item)
    return items