我正在使用 Python 3 和最新版的 Scrapy。当我把一些 CSV 数据传递给另一个没有任何 URL 的回调函数时，它报错：TypeError: Request url must be str or unicode, got NoneType:
我的问题是如何在没有任何URL的情况下传递数据?
...
"extractor" :
{ "xml":
{
"rootNode": "CATALOG.CD",
"tagsAsAttribute": ["CATALOG.CD"]
}
},
...
答案 0（得分：0）
您在这一行中把 None 作为第一个参数传了进去。
yield scrapy.Request(None, meta=items, dont_filter=True, callback=self.parse_it)
请在此处传入真正的 URL！
答案 1（得分：0）
我遇到过类似的问题，通过组合几种不同的方案解决了它。您需要一个自定义的下载中间件，阻止请求真正进入下载器，并直接把您已有的数据作为响应返回。
# -*- coding: utf-8 -*-
import scrapy
import csv
import json
class AppsSpider(scrapy.Spider):
    """Spider that pushes rows of a local CSV through the normal Scrapy
    request/response cycle without touching the network.

    Each row travels in ``request.meta``; the companion
    ``AppsSpiderDownloaderMiddleware`` short-circuits the downloader and
    echoes the meta back as a JSON response body, which ``parse_it``
    then filters and yields as items.
    """

    # FIX: Scrapy requires every spider to declare a unique `name`;
    # without it the crawler raises ValueError when instantiating
    # the spider.
    name = "apps"

    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {"yourproject.middlewares.AppsSpiderDownloaderMiddleware": 1},
    }

    def start_requests(self):
        """Read data.csv and yield one placeholder request per row.

        The row's fields are carried in ``meta``; no real download
        happens because the middleware answers every request itself.
        """
        with open('data.csv', mode='r', encoding='utf-8') as file:
            for row in csv.reader(file):
                # Assumes each row has at least 6 columns in this order:
                # url, title, developer, price, desc, rating.
                items = {
                    'url': row[0],
                    'title': row[1],
                    'developer': row[2],
                    'price': row[3],
                    'desc': row[4],
                    'rating': row[5],
                }
                # Scrapy validates that the URL has a scheme, so a dummy
                # "http://" placeholder is required even though the
                # middleware never actually fetches it.
                yield scrapy.Request(url="http://", meta=items, dont_filter=True, callback=self.parse_it)

    def parse_it(self, response):
        """Decode the JSON body produced by the middleware and yield
        only the apps whose rating is above 4."""
        data = json.loads(response.text)
        if int(data['rating']) > 4:
            yield {
                'url': data['url'],
                'title': data['title'],
                'developer': data['developer'],
                'price': data['price'],
                'desc': data['desc'],
                'rating': data['rating'],
            }
AppsSpiderDownloaderMiddleware.py
import json
from scrapy import signals
from scrapy.http import TextResponse
class AppsSpiderDownloaderMiddleware:
    """Downloader middleware that never performs a real download.

    Every outgoing request is answered immediately with a synthetic
    TextResponse whose body is the request's ``meta`` dict serialized
    as JSON, so the spider callback consumes its own data.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory: build the middleware and hook the
        spider_opened signal for logging."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Short-circuit the downloader: echo ``request.meta`` back as a
        JSON-encoded 200 response."""
        payload = json.dumps(request.meta).encode('UTF-8')
        return TextResponse(url=request.url, status=200, body=payload, request=request)

    def process_response(self, request, response, spider):
        """Pass every response through unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Returning None lets Scrapy apply its default exception
        handling."""
        return None

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)