:)
I'm scraping with Selenium WebDriver. It works fine when I don't use yield, but when I use yield the following error occurs. What should I do?
Traceback (most recent call last):
  File "c:\programdata\anaconda2\lib\site-packages\scrapy\utils\defer.py", line 45, in mustbe_deferred
    result = f(*args, **kw)
  File "c:\programdata\anaconda2\lib\site-packages\scrapy\core\spidermw.py", line 49, in process_spider_input
    return scrape_func(response, request, spider)
  File "c:\programdata\anaconda2\lib\site-packages\scrapy\core\scraper.py", line 146, in call_spider
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
  File "c:\programdata\anaconda2\lib\site-packages\twisted\internet\defer.py", line 303, in addCallbacks
    assert callable(callback)
Here is the code:
# -*- coding: utf-8 -*-
# imports inferred from usage; they were not shown in the original snippet
import csv
import glob
import os
from time import sleep

import xlsxwriter
from openpyxl import Workbook
from scrapy import Request, Spider
from scrapy.selector import Selector
from selenium import webdriver

workbook = xlsxwriter.Workbook('arrays.xlsx')
worksheet = workbook.add_worksheet()


class LgSpider(Spider):
    name = 'lg'
    allowed_domains = ['naturecollection.co.kr/product/list.jsp?cate_seq=4']

    def start_requests(self):
        reader = csv.reader(open('urls1.csv'))
        for row in reader:
            url = row[0]
            # self.parse_detail(url)
            yield Request(url=url, callback=self.parse_detail(url))

    def parse_detail(self, url):
        self.driver = webdriver.Chrome('/webdrivers/chromedriver')
        self.driver.get(url)
        sleep(10)
        sel = Selector(text=self.driver.page_source)
        # sleep(2)
        # self.logger.info('Sleeping for 2 seconds.')
        response = url
        sub_kor = sel.xpath('//meta[@property="og:title"]/@content').extract()
        sub_en = sel.xpath('//*[@class="section fr"]//*[@class="subTit"]/text()').extract()
        highlight_1 = sel.xpath('//meta[@property="og:description"]/@content').extract()
        main = sel.xpath('//meta[@property="og:image"]/@content').extract()
        category_1 = sel.xpath('//*[@id="locationArea"]/div/a/text()').extract()
        category_2 = sel.xpath('//*[@id="locationArea"]/strong/text()').extract()
        # table = sel.xpath('//*[@id="specInfoLayer"]//td').extract()
        noop_originpirce = sel.xpath('//*[@class="section fr"]//*[@class="realCost"]/text()').extract()
        noop_real_price = sel.xpath('//*[@class="section fr"]//span[@class="cost"]/text()').extract()
        real_price = sel.xpath('//*[@class="colorChip optionList"]//input[@name="cost"]/@value').extract()
        stock_no = sel.xpath('//*[@class="colorChip optionList"]//*[contains(@id, "stock")]/@value').extract()
        options = sel.xpath('//*[@class="colorChip optionList"]//@title').extract()
        brand = sel.xpath('//span[@class="brand"]/text()').extract_first()
        rating = sel.xpath('//*[@class="starArea"]/span/text()').extract()
        description = sel.xpath('//*[@id="proExplain"]//p').extract()
        image_urls = sel.xpath('//*[@class="thumList"]/li/a/img/@src').extract()
        volume = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[1]/td/text()').extract()
        skin_type = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[2]').extract()
        expire_date = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[3]').extract()
        method = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[4]').extract()
        manufature = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[5]').extract()
        ingridient = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[6]').extract()
        for idx, option in enumerate(options):
            yield {'Option': options[idx],
                   # 'A': a,
                   'Volume': volume,
                   'Skin_type': skin_type,
                   'Expire_date': expire_date,
                   'Method': method,
                   'Manufature': manufature,
                   'Url': url,
                   'Sub_kor': sub_kor,
                   'Sub_en': sub_en,
                   'Highlight': highlight_1,
                   'Noop_Origin_price': noop_originpirce,
                   'Noop_real_price': noop_real_price,
                   'Real_price': real_price[idx],
                   'Category_1': category_1,
                   'Category_2': category_2,
                   # 'Category_3': category_3,
                   # 'Category_4': category_4,
                   'Stock_no': stock_no,
                   'Description': description,
                   'Rating': rating,
                   'Ingridient': ingridient,
                   'Brand': brand,
                   # 'Ingridient_text': ingridient_text,
                   'Image_urls': image_urls,
                   # 'Table_dts': table_dts,
                   # 'Table_dds': table_dds,
                   # 'Options': options[idx],
                   # 'Brand': brand,
                   # 'Table' : table,
                   # 'Buyer_no': buyer_no,
                   # 'Repurchase' : repurchase,
                   'Main': main
                   }

    def close(self, reason):
        # pass
        csv_file = max(glob.iglob('*.csv'), key=os.path.getctime)
        wb = Workbook()
        ws = wb.active
        with open(csv_file, 'r') as f:
            for row in csv.reader(f):
                # row = row.encode('utf-8')
                try:
                    ws.append(row)
                except:
                    continue
        wb.save(csv_file.replace('.csv', '') + '.xlsx')
Thank you!!!!
Answer 0 (score: 0)
I'm pretty sure that when you use yield, you need to iterate over the generator you created. Here the generator would be start_requests, which yields one request per row from your reader. So you need to store your generator and then iterate over it. See What does the "yield" keyword do? (python yield).
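As a side illustration of the point above (the function and data here are made up for the example, not taken from the question): calling a function that contains yield does not run its body; it returns a generator object, which is not callable and only produces values when iterated.

def read_rows():
    # a stand-in for reading urls1.csv
    for row in [['url1'], ['url2']]:
        yield row[0]

gen = read_rows()      # nothing has run yet; gen is a generator object
print(callable(gen))   # False -- this is what assert callable(callback) trips over
for url in gen:        # the body only executes while iterating
    print(url)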
Answer 1 (score: 0)
The error is telling you that something is wrong with your callback. You should remove the second url, i.e. the call inside callback= (pass the method itself instead of calling it), so your code would look like this:

yield Request(url=url, callback=self.parse_detail)

Also, I would use the response object instead of a Selector. Your code would then look like this:
def parse_detail(self, response):
    sub_kor = response.xpath('//meta....')
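Putting both suggestions together, a minimal sketch of the corrected spider might look like the following (the field names are taken from the question; it assumes the page content is present in the plain HTTP response -- if JavaScript rendering is required, the Selenium step from the original parse_detail would still have to run inside this callback):

import csv

from scrapy import Request, Spider


class LgSpider(Spider):
    name = 'lg'

    def start_requests(self):
        with open('urls1.csv') as f:
            for row in csv.reader(f):
                # pass the method itself; Scrapy will call it later with the response
                yield Request(url=row[0], callback=self.parse_detail)

    def parse_detail(self, response):
        # the downloaded page is already wrapped in `response`, so it can be
        # queried directly without building a separate Selector
        yield {
            'Url': response.url,
            'Sub_kor': response.xpath('//meta[@property="og:title"]/@content').extract(),
            'Brand': response.xpath('//span[@class="brand"]/text()').extract_first(),
        }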