我的爬虫只返回了目标页面中最后一个元素的值。但我想得到一个包含每个元素值的列表,并把它们分别填充到各自的 item 中。
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy_practice.items import ScrapyPracticeItem
class DmozSpider(scrapy.Spider):
    """Spider that starts from the dmoz.org Python books directory URL.

    NOTE: ``parse`` instantiates a single ``ScrapyPracticeItem`` before the
    loop and reuses it for every matched node, so each pass overwrites the
    same object's fields and the printed list contains repeated references
    to that one object.
    """

    name = 'dmoz'
    allowed_domains = ['www.dmoz.org']
    start_urls = (
        'http://www.dmoz.org/Computers/Programming/Languages/Python/Books/',
    )

    def parse(self, response):
        """Read name/url from every ``title-and-desc`` node and print the list."""
        soup = BeautifulSoup(response.body, 'lxml')
        collected = []
        # One item object is shared by all iterations (see class NOTE).
        shared_item = ScrapyPracticeItem()
        for entry in soup.find_all(class_='title-and-desc'):
            shared_item['name'] = entry.a.div.string
            shared_item['url'] = entry.a['href']
            collected.append(shared_item)
        print(collected)
答案 0 :(得分:0)
因为您只创建了一个 item 对象,并在循环中不断覆盖它的字段;列表里保存的都是对同一个对象的引用,所以最终只能看到最后一次写入的值。
def parse(self, response):
    """Build one ``ScrapyPracticeItem`` per ``title-and-desc`` node.

    Args:
        response: Response handed to the callback; its ``body`` is parsed
            with BeautifulSoup using the ``lxml`` parser.

    Returns:
        A list with a separate item for each matched node, each carrying
        that node's ``name`` and ``url``.
    """
    bs = BeautifulSoup(response.body, 'lxml')
    items = []
    for item in bs.find_all(class_='title-and-desc'):
        # Create a fresh item for every node found. Instantiating it once
        # before the loop was the culprit: every list entry then pointed
        # at the same object, whose fields were overwritten on each pass.
        sp_item = ScrapyPracticeItem()
        sp_item['name'] = item.a.div.string
        sp_item['url'] = item.a['href']
        items.append(sp_item)
    # Hand the items back to the caller instead of discarding the list.
    return items