from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from sample3.items import taamaaItem
class taamaaSpider(BaseSpider):
name = "taamaa"
allowed_domains = ["taamaa.com"]
start_urls = [
"http://www.taamaa.com/store-directory/"]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div/div[@class="section clearfix col-md-12"]')
items = []
list1 = []
list2 = []
for site in sites:
list1 = sites[0].xpath('//div[@class="pull-left col-md-3 merchant"]/div[@class="name"]/a/text()').extract()
list2 = sites[0].xpath('//div[@class="pull-left col-md-3 merchant"]/div[@class="url"]/a/text()').extract()
for index in range(len(list2)):
td = taamaaItem()
td['name'] = list1[index]
td['link'] = list2[index]
items.append(td)
return items
在提取数据时,它会留下空白值并获取下一个链接值,从而无法修正我的数据对齐。
实施例 如果 A = a,B =,C = c,D = d,E = e
它获取输出 A = a,B = c,C = d,D = e,E = a
我希望输出像这样
A = a,B =,C = c,D = d,E = e
我怎样才能做到这一点。
答案 0 :(得分:1)
我看到两件奇怪的事情:
sites[0]
对于使用一些空文本元素对2个列表进行分组的问题,您可以在sites
上使用相同的结构,但在每次迭代中提取name
和link
,
所以你不需要中间名单
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from sample3.items import taamaaItem
class taamaaSpider(BaseSpider):
name = "taamaa"
allowed_domains = ["taamaa.com"]
start_urls = [
"http://www.taamaa.com/store-directory/"]
def parse(self, response):
sel = Selector(response)
sites = sel.xpath('//div/div[@class="section clearfix col-md-12"]')
items = []
for site in sites:
td = taamaaItem()
td['name'] = site.xpath("""
.//div[@class="pull-left col-md-3 merchant"]
/div[@class="name"]/a/text()""").extract()
td['link'] = site.xpath("""
.//div[@class="pull-left col-md-3 merchant"]
/div[@class="url"]/a/text()""").extract()
items.append(td)
return items
了解我如何使用相对XPath表达式(.//div......
)