我想解析一个网页中的表格,并把其中的部分数据导出到 CSV 文件中。
我不确定应该如何把两个 XPath 表达式与一个 for 循环结合起来(还是说应该使用两个 for 循环?)。
当前蜘蛛:
["column_account" => $this->account]
我知道这段代码还需要改进,只是不确定该往哪个方向着手......?
这是输出:
可以看到,日期字段没有被填充。
以下是解析日期的html:
# Spider listing from the question, kept exactly as posted. Its indentation
# was lost, so it is not runnable as shown; the notes below only describe
# what the visible statements do.
class MySpider(BaseSpider):
symbols = ["SCMP"]
name = "dozen"
allowed_domains = ["yahoo.com"]
start_urls = ["http://finance.yahoo.com/q/is?s=SCMP&annual"]
def parse(self, response):
# NOTE(review): hxs is assigned but never used below.
hxs = HtmlXPathSelector(response)
# Text nodes of <strong> elements inside right-aligned <td> cells.
revenue = response.xpath('//td[@align="right"]/strong/text()')
# Text nodes of <th> cells in rows with class "yfnc_modtitle1".
# NOTE(review): bound to `date`, but the second loop iterates `dates`,
# a name that is never assigned in this listing.
date = response.xpath('//tr[@class="yfnc_modtitle1"]/th/text()')
items = []
# First pass: one item per revenue node, with only "Revenue" set.
for rev in revenue:
item = DozenItem()
item["Revenue"] = rev.re('\d*,\d*')
items.append(item)
# NOTE(review): presumably this return sits at method level, which would
# make everything after it unreachable - confirm against the original post.
return items[:3]
days = []
# Second pass: builds separate items with only "Date" set, so a date is
# never stored on the same item as a revenue value.
for day in dates:
item = DozenItem()
item["Date"] = day.re('\d*')
days.append(item)
# Returns `items` again; the `days` list built above is never returned.
return items[:3]
答案 0 :(得分:6)
# Walk both selector lists in lockstep so each revenue node is paired with
# the date node at the same position; zip() stops at the shorter input.
for rev, day in zip(revenue, dates):
    pass  # code here
答案 1 :(得分:3)
class MySpider(BaseSpider):
    """Spider that pairs revenue values with dates scraped from one page.

    Both value lists are selected from the same response and combined
    position by position, so every emitted item carries a "Revenue" and a
    "Date" field together.
    """

    symbols = ["SCMP"]
    name = "dozen"
    allowed_domains = ["yahoo.com"]
    start_urls = ["http://finance.yahoo.com/q/is?s=SCMP&annual"]

    def parse(self, response):
        """Build one DozenItem per (revenue, date) pair in ``response``.

        Args:
            response: The downloaded page; only its ``xpath`` method is used.

        Returns:
            A list holding at most the first three items. Each item's
            "Revenue" and "Date" fields are whatever ``.re()`` returns for
            the matching text node.
        """
        # Text nodes of <strong> elements inside right-aligned <td> cells.
        revenue = response.xpath('//td[@align="right"]/strong/text()')
        # Text nodes of <th> cells in rows with class "yfnc_modtitle1".
        # The name must be `dates`: the loop below iterates it.
        dates = response.xpath('//tr[@class="yfnc_modtitle1"]/th/text()')

        items = []
        # zip() pairs nodes by position and stops at the shorter list.
        for rev, day in zip(revenue, dates):
            item = DozenItem()
            item["Revenue"] = rev.re(r'\d*,\d*')
            # NOTE(review): `\d*` can also match the empty string; confirm
            # the extracted values against the real header text.
            item["Date"] = day.re(r'\d*')
            items.append(item)
        return items[:3]