from scrapy.spiders import Spider
from tutorial.items import Table
from scrapy.selector import Selector
class GoalSpider(Spider):
    """Spider that collects table-cell text from a numberFire projections page.

    For the start URL it writes the text of every ``<td>`` to ``stats.txt``
    (cells separated by ``' || '``) and yields one ``Table`` item per cell.
    """

    name = "stats"
    allowed_domains = ["numberfire.com"]
    start_urls = (
        'http://www.numberfire.com/nba/fantasy/full-fantasy-basketball-projections',
    )

    def parse(self, response):
        """Dump all ``<td>`` text nodes to ``stats.txt`` and yield them as items.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            Table: One item per text node found directly inside a ``<td>``.
            Both the ``tal`` and ``sep`` fields hold a one-element list
            containing that node's text.
        """
        # NOTE(review): this page has been observed to return cells such as
        # "${pts}" or "${opponent.abbrev}". That suggests the table in the
        # static HTML is only a client-side template and the real values are
        # filled in by JavaScript -- confirm by viewing the raw page source.
        # If so, no XPath over <td> can return the numbers; the data has to be
        # read from wherever the page embeds or fetches it.

        # Extract once and reuse the plain strings for both the file dump and
        # the yielded items.
        cell_texts = response.xpath('//td/text()').extract()

        # The file is opened in binary mode, so encode explicitly; writing
        # text directly fails on Python 3 and on non-ASCII cells on Python 2.
        with open("stats.txt", 'wb') as f:
            for text in cell_texts:
                f.write((text + ' || ').encode('utf-8'))

        for text in cell_texts:
            my_item = Table()
            # The nodes selected above are already text nodes; running a
            # relative './/text()' query on them matches nothing, so the text
            # is stored directly (kept as a list to preserve the field shape).
            my_item['tal'] = [text]
            my_item['sep'] = [text]
            yield my_item
您好,上面的代码本应访问该网站,逐行抓取表格数据,并将其以纯文本形式写入文件。遗憾的是,不知道为什么,当我运行爬虫时,得到的值只是:
${opponent.abbrev} || #${opponent_rank} || ${minutes} || ${pts} || ${fgm}-${fga} || ${ftm}-${fta} || ${p3m}-${p3a} || ${treb} || ${ast} || ${stl} || ${blk} || ${tov} || ${pf} || ${fp} || $${salary} || ${ratio} ||
我相信我对数据的分隔是正确的,但我不明白爬虫为什么会抓取到这样的表格数据:行本身是对的,但内容并不是实际的数据。看起来抓取到的是表格的代码,而不是实际的数据本身。