我的代码执行以下操作:
我似乎成功地做到了这一点。但是,输出有问题:每只股票会被写出两行——一行只有 finviz 的信息(如 P/E、marketcap),另一行是新抓取的、合并了 finviz + yahoo 的信息(我只想要后者)。我不知道为什么两者都会被输出,这导致我的 csv 文件中出现大量重复内容。
# Scrapy CrawlSpider: scrapes stock fundamentals from the Finviz screener,
# then tries to enrich each item with data from two Yahoo Finance pages.
# (Question code — note indentation was lost when this was pasted.)
class FinvizSpider(CrawlSpider):
name = "finviz"
allowed_domains = ["finviz.com", "finance.yahoo.com"]
start_urls = ["http://finviz.com/screener.ashx?v=152&f=cap_smallover&ft=4&c=0,1,2,6,7,10,11,13,14,45,65"]
# Follow the screener's pagination links (hrefs containing r=<row offset>).
rules = (Rule(LxmlLinkExtractor(allow=('r=\d+'),restrict_xpaths='//a[@class="tab-link"]')
, callback="parse_items", follow= True),
)
def parse_start_url(self, response):
# The first screener page is not matched by the rules, so parse it explicitly.
return self.parse_items(response)
def parse_items(self, response):
# Extract one item per screener table row, skipping the header row.
hxs = HtmlXPathSelector(response)
trs = hxs.select('//table[@bgcolor="#d3d3d3"]/tr');
items = []
for tr in trs[1:len(trs)]:
item = StockfundamentalsItem()
item['ticker'] = tr.select('td[2]/a/text()').extract()
item ["marketcap"] = tr.select("td[4]//text()").extract()
item ["pEarnings"] = tr.select("td[5]//text()").extract()
item ["pSales"] = tr.select("td[6]//text()").extract()
item ["pBook"] = tr.select("td[7]//text()").extract()
item ["pFCF"] = tr.select("td[8]//text()").extract()
item ["Div"] = tr.select("td[9]//text()").extract()
newurl = "http://finance.yahoo.com/q/ks?s=" + item['ticker'][0] + "+Key+Statistics"
newurl2 = "http://finance.yahoo.com/q/cf?s="+ item['ticker'][0] + "&ql=1"
# BUG (cause of the duplicate CSV rows): the SAME item object is attached
# to TWO independent requests, and each callback below returns it — so the
# item is emitted once per callback. Mixing `yield` with `return items`
# is also a SyntaxError on Python 2; returning `items` would additionally
# emit the Finviz-only version of every row.
yield Request(newurl, meta={'item': item}, callback=self.LinkParse)
yield Request(newurl2, meta={'item': item}, callback = self.LinkParse2)
items.append(item)
return items
def LinkParse(self, response):
# Pulls enterprise value from the Yahoo key-statistics page and
# RETURNS the shared item (first of the two duplicate emissions).
hxs = HtmlXPathSelector(response)
enterprise = hxs.select('//table[@class="yfnc_datamodoutline1"]//tr[9]/td[2]/text()').extract()
item = response.meta['item']
item['Enterprise'] = [enterprise[0]]
return item
def LinkParse2(self, response):
# Sums the quarterly "purchase of stock" cells from the Yahoo cash-flow
# page and RETURNS the shared item again (second duplicate emission).
hxs = HtmlXPathSelector(response)
stockpurchases = hxs.select('//table[@class="yfnc_tabledata1"]//tr[23]')
runningtot = 0
tds = (stockpurchases.select("./td/text()")).extract()
for elements in tds[1:]:
# Cells look like "1,234", "(567)" (negative) or "-" (zero).
val = float(elements.strip().replace('-','0').replace(',','').replace('(','-').replace(')',''))
runningtot = runningtot + val
item = response.meta['item']
item['BBY'] = [runningtot]
return item
例如,我的输出看起来像这样(注意雅虎之前的信息,以及雅虎后的信息):
pFCF,pBook,pEarnings,BBY,Enterprise,marketcap,Div,ticker,pSales
14.44,3.24,33.45,,10.66,13.70B,0.98%,A,2.17
14.44,3.24,33.45,17000,10.66,13.70B,0.98%,A,2.17
.
.
.
甚至列的顺序也不固定。虽然很混乱(这一点我不介意),但我只是不想要重复的行。
答案 0(得分:0):
经过一番摸索,我发现解决办法是把第二个请求放到第一个请求的回调里再发出(即把请求串联起来)。
相当多:
class FinvizSpider(CrawlSpider):
    """Scrape stock fundamentals from the Finviz screener, then enrich each
    item with Yahoo Finance key-statistics and cash-flow data.

    Requests are chained sequentially (Finviz row -> Yahoo key-stats page ->
    Yahoo cash-flow page) so exactly ONE fully-populated item is emitted per
    ticker, by LinkParse2 at the end of the chain. This avoids the duplicate
    rows produced when both Yahoo requests run independently and each
    callback returns the same item.
    """
    name = "finviz"
    allowed_domains = ["finviz.com", "finance.yahoo.com"]
    start_urls = ["http://finviz.com/screener.ashx?v=152&f=cap_smallover&ft=4&c=0,1,2,6,7,10,11,13,14,45,65"]
    # Follow the screener's pagination links (hrefs containing r=<row offset>).
    rules = (Rule(LxmlLinkExtractor(allow=('r=\d+'), restrict_xpaths='//a[@class="tab-link"]'),
                  callback="parse_items", follow=True),
             )

    def parse_start_url(self, response):
        # The first screener page is not matched by the rules; parse it too.
        return self.parse_items(response)

    def parse_items(self, response):
        """Yield one chained Yahoo request per screener row.

        BUGFIX vs. the posted answer: no ``items`` list is built or returned.
        Returning the partially-filled items from here is what re-emitted the
        Finviz-only rows (and ``return <value>`` inside a generator is a
        SyntaxError on Python 2). The finished item is returned only by
        LinkParse2, once the whole chain has run.
        """
        hxs = HtmlXPathSelector(response)
        trs = hxs.select('//table[@bgcolor="#d3d3d3"]/tr')
        for tr in trs[1:]:  # skip the header row
            item = StockfundamentalsItem()
            item['ticker'] = tr.select('td[2]/a/text()').extract()
            item['marketcap'] = tr.select('td[4]//text()').extract()
            item['pEarnings'] = tr.select('td[5]//text()').extract()
            item['pSales'] = tr.select('td[6]//text()').extract()
            item['pBook'] = tr.select('td[7]//text()').extract()
            item['pFCF'] = tr.select('td[8]//text()').extract()
            item['Div'] = tr.select('td[9]//text()').extract()
            # Guard: skip malformed rows without a ticker link instead of
            # raising IndexError on item['ticker'][0].
            if not item['ticker']:
                continue
            newurl = "http://finance.yahoo.com/q/ks?s=" + item['ticker'][0] + "+Key+Statistics"
            yield Request(newurl, meta={'item': item}, callback=self.LinkParse)

    def LinkParse(self, response):
        """Add enterprise value from the key-statistics page, then chain to
        the cash-flow page. Emits no item itself — only the next Request.
        """
        hxs = HtmlXPathSelector(response)
        enterprise = hxs.select('//table[@class="yfnc_datamodoutline1"]//tr[9]/td[2]/text()').extract()
        item = response.meta['item']
        # Keep the single-element-list shape used by the Finviz fields;
        # fall back to an empty list when the table row is missing.
        item['Enterprise'] = [enterprise[0]] if enterprise else []
        newurl2 = "http://finance.yahoo.com/q/cf?s=" + item['ticker'][0] + "&ql=1"
        yield Request(newurl2, meta={'item': item}, callback=self.LinkParse2)

    def LinkParse2(self, response):
        """Sum the quarterly "purchase of stock" cells from the cash-flow page
        and return the completed item — the only place an item is emitted.
        """
        hxs = HtmlXPathSelector(response)
        stockpurchases = hxs.select('//table[@class="yfnc_tabledata1"]//tr[23]')
        runningtot = 0
        # Cells look like "1,234", "(567)" (negative) or "-" (zero).
        for cell in stockpurchases.select("./td/text()").extract()[1:]:
            cleaned = cell.strip().replace('-', '0').replace(',', '').replace('(', '-').replace(')', '')
            runningtot = runningtot + float(cleaned)
        item = response.meta['item']
        item['BBY'] = [runningtot]
        return item
但是,这似乎不是解决此问题的正确方法......有没有办法正确执行多个请求?