我是新手。我有一个蜘蛛,它根据输入文本文件(criteria.txt)的文本进行Yahoo搜索,并返回大约10个URL的列表。我希望创建html,以将url作为超链接,提取标题和第一段。我正在尝试使用xpath来提高速度,但是它不起作用。我尝试了许多xpath语句变体。无效代码被注释掉。我可以在beautifulsoup上使用它,但是我需要草率的版本。任何帮助表示赞赏。
代码如下:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request, FormRequest
from Yahoosearch.items import YahoosearchItem
import time, datetime, csv, random, base64, re
class SpiderSpider(scrapy.Spider):
name = 'spider'
allowed_domains = ['yahoo.com']
start_urls = ['http://yahoo.com/']
handle_httpstatus_list = [404]
search_word = ""
page_count = 1
def start_requests(self):
fin = open("/home/gi/output/criteria.txt")
self.search_word = fin.readline()
url = "https://uk.search.yahoo.com/search?p={}&fr=yfp-t&fp=1&toggle=1&cop=mss&ei=UTF-8".format(self.search_word)
req = Request(url=url, callback=self.get_data, dont_filter=True)
yield req
def get_data(self, response):
print "------ Get Url of " + str(self.page_count) + " Page ------"
item = YahoosearchItem()
if response.xpath('//h1[contains(text(), "Not Found")]'):
print " === The data was not found on this server. ==="
print " Please check the Date!!! "
return
try:
lists = response.xpath('//ol[@class="mb-15 reg searchCenterMiddle"]/li')
for cnt, element in enumerate(lists):
link = element.xpath('.//h3[@class="title"]/a/@href').extract_first()
item["url"] = link
linkx = "<p><A href='" + link + "'>" + link + "</A></p>"
# title = element.xpath('.//h1[@class="title"]').extract_first()
# print "title = ", title
# item["title"] = title
desc = element.xpath('.//p').extract_first()
yield item
with open('result.html', 'a') as f:
f.write('{}\n'.format(linkx))
# f.write(title)
# f.write(desc)
self.page_count += 1
if self.page_count == 2: # number of pages of 10 urls
return
nLink = response.xpath('//a[@class="next"]/@href').extract_first()
if nLink:
req = Request(url=nLink, callback=self.get_data, dont_filter=True)
yield req
except Exception as e:
print e