python scrapy spider: extract the URL, title, and first paragraph of each returned URL

Posted: 2018-07-24 14:36:41

Tags: python xpath scrapy

I'm new to this. I have a spider that runs a Yahoo search on the text of an input file (criteria.txt) and returns a list of about 10 URLs. I want to build an HTML page with each URL as a hyperlink, and to extract the title and first paragraph for each result. I'm trying to use XPath for speed, but it isn't working; I've tried many variations of the XPath statements, and the non-working code is commented out below. I can get this working with BeautifulSoup, but I need a Scrapy version. Any help is appreciated.

The code is as follows:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from Yahoosearch.items import YahoosearchItem


class SpiderSpider(scrapy.Spider):

    name = 'spider'
    allowed_domains = ['yahoo.com']
    start_urls = ['http://yahoo.com/']
    handle_httpstatus_list = [404]
    search_word = ""
    page_count = 1

    def start_requests(self):
        # Read the search term from the criteria file.
        with open("/home/gi/output/criteria.txt") as fin:
            self.search_word = fin.readline().strip()

        url = "https://uk.search.yahoo.com/search?p={}&fr=yfp-t&fp=1&toggle=1&cop=mss&ei=UTF-8".format(self.search_word)
        yield Request(url=url, callback=self.get_data, dont_filter=True)

    def get_data(self, response):

        print "------ Get URLs of page " + str(self.page_count) + " ------"

        if response.xpath('//h1[contains(text(), "Not Found")]'):
            print " === The data was not found on this server. ==="
            print " Please check the date!!! "
            return

        try:
            # Each organic result is an <li> in the results list.
            lists = response.xpath('//ol[@class="mb-15 reg searchCenterMiddle"]/li')

            for cnt, element in enumerate(lists):

                # Fresh item per result.
                item = YahoosearchItem()

                link = element.xpath('.//h3[@class="title"]/a/@href').extract_first()
                item["url"] = link

                linkx = "<p><a href='" + link + "'>" + link + "</a></p>"

                # None of these variants worked:
                # title = element.xpath('.//h1[@class="title"]').extract_first()
                # print "title = ", title
                # item["title"] = title

                desc = element.xpath('.//p').extract_first()

                yield item

                with open('result.html', 'a') as f:
                    f.write('{}\n'.format(linkx))
                    # f.write(title)
                    # f.write(desc)

            self.page_count += 1

            if self.page_count == 2:        # number of pages of 10 urls
                return

            # Follow the "next" pagination link, if there is one.
            nLink = response.xpath('//a[@class="next"]/@href').extract_first()
            if nLink:
                yield Request(url=nLink, callback=self.get_data, dont_filter=True)

        except Exception as e:
            print e
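For reference, a minimal sketch of one way the missing pieces could look inside SpiderSpider: take the title text from the same h3.title anchor already used for the link (the commented-out h1 selector can't match, since the working link XPath shows the title sits in an h3), then follow each result URL in a second callback to pull the first paragraph of the landing page. The parse_result callback and the title/desc item fields are illustrative names that are not in the original code, and the Yahoo class names are only a guess at markup that changes frequently:

    # Hypothetical methods; assumes YahoosearchItem defines url, title and
    # desc fields (only url is used in the original code).
    def get_data(self, response):
        for element in response.xpath('//ol[@class="mb-15 reg searchCenterMiddle"]/li'):
            link = element.xpath('.//h3[@class="title"]/a/@href').extract_first()
            if not link:
                continue
            # The anchor text can be split across child nodes (e.g. <b>
            # around matched words), so join all descendant text nodes
            # rather than taking a single text() node.
            title = "".join(element.xpath('.//h3[@class="title"]/a//text()').extract())
            # Follow the result URL in a second callback; dont_filter=True
            # also lets the request past the allowed_domains offsite filter,
            # since results point at arbitrary domains.
            yield Request(url=link, callback=self.parse_result,
                          meta={'link': link, 'title': title},
                          dont_filter=True)

    def parse_result(self, response):
        item = YahoosearchItem()
        item['url'] = response.meta['link']
        item['title'] = response.meta['title']
        # First non-empty <p> on the landing page -- a rough stand-in for
        # "the first paragraph"; many pages will need a tighter XPath.
        item['desc'] = "".join(
            response.xpath('(//p[normalize-space()])[1]//text()').extract()).strip()
        yield item

The same title expression could also be dropped straight into the original loop in place of the commented-out h1 attempt, if only the search-result snippets (rather than the landing pages) are needed.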

0 Answers:

No answers