Getting XPath items from readability-lxml

Time: 2017-02-15 09:36:24

Tags: python xpath

I am using Python's readability-lxml module. I have text that was loaded with requests and readability, as shown below, and I want to get at its inner elements with XPath:

<a class="media__link" href="/news/world-us-canada-38977644" rev="hero1|headline">
    Republicans seek Flynn investigation
</a>

Based on the documentation here, a document can be loaded easily:

>>> import requests
>>> from readability import Document
>>>
>>> response = requests.get('http://example.com')
>>> doc = Document(response.text)
>>> doc.title()

But how can I get a specific element or attribute, for example with XPath? Is there a way to do this?
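
I'm guessing the cleaned HTML has to be fed back into lxml somehow, something like the untested sketch below (using readability's summary(), which as far as I understand returns the cleaned article HTML as a string):

import lxml.html

# parse the cleaned-up HTML string that readability produces
root = lxml.html.fromstring(doc.summary())
# then query it with XPath as usual
hrefs = root.xpath("//a[@class='media__link']/@href")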

My actual code is as follows:

import datetime
import pymongo
import scrapy
import requests
from readability import Document
from scrapy.http import Request
from scrapy.spiders import Spider
from io import StringIO
from lxml import etree


class TutsplusItem(scrapy.Item):
  title = scrapy.Field()


class MySpider(Spider):
  name = "tutsplus"
  allowed_domains = ["bbc.com"]
  start_urls = ["http://www.bbc.com/"]
  crawling_level = None

  def __init__(self, crawling_level, *args):
      MySpider.crawling_level = crawling_level
      super(MySpider, self).__init__(*args)

  def parse(self, response):
    links = response.xpath('//a/@href').extract()

    #print("Links are %s" %links)
    print ("Crawling level is %s " %MySpider.crawling_level )




    # We store already-crawled links in this list
    level = MySpider.crawling_level
    crawledLinks = []

    # Pattern to check for a proper link
    # I only want to get the tutorial posts
    # linkPattern = re.compile("^\/tutorials\?page=\d+")

    for link in links:
      # If it is a proper link and is not checked yet, yield it to the Spider
      # if linkPattern.match(link) and link not in crawledLinks:
      if link not in crawledLinks and level > 0:
        link = "http://www.bbc.com" + link
        crawledLinks.append(link)
        yield Request(link, self.parse)

    titles = response.xpath("//a[@class='media__link']").extract()
    # titles = response.xpath('//a/@href').extract()
    print("%d links were found" % len(titles))

    count = 0
    for title in titles:
      item = TutsplusItem()
      item["title"] = title
      print("Title is : %s" %title)
      #yield item
      titleInner = Document(str(title))
      tree = etree.parse(StringIO(titleInner.get_clean_html()))
      link = tree.xpath("//a/@href")[0]  # xpath() returns a list, so take the first match
      link = "http://www.bbc.com" + link
      response = requests.get(link)
      doc = Document(response.text)
      innerTree = etree.parse(StringIO(doc.get_clean_html()))

      title = innerTree.xpath("//title/text()")
      headline = innerTree.xpath("//p[@class='story-body__introduction']/text()")
      bodyText = innerTree.xpath("//div[@class='story-body__inner']/text()")

I get the following error:

Traceback (most recent call last):
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\utils\defer.py", line 102, in iter_errback
    yield next(it)
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\Mehdi\PycharmProjects\WebCrawler\src\Crawler.py", line 69, in parse
    tree = etree.parse(StringIO(titleInner.get_clean_html()))
  File "c:\python27\lib\site-packages\readability\readability.py", line 143, in get_clean_html
    return clean_attributes(tounicode(self.html))
  File "src\lxml\lxml.etree.pyx", line 3397, in lxml.etree.tounicode (src\lxml\lxml.etree.c:80954)
TypeError: Type '<type 'NoneType'>' cannot be serialized.

1 Answer:

Answer 0 (score: 0)

This is for Python 3; Python 2 needs a different import to get StringIO (from StringIO import StringIO):

from io import StringIO
from lxml import etree
tree = etree.parse(StringIO(doc.get_clean_html()))
tree.xpath("//title")
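
For Python 2 the equivalent would be something like this (an untested sketch):

from StringIO import StringIO  # Python 2 keeps StringIO in its own module
from lxml import etree

tree = etree.parse(StringIO(doc.get_clean_html()))
titles = tree.xpath("//title/text()")     # element text comes back as a list of strings
hrefs = tree.xpath("//a/@href")           # attribute values come back as a list too
first_href = hrefs[0] if hrefs else None  # index into the list before building a full URL

As for the TypeError in your traceback: get_clean_html() is trying to serialize None, which suggests that per-link Document was never actually parsed. Calling something like titleInner.summary() first (which forces readability to parse the HTML) might avoid it, but I have not verified that.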