I am using Python's readability-lxml module. I have a document loaded with requests and readability, like the one below, and I want to use XPath to get the elements inside it:
<a class="media__link" href="/news/world-us-canada-38977644" rev="hero1|headline">
Republicans seek Flynn investigation </a>
Based on the documentation here, it can be loaded easily:
>>> import requests
>>> from readability import Document
>>>
>>> response = requests.get('http://example.com')
>>> doc = Document(response.text)
>>> doc.title()
But how can I get a specific element or attribute, for example with XPath? Is there a way to do this?
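What I have in mind is roughly this (just a sketch of the idea; I am assuming re-parsing the cleaned HTML with lxml is the right approach):

import requests
from io import StringIO
from lxml import etree
from readability import Document

response = requests.get('http://www.bbc.com/')
doc = Document(response.text)
# Re-parse readability's cleaned HTML so XPath can be run against it
tree = etree.parse(StringIO(doc.get_clean_html()))
print(tree.xpath("//a[@class='media__link']/@href"))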
My code is as follows:
import datetime
import pymongo
import scrapy
from pip._vendor import requests
from readability import Document
from scrapy.http import Request
from scrapy.spiders import Spider
from io import StringIO
from lxml import etree


class TutsplusItem(scrapy.Item):
    title = scrapy.Field()


class MySpider(Spider):
    name = "tutsplus"
    allowed_domains = ["bbc.com"]
    start_urls = ["http://www.bbc.com/"]
    crawling_level = None

    def __init__(self, crawling_level, *args):
        MySpider.crawling_level = crawling_level
        super(MySpider, self).__init__(self)

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # print("Links are %s" % links)
        print("Crawling level is %s " % MySpider.crawling_level)
        # We stored already crawled links in this list
        level = MySpider.crawling_level
        crawledLinks = []
        # Pattern to check proper link
        # I only want to get the tutorial posts
        # linkPattern = re.compile("^\/tutorials\?page=\d+")
        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            # if linkPattern.match(link) and not link in crawledLinks:
            if not link in crawledLinks and level > 0:
                link = "http://www.bbc.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)

        titles = response.xpath("//a[@class='media__link']").extract()
        # titles = response.xpath('//a/@href').extract()
        print("%d links was found" % len(titles))
        count = 0
        for title in titles:
            item = TutsplusItem()
            item["title"] = title
            print("Title is : %s" % title)
            # yield item
            titleInner = Document(str(title))
            tree = etree.parse(StringIO(titleInner.get_clean_html()))
            link = tree.xpath("//a/@href")
            link = "http://www.bbc.com" + link
            response = requests.get(link)
            doc = Document(response.text)
            innerTree = etree.parse(StringIO(doc.get_clean_html()))
            title = innerTree.xpath("//title/text()")
            headline = innerTree.xpath("//p[@class='story-body__introduction']/text()")
            bodyText = innerTree.xpath("//div[class='story-body__inner']/text()")
I get the following error:
Traceback (most recent call last):
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\utils\defer.py", line 102, in iter_errback
    yield next(it)
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\referer.py", line 22, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "c:\python27\lib\site-packages\scrapy-1.3.1-py2.7.egg\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\Mehdi\PycharmProjects\WebCrawler\src\Crawler.py", line 69, in parse
    tree = etree.parse(StringIO(titleInner.get_clean_html()))
  File "c:\python27\lib\site-packages\readability\readability.py", line 143, in get_clean_html
    return clean_attributes(tounicode(self.html))
  File "src\lxml\lxml.etree.pyx", line 3397, in lxml.etree.tounicode (src\lxml\lxml.etree.c:80954)
TypeError: Type '<type 'NoneType'>' cannot be serialized.
Answer 0 (score: 0)
This is for Python 3; Python 2 needs a different import to get StringIO (from StringIO import StringIO):
from io import StringIO
from lxml import etree
tree = etree.parse(StringIO(doc.get_clean_html()))
tree.xpath("//title")