I have a spider which has to find the «next» link (the one with the "»" inside) in this HTML:
<div id="content-center">
<div class="paginador">
<span class="current">01</span>
<a href="ml=0">02</a>
<a href="ml=0">03</a>
<a href="ml=0">04</a>
<a href="ml=0">»</a>
<a href="ml=0">Last</a>
</div>
</div>
I am trying with this spider:
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import Request


class YourCrawler(CrawlSpider):
    name = "***"
    start_urls = [
        'http://www.***.com/10000000000177/',
    ]
    allowed_domains = ["http://www.***.com/"]

    def parse(self, response):
        # follow every book link on the listing page
        s = Selector(response)
        page_list_urls = s.css('#content-center > div.listado_libros.gwe_libros > div > form > dl.dublincore > dd.title > a::attr(href)').extract()
        for url in page_list_urls:
            yield Request(response.urljoin(url), callback=self.parse_following_urls, dont_filter=True)

        # follow the "»" pagination link
        hxs = HtmlXPathSelector(response)
        next_page = hxs.select(u"//*[@id='content-center']/div[@class='paginador']/a[text()='\u00bb']/@href").extract()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_following_urls(self, response):
        for each_book in response.css('div#container'):
            yield {
                'title': each_book.css('div#content > div#primary > div > h1.title-book::text').extract(),
            }
It does not recognize the link. Any idea how to solve that?
Thanks!
Answer 0 (score: 1)
I think BeautifulSoup will do the job:
from bs4 import BeautifulSoup

data = '''
<div class="pages">
<span class="current">01</span>
<a href="ml=0">02</a>
<a href="ml=0">03</a>
<a href="ml=0">04</a>
<a href="ml=0">05</a>
<a href="ml=0">06</a>
<a href="ml=0">07</a>
<a href="ml=0">08</a>
<a href="ml=0">09</a>
<a href="ml=0">10</a>
<a href="ml=0">»</a>
<a href="ml=0">Last</a>
</div>
'''

bsobj = BeautifulSoup(data, 'html.parser')

# print the href of the link whose text is exactly "»"
for a in bsobj.find_all('a'):
    if a.text == '»':
        print(a['href'])
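If you want to keep this inside the Scrapy callback rather than feeding BeautifulSoup a hard-coded string, a minimal sketch could look like this (assuming Python 3 and a Scrapy version where response.text is available):

from bs4 import BeautifulSoup
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request


class YourCrawler(CrawlSpider):
    # ... name, start_urls, allowed_domains as in the question ...

    def parse(self, response):
        # parse the page HTML with BeautifulSoup instead of Scrapy selectors
        bsobj = BeautifulSoup(response.text, 'html.parser')
        for a in bsobj.find_all('a'):
            if a.text == '»':
                # build an absolute URL and follow the "next" pagination link
                yield Request(response.urljoin(a['href']), callback=self.parse)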
Answer 1 (score: 1)
Try with the \u-escaped version of »:
>>> print(u'\u00bb')
»
in your .xpath() call (note the u"..." prefix on the string argument):
hxs.select(u"//a[text()='\u00bb']/@href").extract()
Your spider's .py file is probably using UTF-8:
>>> u'\u00bb'.encode('utf-8')
'\xc2\xbb'
So you could also use hxs.select(u"//a[text()='»']/@href").extract() (the u"..." prefix is still there), but then you also need to tell Python what the encoding of your .py file is. That is usually done with # -*- coding: utf-8 -*- (or an equivalent declaration) at the top of the .py file, e.g. as the first line.
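To see that both spellings select the same link, here is a small self-contained check using Scrapy's Selector against the pagination snippet from the question (a sketch for illustration; the spider itself keeps working on the response object instead of a hard-coded string):

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

html = u'''
<div id="content-center">
<div class="paginador">
<span class="current">01</span>
<a href="ml=0">04</a>
<a href="ml=0">»</a>
<a href="ml=0">Last</a>
</div>
</div>
'''

sel = Selector(text=html)
# \u-escaped version of the character
print(sel.xpath(u"//div[@class='paginador']/a[text()='\u00bb']/@href").extract())
# the literal » works too, thanks to the coding declaration on the first line
print(sel.xpath(u"//a[text()='»']/@href").extract())
# both print the same single-element list containing 'ml=0'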
Answer 2 (score: 0)
There are a few things you could change in your code: use response.css() and response.xpath() directly instead of building Selector and HtmlXPathSelector objects, and use extract_first() so that the is not None check on the next-page link actually works. Applying those points:
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request


class YourCrawler(CrawlSpider):
    name = "***"
    start_urls = [
        'http://www.***.com/10000000000177/',
    ]
    allowed_domains = ["http://www.***.com/"]

    def parse(self, response):
        # follow each book link on the listing page
        page_list_urls = response.css('#content-center > div.listado_libros.gwe_libros > div > form > dl.dublincore > dd.title > a::attr(href)').extract()
        for url in page_list_urls:
            yield Request(response.urljoin(url), callback=self.parse_following_urls, dont_filter=True)

        # follow the "»" pagination link, if there is one
        next_page = response.xpath(u"//*[@id='content-center']/div[@class='paginador']/a[text()='\u00bb']/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_following_urls(self, response):
        for each_book in response.css('div#container'):
            yield {
                'title': each_book.css('div#content > div#primary > div > h1.title-book::text').extract(),
            }
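One note on why extract_first() matters here: it returns None when nothing matches, so the if next_page is not None check behaves as intended, whereas .extract() always returns a list. A quick self-contained illustration (assuming Scrapy is installed):

from scrapy.selector import Selector

sel = Selector(text=u'<div class="paginador"><a href="ml=0">\u00bb</a></div>')
print(sel.xpath(u"//a[text()='\u00bb']/@href").extract_first())   # ml=0
print(sel.xpath(u"//a[text()='missing']/@href").extract_first())  # None
print(sel.xpath(u"//a[text()='missing']/@href").extract())        # [] (a list, which is not None)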