How to get the whole document with hxs.select in Scrapy

Asked: 2012-11-17 22:55:58

Tags: xpath scrapy

I've been working on this for 12 hours now, and I'm hoping someone can give me a leg up.

Here is my code. What I want is to get the anchor text and URL of every link on the page.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from urlparse import urljoin

#from scrapy.item import Item
from tutorial.items import DmozItem

class HopitaloneSpider(CrawlSpider):
    name = 'dmoz'
    allowed_domains = ['domain.co.uk']
    start_urls = [
        'http://www.domain.co.uk'
    ]

    rules = (
        #Rule(SgmlLinkExtractor(allow='>example\.org', )),
        Rule(SgmlLinkExtractor(allow=('\w+$', )), callback='parse_item', follow=True),
    )

    user_agent = 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))'

    def parse_item(self, response):
        #self.log('Hi, this is an item page! %s' % response.url)

        hxs = HtmlXPathSelector(response)
        #print response.url
        sites = hxs.select('//html')
        #item = DmozItem()
        items = []

        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            items.append(item)

        return items

What am I doing wrong... my eyes hurt now.

2 Answers:

Answer 0 (score: 3):

response.body should be what you want:

def parse_item(self, response):
    #self.log('Hi, this is an item page! %s' % response.url)

    body = response.body
    item = ....
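
As a usage note, since the title asks about getting the whole document through hxs.select: extracting the root node gives the same HTML back as a string. A minimal sketch of both approaches (variable names here are illustrative):

def parse_item(self, response):
    # The raw bytes of the entire page:
    body = response.body

    # Roughly equivalent via the selector: extract() serializes the
    # matched nodes back to strings, so the single root node yields
    # the whole <html> tree.
    hxs = HtmlXPathSelector(response)
    whole_doc = hxs.select('//html').extract()[0]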

Answer 1 (score: 1):

To get all the links on a single page:

def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    # Select every <a> element on the page, then read each one's
    # text and href relative to that element.
    links = hxs.select("//a")

    for link in links:
        item = DmozItem()
        item['title'] = link.select('text()').extract()
        item['link'] = link.select('@href').extract()
        items.append(item)

    return items
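
For what it's worth, the likely reason the original code came back empty: hxs.select('//html') returns a single selector for the root node, and the relative path a/text() only matches <a> elements that are direct children of <html>. To search all descendants from a node, prefix the relative XPath with .// instead. A minimal sketch along those lines, assuming the same DmozItem (everything lands in one item here, since there is only one root node):

def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    site = hxs.select('//html')[0]

    item = DmozItem()
    # './/a' matches every descendant <a> of the node, whereas
    # 'a' matches only direct children of <html> (i.e. nothing).
    item['title'] = site.select('.//a/text()').extract()
    item['link'] = site.select('.//a/@href').extract()
    return [item]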