I've been working on this for 12 hours now, and I'm hoping someone can give me a hand.
Here is my code. What I want is to get the anchor text and the URL of every link on the page.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.url import urljoin_rfc
from scrapy.utils.response import get_base_url
from urlparse import urljoin
#from scrapy.item import Item
from tutorial.items import DmozItem


class HopitaloneSpider(CrawlSpider):
    name = 'dmoz'
    allowed_domains = ['domain.co.uk']
    start_urls = [
        'http://www.domain.co.uk'
    ]

    rules = (
        #Rule(SgmlLinkExtractor(allow='>example\.org', )),
        Rule(SgmlLinkExtractor(allow=('\w+$', )), callback='parse_item', follow=True),
    )

    user_agent = 'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))'

    def parse_item(self, response):
        #self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        #print response.url
        sites = hxs.select('//html')
        #item = DmozItem()
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            items.append(item)
        return items
What am I doing wrong... my eyes hurt now.
Answer 0 (score: 3)

response.body should be what you want:
def parse_item(self, response):
    #self.log('Hi, this is an item page! %s' % response.url)
    body = response.body
    item = ....
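This answer is terse, so as illustration only, here is a minimal sketch of one way response.body could be used: parsing the raw markup directly with lxml. The use of lxml and the reuse of the question's item fields are my assumptions, not part of the original answer.

from lxml import html  # assumption: lxml is installed; it is not in the original answer

def parse_item(self, response):
    # response.body is the raw HTML of the downloaded page
    tree = html.fromstring(response.body)

    items = []
    for link in tree.xpath('//a'):
        item = DmozItem()
        item['title'] = link.xpath('text()')  # anchor text nodes of this <a>
        item['link'] = link.xpath('@href')    # href attribute values
        items.append(item)
    return items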
Answer 1 (score: 1)

To get all the links on a single page:
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    links = hxs.select("//a")
    for link in links:
        item = DmozItem()
        item['title'] = link.select('text()').extract()
        item['link'] = link.select('@href').extract()
        items.append(item)
    return items