我刚开始使用 Scrapy。以下是我要抓取的网页示例:
http://www.thefreedictionary.com/shame
我的爬虫(spider)代码如下:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from dic_crawler.items import DicCrawlerItem
from urlBuilder import *
# Spider from the question: fetches dictionary pages and tries to pull the
# word's meaning out of the "MainTxt" element into a DicCrawlerItem.
# NOTE(review): indentation was lost in this listing; the lines below are
# presumably the class body and the body of parse().
class Dic_crawler(BaseSpider):
name = "dic"
allowed_domains = ["www.thefreedictionary.com"]
# Shallow copy of the URL list produced by listmaker() (presumably provided
# by the star-import from urlBuilder -- confirm).
start_urls = listmaker()[:]
# Debug output; Python 2 print statement.
print start_urls
# Builds a single DicCrawlerItem from the response and returns it.
def parse(self, response):
hxs = HtmlXPathSelector(response)
# The accompanying question reports that this selection comes back empty
# when the path goes down to 'tbody' but works when it stops at 'table'.
# NOTE(review): browsers commonly insert <tbody> into the rendered DOM even
# when the served HTML has none -- likely the cause; confirm against the
# raw response body.
sites = hxs.select('//*[@id="MainTxt"]/table/tbody')
print 'SITES:\n',sites
item = DicCrawlerItem()
# NOTE(review): this expression starts with '//', so it is presumably
# evaluated from the document root rather than relative to `sites`, and it
# repeats the same 'tbody' step. The div[1]/div[1]/div[1] indexes pick only
# the first matching branch, which fits the question's remark that only a
# single meaning is extracted.
item["meanings"] = sites.select('//*[@id="MainTxt"]/table/tbody/tr/td/div[1]/div[1]/div[1]/text()').extract()
print item
return item
listmaker() 返回要抓取的网址列表。
我的问题是:如果我在 XPath 中一直选到 'tbody',sites 变量就是空的;如果我只选到 table,就能得到我想要的那部分页面。
也就是说,XPath 中 tbody 及其之后的部分选不到任何内容。
另外,该网站为一个单词给出了多个含义,我想把它们全部提取出来,但我只知道如何提取单个含义。
谢谢。
答案 0 :(得分:1)
下面是一个可以帮你入门的爬虫(spider)骨架:
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
class Dic_crawler(BaseSpider):
    """Skeleton spider that logs the definitions found on a dictionary page.

    For every part-of-speech section of the page it logs the section label
    and then each definition block belonging to that section. Nothing is
    returned or yielded; the output goes to the spider log only.
    """

    name = "thefreedictionary"
    allowed_domains = ["www.thefreedictionary.com"]
    start_urls = ['http://www.thefreedictionary.com/shame']

    def parse(self, response):
        """Log each category and its definitions found in *response*."""
        selector = HtmlXPathSelector(response)

        # One "pseg" div per grammatical section ("noun", "verb", ...).
        sections = selector.select('id("MainTxt")//div[@class="pseg"]')
        for section in sections:
            # The section label is the text of the <i> child element.
            label_parts = section.select('./i/text()').extract()
            self.log("category: %s" % u''.join(label_parts))

            # A section may hold several definitions. `section` is itself a
            # selector, so a relative XPath picks only its own "ds-list"
            # children.
            for entry in section.select('div[@class="ds-list"]'):
                # Gather every text node under the definition, one per line.
                text_nodes = entry.select('.//text()').extract()
                self.log(" - definition: %s" % u'\n'.join(text_nodes))