from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class DmozSpider(BaseSpider):
    """Spider that scrapes link titles, hrefs and descriptions from the
    dmoz.org Python Books/Resources listing pages.

    Fix: every entry in ``start_urls`` must be an absolute URL including
    the ``http://`` scheme — without it Scrapy cannot download the page,
    which is why some of the URLs appeared to "stop responding".
    """
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        """Print (title, link, description) for every <li> on the page."""
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        for site in sites:
            title = site.select('a/text()').extract()
            link = site.select('a/@href').extract()
            desc = site.select('text()').extract()
            # Call form of print so the code is valid on Python 3 as well.
            print(title, link, desc)
这是我的代码。我想在循环中使用大量的 URL,该如何实现?我确实在 start_urls 里放了多个网址,但并没有得到所有网址的输出——有些 URL 没有响应。那么如何用这段代码确保抓取到全部数据呢?
答案 0(得分:1)
您的代码看起来不错,但您确定 start_urls 里的网址不应该以 http:// 开头吗?
**UPD**
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
这是 Scrapy 开始抓取的网址列表。通常它只有一两个链接,很少会更多。
这些页面必须具有相同的HTML结构,因为Scrapy spider以相同的方式处理它们。
您看,如果我在 start_urls 中添加 4-5 个 URL,它只为前 2-3 个 URL 给出了正常的输出。
我不相信这一点,因为scrapy不关心start_urls
列表中有多少链接。
但它之后就停止响应了。另外,请告诉我如何为此实现 GUI。
Scrapy有debug shell来测试您的代码。
答案 1(得分:0)
您刚刚发布了tutorial的代码。您应该做的是实际读取整个文档,尤其是basic concept部分。您基本上想要的是crawl spider,您可以在其中定义蜘蛛将遵循的规则并使用您给定的代码进行处理。
使用示例引用文档:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
class MySpider(CrawlSpider):
    """Crawl-spider example from the Scrapy documentation.

    Follows category pages across example.com and hands every item page
    over to :meth:`parse_item` for extraction.
    """
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # Follow every link matching 'category.php' while skipping
        # 'subsection.php'; with no callback, follow defaults to True.
        Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
        # Links matching 'item.php' are dispatched to parse_item.
        Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build an Item from the id/name/description cells of an item page."""
        self.log('Hi, this is an item page! %s' % response.url)
        selector = HtmlXPathSelector(response)
        result = Item()
        result['id'] = selector.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        result['name'] = selector.select('//td[@id="item_name"]/text()').extract()
        result['description'] = selector.select('//td[@id="item_description"]/text()').extract()
        return result