I am trying to scrape all of the data from every results page. When I try to join the URL for the next page, it does not work. I would like to know what I am doing wrong.
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
import urlparse
from data.items import TextPostItem
from scrapy import optional_features
optional_features.remove('boto')


class RedditCrawler(CrawlSpider):
    name = 'reddit_crawler'
    allowed_domains = ['yellowpages.com']
    start_urls = ['http://www.yellowpages.com/search?search_terms=restaurant&geo_location_terms=California%2C%20KY']
    custom_settings = {
        'BOT_NAME': 'reddit-scraper',
        'DEPTH_LIMIT': 7,
        'DOWNLOAD_DELAY': 3
    }

    def parse(self, response):
        s = Selector(response)
        next_link = s.xpath('//a[@class="next ajax-page"]/@href').extract()[0]
        full_link = urlparse.urljoin('http://www.yellowpages.com', next_link)
        yield self.make_requests_from_url(full_link)

        posts = Selector(response).xpath('//div[@class="search-results organic"]')
        for post in posts:
            item = TextPostItem()
            item['address'] = post.xpath("//p[@class='adr']//text()").extract()
            item['business_name'] = post.xpath("//a[@class='business-name']//text()").extract()
            item['phonenumber'] = post.xpath("//div[@class='phones phone primary']//text()").extract()
            item['categories'] = post.xpath("//div[@class='categories']//text()").extract()
            item['next_link'] = post.xpath("//div[@class='pagination']//a[@class='next ajax-page']//@href").extract()
            yield item
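As an aside, the urlparse module used above is the Python 2 name; under Python 3 the same join function lives in urllib.parse. A minimal sketch (the relative href here is just an illustrative placeholder, not one taken from the site):

from urllib.parse import urljoin  # Python 3 location of urljoin

# '/search?page=2' is a hypothetical relative href used only for illustration
full_link = urljoin('http://www.yellowpages.com', '/search?page=2')
# full_link == 'http://www.yellowpages.com/search?page=2'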
Answer 0 (score: 0)
I think your xpath '//a[@class="next ajax-page"]/@href'
is incorrect. It did not work for me.
Try a simpler one.
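For example (this is only an assumption, not necessarily the selector the answer had in mind), a more forgiving match on the class attribute combined with extract_first() and response.urljoin could look like this:

import scrapy

def parse(self, response):
    # Hypothetical simpler selector: match any <a> whose class contains "next"
    next_link = response.xpath('//a[contains(@class, "next")]/@href').extract_first()
    if next_link:
        # response.urljoin resolves the relative href against response.url,
        # so no hard-coded base URL is needed
        yield scrapy.Request(response.urljoin(next_link), callback=self.parse)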