我想用递归的方式抓取网站中的所有链接,并解析每个链接指向的页面,以提取这些页面中的所有详情链接。如果某个页面链接符合指定的规则,那么该链接就是我想要解析详细信息的条目页面。我使用的代码如下:
class DmovieSpider(BaseSpider):
name = "dmovie"
allowed_domains = ["movie.douban.com"]
start_urls = ['http://movie.douban.com/']
def parse(self, response):
item = DmovieItem()
hxl = HtmlXPathSelector(response)
urls = hxl.select("//a/@href").extract()
all_this_urls = []
for url in urls:
if re.search("movie.douban.com/subject/\d+/$",url):
yield Request(url=url, cookies = cookies ,callback=self.parse_detail)
elif ("movie.douban.com" in url) and ("movie.douban.com/people" not in url) and ("movie.douban.com/celebrity" not in url) and ("comment" not in url):
if ("update" not in url) and ("add" not in url) and ("trailer" not in url) and ("cinema" not in url) and (not redis_conn.sismember("crawledurls", url)):
all_this_urls.append(Request(url=url, cookies = cookies , callback=self.parse))
redis_conn.sadd("crawledurls",response.url)
for i in all_this_urls:
yield i
def parse_detail(self, response):
hxl = HtmlXPathSelector(response)
title = hxl.select("//span[@property='v:itemreviewed']/text()").extract()
title = select_first(title)
img = hxl.select("//div[@class='grid-16-8 clearfix']//a[@class='nbgnbg']/img/@src").extract()
img = select_first(img)
info = hxl.select("//div[@class='grid-16-8 clearfix']//div[@id='info']")
director = info.select("//a[@rel='v:directedBy']/text()").extract()
director = select_first(director)
actors = info.select("//a[@rel='v:starring']/text()").extract()
m_type = info.select("//span[@property='v:genre']/text()").extract()
release_date = info.select("//span[@property='v:initialReleaseDate']/text()").extract()
release_date = select_first(release_date)
d_rate = info.select("//strong[@class='ll rating_num']/text()").extract()
d_rate = select_first(d_rate)
info = select_first(info)
post = hxl.select("//div[@class='grid-16-8 clearfix']//div[@class='related-info']/div[@id='link-report']").extract()
post = select_first(post)
movie_db = Movie()
movie_db.name = title.encode("utf-8")
movie_db.dis_time = release_date.encode("utf-8")
movie_db.description = post.encode("utf-8")
movie_db.actors = "::".join(actors).encode("utf-8")
movie_db.director = director.encode("utf-8")
movie_db.mtype = "::".join(m_type).encode("utf-8")
movie_db.origin = "movie.douban.com"
movie_db.d_rate = d_rate.encode("utf-8")
exist_item = Movie.where(origin_url=response.url).select().fetchone()
if not exist_item:
movie_db.origin_url = response.url
movie_db.save()
print "successed!!!!!!!!!!!!!!!!!!!!!!!!!!!"
urls 是页面中的所有链接。如果其中某个网址是我要解析的详情页面,就产生一个回调方法为 parse_detail 的请求;否则产生一个回调方法为 parse 的请求。
通过这种方式,我抓取到了一些页面,但结果似乎并不完整:从结果来看,有些页面没有被访问到。你能告诉我原因吗?有没有办法正确地抓取所有页面?
答案 0 :(得分:1)
答案 1 :(得分:0)
class DmovieSpider(BaseSpider):
    """Minimal spider: request each link on the start page, scrape its title."""

    name = "dmovie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ['http://movie.douban.com/']

    def parse(self, response):
        """Return one ``parse_detail`` request per link on the page."""
        # Function-scope import so the block does not depend on the
        # file's import header, which is outside this view.
        try:
            from urlparse import urljoin  # Python 2
        except ImportError:
            from urllib.parse import urljoin  # Python 3

        hxl = HtmlXPathSelector(response)
        # .extract() turns the selector list into plain href strings;
        # without it, selector objects were passed to Request as the URL.
        hrefs = hxl.select("//a/@href").extract()
        # Resolve relative hrefs against the current page URL so every
        # request gets an absolute URL.
        return [
            Request(urljoin(response.url, href), callback=self.parse_detail)
            for href in hrefs
        ]

    def parse_detail(self, response):
        """Return a ``DmovieItem`` holding the page title.

        Returns ``None`` when the page has no title span.
        """
        hxl = HtmlXPathSelector(response)
        title = hxl.select(
            "//span[@property='v:itemreviewed']/text()").extract()
        if not title:
            # Every link is sent here, so many pages have no title span;
            # indexing title[0] on them raised IndexError.
            return None
        item = DmovieItem()
        item['title'] = title[0].strip()
        return item