我有一个主要页面用来抓取名称和 URL。然后需要转到该网址并抓取更多详细信息，如全名、年龄和链接。最后需要返回带有（名称、网址、年龄、性别、链接）的单个项目。希望在一个方法 crawl_page 中定义第一级爬取，在另一个方法 crawl_item 中定义第二级爬取。
class CrawlLink(CrawlSpider):
    """Two-level spider.

    Level 1 (`parse_page`) scrapes name + profile URL from the listing
    pages; level 2 (`parse_user`) follows each profile URL and completes
    the SAME item (carried through ``Request.meta``), so exactly one
    fully-populated item is yielded per user.
    """

    name = "crawllink"
    allowed_domains = ['www.xyz.org']
    # BUG FIX: Scrapy requires absolute URLs with a scheme; a bare
    # "www.xyz.org/..." raises "Missing scheme in request url".
    start_urls = ["http://www.xyz.org/profile?page=0"]
    # Follow pagination links ("next" pager) back into parse_page.
    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=('/profile\?page=\d+'),
                restrict_xpaths=('//li[@class="pager-next"]',),
                canonicalize=False,
            ),
            callback='parse_page',
            follow=True,
        ),
    ]

    def parse_page(self, response):
        """First level: for each user row on the listing page, start an
        item with name + url, then request the profile page, passing the
        partially-built item along in ``meta`` for parse_user to finish.
        """
        self.log('Started Crawling List %s' % response.url)
        for temp in response.xpath("//div[@id='profile']/div"):
            links = temp.xpath("./div[@class='name']/a/@href").extract()
            if not links:
                # Row without a profile link -- nothing to follow.
                continue
            usritem = PostUsers()
            # BUG FIX: prepend the scheme so the URL is requestable.
            usritem["url"] = 'http://www.xyz.org' + links[0]
            # NOTE(review): this absolute xpath ignores `temp` and matches
            # from the document root; it looks like a profile-page xpath
            # pasted into the listing parser. Kept as-is -- TODO confirm
            # against the real listing markup.
            usritem["namel"] = temp.xpath("//div[@id='user_profile_main']/dl/dd[1]/text()").extract()
            # BUG FIX: the original looped `for urltemp in usrlink`, which
            # iterates the *characters* of the href string and yielded the
            # same Request len(href) times. One request per user, with the
            # item riding along in meta so both levels end up in a single
            # item. (The original's `return ulists` inside a generator was
            # also a SyntaxError on Python 2 -- removed.)
            yield Request(url=usritem["url"],
                          meta={'item': usritem},
                          callback=self.parse_user)

    def parse_user(self, response):
        """Second level: complete the item begun in parse_page with the
        profile-page fields and yield the single finished item.
        """
        self.log('Started Crawling Profile %s' % response.url)
        # Retrieve the item built at the first level; fall back to a fresh
        # one so the callback still works if scheduled without meta.
        usr = response.meta.get('item', PostUsers())
        relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
        # Same scheme fix as above so the stored link is a usable URL.
        usr["link"] = 'http://www.xyz.org' + relative_url
        usr["age"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
        usr["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
        self.log('Finished Crawling Profile %s' % response.url)
        return usr