分两级爬取网站并返回单个项目

时间:2015-03-12 15:05:53

标签: python callback web-crawler

我有一个主页面，需要从中抓取名称和 URL。然后需要再访问该 URL，抓取更多详细信息，例如全名、年龄和链接。最后需要在单个项目中返回包含（名称、网址、年龄、性别、链接）的数据。我想在一个方法 crawl_page 中定义第一级爬取，在另一个方法 crawl_item 中定义第二级爬取。

class CrawlLink(CrawlSpider):
    """Two-level spider: paginated profile list first, then each profile page.

    Level one (``parse_page``) reads every entry of a list page, starts a
    ``PostUsers`` item with the data available there and requests the
    profile page, carrying the half-filled item along in ``Request.meta``.
    Level two (``parse_user``) completes that same item and returns it, so
    one item per user comes out of the crawl.
    """

    name = "crawllink"
    allowed_domains = ['www.xyz.org']

    # Request URLs must carry a scheme; Scrapy raises "Missing scheme in
    # request url" for a bare "www.xyz.org/...".
    # NOTE(review): "http" is assumed here -- switch to "https" if the site
    # requires it.
    _BASE_URL = 'http://www.xyz.org'

    start_urls = [_BASE_URL + "/profile?page=0"]
    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=r'/profile\?page=\d+',
                restrict_xpaths=('//li[@class="pager-next"]',),
                canonicalize=False,
            ),
            callback='parse_page',
            follow=True,
        ),
    ]

    def parse_start_url(self, response):
        """Scrape the start page as a list page as well.

        CrawlSpider only applies rule callbacks to links it *extracts*; the
        responses for ``start_urls`` go to this hook instead, so without it
        the profiles listed on the first page would be skipped.
        """
        return self.parse_page(response)

    def parse_page(self, response):
        """First level: yield one profile request per entry on a list page.

        Each request carries the partially filled ``PostUsers`` item in
        ``meta["item"]`` so that ``parse_user`` can complete it.
        """
        self.log('Started Crawling List %s' % response.url)
        for entry in response.xpath("//div[@id='profile']/div"):
            hrefs = entry.xpath("./div[@class='name']/a/@href").extract()
            if not hrefs:
                # Not a profile entry (no name link): nothing to follow.
                continue

            item = PostUsers()
            item["url"] = self._BASE_URL + hrefs[0]
            # NOTE(review): this XPath starts with "//", so it searches the
            # whole document rather than this entry and yields the same
            # value for every item. It probably should be relative
            # (".//..."); left unchanged because the page markup is not
            # visible here -- confirm against the real HTML.
            item["namel"] = entry.xpath(
                "//div[@id='user_profile_main']/dl/dd[1]/text()"
            ).extract()

            # Exactly one request per entry; the item travels with it.
            yield Request(
                url=item["url"],
                callback=self.parse_user,
                meta={"item": item},
            )

    def parse_user(self, response):
        """Second level: complete the item with profile-page details.

        Uses the item passed in ``response.meta["item"]`` when present and
        falls back to a fresh ``PostUsers`` otherwise, then returns it.
        """
        self.log('Started Crawling Profile %s' % response.url)
        usr = response.meta.get("item")
        if usr is None:
            # Reached without a first-level item; start an empty one.
            usr = PostUsers()

        relative_urls = response.xpath(
            "//div[@id='nav-content']/ul/li[2]/a/@href"
        ).extract()
        if relative_urls:
            # A missing nav link should not discard the rest of the item.
            usr["link"] = self._BASE_URL + relative_urls[0]
        usr["age"] = response.xpath(
            "//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()"
        ).extract()
        usr["fullname"] = response.xpath(
            "//h1[@id='page-title']/text()"
        ).extract()
        self.log('Finished Crawling Profile %s' % response.url)
        return usr

0 个答案:

没有答案