How to crawl all websites with Scrapy

Date: 2015-12-31 06:49:45

Tags: python scrapy

# -*- coding: utf-8 -*-

__author__ = 'onecue'

import scrapy

from community.items import CommunityItem
from datetime import datetime
import re

class CommunitySpider(scrapy.Spider):
    name = "communityCrawler"

    start_urls = []

    def start_requests(self):
        # only page 1 for now; widen the range to crawl more list pages
        for i in range(1, 2):
            yield scrapy.Request("http://www.clien.net/cs2/bbs/board.php?bo_table=park&page=%d" % i, self.parse_clien)
            yield scrapy.Request("http://www.bobaedream.co.kr/list?code=freeb&page=%d" % i, self.parse_bobae)

    def parse_clien(self, response):
        for sel in response.xpath('//tbody/tr[@class="mytr"]'):
            item = CommunityItem()
            item['Title'] = sel.xpath('td[@class="post_subject"]/a/text()').extract()[0]
            dateTmp = datetime.strptime(sel.xpath('td/span/@title').extract()[0], "%Y-%m-%d %H:%M:%S")
            item['Description'] = dateTmp.strftime("%Y-%m-%d %H:%M:%S")

            yield item

    def parse_bobae(self, response):
        for sel in response.xpath('//tbody/tr[@itemtype="http://schema.org/Article"]'):
            item = CommunityItem()

            date_now = datetime.now()

            date_str_tmp = sel.xpath('td[@class="date"]/text()').extract()[0]

            # posts from today show only an "HH:MM" time, older posts show "MM/DD";
            # normalize both into a full "yy/mm/dd HH:MM:SS" string
            prog = re.compile('[0-9]{2}:[0-9]{2}')
            if prog.match(date_str_tmp):
                date_str = date_now.strftime('%y/%m/%d') + ' ' + date_str_tmp + ':00'
            else:
                date_str = date_now.strftime('%y/') + date_str_tmp + ' ' + '00:00:00'

            dateTmp = datetime.strptime(date_str, "%y/%m/%d %H:%M:%S")

            item['Title'] = sel.xpath('td[@class="pl14"]/a/text()').extract()[0]
            item['Description'] = dateTmp.strftime("%Y-%m-%d %H:%M:%S")

            yield item
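
(For reference, the community.items module imported above is not shown in the post; a minimal items.py matching the two fields the spider fills might look like the sketch below. This is an assumption inferred from the code, not the asker's actual file.)

# items.py - hypothetical sketch, inferred from the fields used above
import scrapy

class CommunityItem(scrapy.Item):
    Title = scrapy.Field()        # post title
    Description = scrapy.Field()  # here: the post's timestamp string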

I built this community crawler, but it only scrapes the pages I wrote rules for. I want the crawler to follow links and crawl whole websites, the way Google's web spider does. How can I do that?

1 Answer:

Answer 0 (score: 0)

See the code below and try to understand the concept. Note that this code is untested, so you will need to double-check the XPaths in particular, but hopefully it gives you an idea of how this works. You can't go the Google route on your first try; Scrapy is very good, but not like that :) Please read more of the documentation, it's all in there, it just takes a little patience.

# start with just one website at a time to get the concept
start_urls = ["http://www.clien.net/cs2/bbs/board.php?bo_table=park&page=1"]


# parse() is the default callback Scrapy invokes with the response
# for each link defined in start_urls

def parse(self, response):
    # here you look for links to individual posts on the list page
    for sel in response.xpath('//td[@class="post_subject"]/a/@href'):
        link = response.urljoin(sel.extract())
        yield scrapy.Request(link, callback=self.parsenextlevel)


def parsenextlevel(self, response):
    # here you parse items from the links that were retrieved in the parse
    # method, e.g. http://www.clien.net/cs2/bbs/board.php?bo_table=park&wr_id=30438421
    # or http://www.clien.net/cs2/bbs/board.php?bo_table=park&wr_id=1068932.
    # If you need to dig deeper, keep nesting the requests in further callbacks.
    item = CommunityItem()
    item['Title'] = response.xpath('//div[@class="view_title"]/h4/span/text()').extract()
    yield item

Edit: a better name for the parsenextlevel callback would be parseitem, since that is what it does in this example, but you define that name yourself.
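
For the broader goal in the question, following links across a whole site automatically, Scrapy also provides the CrawlSpider class with link-extraction rules. Below is a minimal, untested sketch; the spider name, the allowed_domains value, and the catch-all rule are assumptions for illustration, not code from the original post.

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BroadSpider(CrawlSpider):
    # hypothetical names; without allowed_domains the crawl
    # would wander off across the entire web
    name = "broadCrawler"
    allowed_domains = ["clien.net"]
    start_urls = ["http://www.clien.net/cs2/bbs/board.php?bo_table=park"]

    # follow every link the extractor finds and run parse_item on each page
    rules = (
        Rule(LinkExtractor(), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # placeholder extraction; swap in real item fields here
        yield {"url": response.url}

A genuinely Google-scale crawl additionally needs the broad-crawl tuning described in the Scrapy docs (concurrency, scheduling order, duplicate filtering), so treat this sketch only as the structural starting point.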