Scrapy requests are not running the callback for all returned URLs

Asked: 2019-04-21 14:00:46

Tags: python scrapy

I recently started a project that scrapes Google Play Store apps for the Vietnam region, and I noticed that the requests do not run the callback function for all of the URLs that get extracted.
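I suspect Scrapy's built-in duplicate-request filter may be involved (this is an assumption, not something I have confirmed): the default RFPDupeFilter silently drops any request whose fingerprint has already been seen, so the callback never runs for it. A minimal sketch, using a hypothetical app id, showing a repeated request being dropped and the dont_filter=True escape hatch:

import scrapy


class DedupeDemoSpider(scrapy.Spider):
    # Toy spider illustrating duplicate filtering; the URLs are only examples.
    name = 'dedupe_demo'
    start_urls = ['https://play.google.com/store/apps']

    def parse(self, response):
        url = 'https://play.google.com/store/apps/details?id=com.example.app'
        yield scrapy.Request(url, callback=self.parse_detail)
        # Identical fingerprint: dropped by the dupefilter, so parse_detail
        # will NOT run a second time for this URL.
        yield scrapy.Request(url, callback=self.parse_detail)
        # dont_filter=True bypasses the dupefilter, so this one is scheduled.
        yield scrapy.Request(url, callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        self.logger.info('callback ran for %s', response.url)

Here is my actual spider: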

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy


class GooglePlayStoreSpider(CrawlSpider):
    name = 'google_play'
    allowed_domains = ['play.google.com']
    start_urls = ['http://play.google.com']

    rules = (
        # Follow every app detail link; URLs are rewritten by process_links first
        Rule(LinkExtractor(allow=(r'https://play\.google\.com/store/apps/details',)),
             follow=True,
             process_links='process_links',
             callback='parse_1'),
    )

    crawled_ids = []    # app ids already queued; prevents duplicate detail requests
    first_init = False  # category links are extracted from the start URL only once

    def parse_start_url(self, response):
        # Seed the crawl from the start page: follow each category listing once
        if not self.first_init:
            self.first_init = True
            extractor = LinkExtractor(allow=('/store/apps/category/.*',))
            raw_links = extractor.extract_links(response)
            links = self.process_links(raw_links)
            return [scrapy.Request(link.url) for link in links]

    def process_links(self, links):
        new_links = []
        for link in links:
            old_url = link.url

            if not old_url.startswith('https://play.google.com/store/apps/'):
                continue

            old_url_obj = urlparse(old_url)
            old_url_query = dict(parse_qsl(old_url_obj.query))

            # Skip detail pages whose app id has already been queued
            if old_url_obj.path == '/store/apps/details':
                app_id = old_url_query.get('id')
                if app_id is None or app_id in self.crawled_ids:
                    continue
                self.crawled_ids.append(app_id)

            # Force English text and the Vietnam storefront
            old_url_query['hl'] = 'en'
            old_url_query['gl'] = 'vn'
            link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc,
                                             old_url_obj.path, urlencode(old_url_query))
            new_links.append(link)

        return new_links

    def parse_1(self, response):
        # A matched detail page: follow its "see more" links to listing pages.
        # The class string is tied to Google Play's current markup and may break.
        urls = response.xpath(
            '//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb  id-track-click "]/@href').extract()

        for url in urls:
            if not url.startswith('https://play.google.com/'):
                url = 'https://play.google.com' + url
            yield Request(url, callback=self.parse_next, dont_filter=True)

    def parse_next(self, response):
        # A category/listing page: queue every app detail link it contains
        app_urls = response.xpath('//div[@class="details"]/a[@class="title"]/@href').extract()

        for url in app_urls:
            # Assumes the href already carries a query string (e.g. ?id=...),
            # so the extra parameters are appended with '&'
            url = 'https://play.google.com' + url + '&hl=en&gl=vn'
            yield Request(url, callback=self.parse_detail, dont_filter=True)

    def parse_detail(self, response):
        # Scrape the structured metadata from an app detail page
        print("Parsed ======= ", response.request.url)

        item = dict()
        item['name'] = response.xpath('//div[@itemscope]//meta[@itemprop="name"]/@content').extract_first()
        item['category'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="applicationCategory"]/@content').extract_first()
        item['review_score'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="ratingValue"]/@content').extract_first()
        item['review_count'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="reviewCount"]/@content').extract_first()
        item['link'] = response.request.url
        item['id'] = dict(parse_qsl(urlparse(response.request.url).query)).get('id')
        item['content_rating'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="contentRating"]/@content').extract_first()
        item['image'] = response.xpath('//div[@itemscope]//meta[@itemprop="image"]/@content').extract_first()
        item['price'] = response.xpath('//div[@itemscope]//meta[@itemprop="price"]/@content').extract_first()
        item['price_currency'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="priceCurrency"]/@content').extract_first()
        # item['operating_system'] = response.xpath('//div[@itemscope]//meta[@itemprop="operatingSystem"]/@content').extract_first()
        return item

When I look at the terminal, it says it crawled 100 pages but scraped only 15 (the numbers are for illustration only). Please help.
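To see which requests are actually being dropped, Scrapy's standard settings can log every filtered request (a diagnostic sketch using documented setting names, not a confirmed fix):

# settings.py, or custom_settings on the spider
DUPEFILTER_DEBUG = True   # log every duplicate request that is dropped,
                          # not just the first occurrence
LOG_LEVEL = 'DEBUG'

The stats dump at the end of the run should then show where the missing pages went: 'dupefilter/filtered' counts requests dropped as duplicates, and 'offsite/filtered' counts requests dropped for falling outside allowed_domains.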

0 Answers:

No answers yet