How to write Python Scrapy code to extract the URLs in a website's sitemap and export them to CSV

Asked: 2018-12-26 20:07:29

Tags: python scrapy web-crawler sitemap

I found a working solution here for writing Python Scrapy code that extracts the URLs in a site's sitemap, but I don't know how to export the data to a CSV file!

When I try to run scrapy crawl myspider -o mydata.csv, it returns an empty CSV file, but the list of URLs gets printed on the screen!

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests

class GetpagesfromsitemapSpider(SitemapSpider):
    name = "myspider"
    handle_httpstatus_list = [404]

    def parse(self, response):
       print(response.url)

    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                self.logger.info('Ignoring invalid sitemap: %s', response.url)
                return

            s = Sitemap(body)
            sites = []
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            sites.append(loc)
                            break
            print(sites)

    def __init__(self, spider=None, *a, **kw):
            super(GetpagesfromsitemapSpider, self).__init__(*a, **kw)
            self.spider = spider
            l = []
            url = "http://www.example.com/"
            resp = requests.head(url + "/sitemap.xml")
            if (resp.status_code != 404):
                l.append(resp.url)
            else:
                resp = requests.head(url + "/robots.txt")
                if (resp.status_code == 200):
                    l.append(resp.url)
            self.sitemap_urls = l
            print(self.sitemap_urls)

def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l

1 Answer:

Answer 0 (Score: 1):

First of all, you don't make any request there, and you also combine scrapy with requests, which I don't think is the best idea. Try changing __init__ to:

    def start_requests(self):
        l = []
        url = "http://www.example.com"
        l.append(url + '/sitemap.xml')
        l.append(url + '/robots.txt')
        for link in l:
            yield Request(link, callback=self._parse_sitemap)
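
As a hedged aside (not part of the original answer): if you only need the stock behaviour, SitemapSpider can be driven entirely by its sitemap_urls class attribute instead of start_requests. Note that this default fetches every page listed in the sitemap rather than just reading the sitemap itself, so this is only a sketch of the alternative; the spider name and URL below are placeholders.

from scrapy.spiders import SitemapSpider

class MinimalSitemapSpider(SitemapSpider):
    name = "minimal_sitemap"
    # robots.txt or sitemap.xml URLs; sitemap indexes are followed automatically
    sitemap_urls = ['http://www.example.com/robots.txt']

    def parse(self, response):
        # called once per page listed in the sitemap (each page is downloaded),
        # so yielding a dict here gives one CSV row per URL with -o mydata.csv
        yield {'loc': response.url}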

Also, your self._parse_sitemap should yield a dict-like item or a Request (and this goes not only for self._parse_sitemap but for every callback in your Scrapy spider):

def _parse_sitemap(self, response):

    # handle status responses here (200, 401, etc.)
    body = self._get_sitemap_body(response)
    if body is None:
        self.logger.info('Ignoring invalid sitemap: %s', response.url)
        return

    s = Sitemap(body)
    sites = []  # collect dict-like items; these are what populate your .csv file
    if s.type == 'sitemapindex':
        for loc in iterloc(s, self.sitemap_alternate_links):
            if any(x.search(loc) for x in self._follow):
                yield Request(loc, callback=self._parse_sitemap)
    elif s.type == 'urlset':
        for loc in iterloc(s):
            for r, c in self._cbs:
                if r.search(loc):
                    sites.append({'loc': loc})  # a dict-like item per matching URL
                    break
    for site in sites:
        yield site  # yield instead of print! this is what fills your .csv file
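
To make "dict-like item" concrete (this illustration is not from the original answer), the same data can also be declared as a scrapy.Item, which fixes the column name in the exported CSV; the item and field names below are made up:

import scrapy

class SitemapUrlItem(scrapy.Item):
    # one Field == one column ("loc") in mydata.csv
    loc = scrapy.Field()

# in the callback you would then yield SitemapUrlItem(loc=loc) instead of a plain dict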

The whole file (it may not work as-is, but it illustrates the idea):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import SitemapSpider
from scrapy.spiders import Spider
from scrapy.http import Request, XmlResponse
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
from scrapy.utils.gz import gunzip, is_gzipped
import re
import requests

class GetpagesfromsitemapSpider(SitemapSpider):
    name = "myspider"
    handle_httpstatus_list = [404]

    def parse(self, response):
       print(response.url)

    def _parse_sitemap(self, response):
        # handle status responses here (200, 401, etc.)
        body = self._get_sitemap_body(response)
        if body is None:
            self.logger.info('Ignoring invalid sitemap: %s', response.url)
            return

        s = Sitemap(body)
        sites = []  # collect dict-like items; these are what populate your .csv file
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        sites.append({'loc': loc})  # a dict-like item per matching URL
                        break
        for site in sites:
            yield site  # yield instead of print! this is what fills your .csv file

    def start_requests(self):
        l = []
        url = "http://www.example.com"
        l.append(url + '/sitemap.xml')
        l.append(url + '/robots.txt')
        for link in l:
            yield Request(link, callback=self._parse_sitemap)

def iterloc(it, alt=False):
    for d in it:
        yield d['loc']

        # Also consider alternate URLs (xhtml:link rel="alternate")
        if alt and 'alternate' in d:
            for l in d['alternate']:
                yield l
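
As a closing usage note (an assumption on my part, not from the original answer): once the callbacks yield items, scrapy crawl myspider -o mydata.csv should fill the CSV through Scrapy's feed exporter. The sketch below shows the equivalent standalone invocation with CrawlerProcess, assuming the spider class is importable from a hypothetical myspider.py:

from scrapy.crawler import CrawlerProcess

from myspider import GetpagesfromsitemapSpider  # hypothetical module path

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'csv',      # export as CSV
    'FEED_URI': 'mydata.csv',  # one row per yielded item
})
process.crawl(GetpagesfromsitemapSpider)
process.start()  # blocks until the crawl finishes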