Scrapy MultiCSVItemPipeline exports some empty items

Date: 2018-04-11 19:14:40

Tags: python python-2.7 csv scrapy export-to-csv

I have multiple spiders with different items, and I want to export each item type to its own csv file. I used the code sample from How can scrapy export items to separate csv files per item, but I'm running into a problem.

Right now my spider only writes the "page" items. All items are populated in the shell, but the other files stay empty. I debugged the pipeline, but so far I haven't found the error.

This is my spider:

import csv

import scrapy
from BeautifulSoup import BeautifulSoup
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.spiders import Rule

from DataSpiders import CSV_PATH
from ScrapingItems import TrierDeItem
from SuperSpider import SuperSpider

HTML_PATH = 'pages/trier.de/'


class TrierDeSpider(scrapy.Spider, SuperSpider):
    name = 'trierDeSpider'

    allowed_domains = ['trier.de']
    denied_domains = []
    start_urls = [
        'https://www.trier.de/rathaus-buerger-in/trier-in-zahlen/',
        'https://trier.de/startseite/',
        'https://www.trier.de/leben-in-trier/',
        'https://www.trier.de/kultur-freizeit/',
        'https://www.trier.de/wirtschaft-arbeit/',
        'https://www.trier.de/bildung-wissenschaft/',
        'https://www.trier.de/bauen-wohnen/',
        'https://www.trier.de/umwelt-verkehr/',
    ]
    # Set the starting point for the spider and start crawling from start_urls
    rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse', follow=True),)

    def parse(self, response):
        """
        Parse the page body for links. Follow links within the allowed domains by issuing new requests, and parse
        each linked page with the parse_page callback.
        :param response:
        :return:
        """
        for link in LxmlLinkExtractor(allow=self.allowed_domains, deny=self.denied_domains).extract_links(response):
            yield scrapy.Request(response.urljoin(link.url), callback=self.parse_page)

    def parse_page(self, response):
        """
        Parse the current page for information.
        :param response: 
        :return: 
        """
        trier_de_item = TrierDeItem()
        yield self.parse_general_page_info(response, HTML_PATH)
        # extract the page url
        trier_de_item["url"] = response.url
        # extract the crawling datetime
        trier_de_item["crawling_date_time"] = response.headers['Date']
        # extract page title
        trier_de_item["title"] = response.css('title::text').extract()
        # extract description tags
        trier_de_item["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        trier_de_item["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        # extract all page headers
        trier_de_item["news_title"] = response.xpath('//div[@class="dachzeile"]/text()').extract()
        # extract topic
        trier_de_item["topic"] = response.xpath('//div[@class="topic"]/text()').extract()
        # extract headlines
        trier_de_item['headlines'] = response.xpath('//h1/text()').extract()

        # check if page contains a table
        table = response.xpath('//table[@class="datentabelle"]').extract()
        if len(table) > 0:
            self.parse_table(response.body, trier_de_item['headlines'][0])
        yield trier_de_item

    @staticmethod
    def parse_table(body_html, title):
        '''
        Parse HTML Page with table and save to csv file
        :param body_html:
        :param title:
        :return:
        '''
        title = title.replace('/', '')
        try:
            # Create Filename from title
            filename = title + '.csv'
            soup = BeautifulSoup(body_html)
            soup.prettify('utf-8')
            content = []
            # find all tables in html
            tables = soup.findAll('table')
            for table in tables:
                # find each table row
                for row in table.findAll('tr'):
                    # extract the text of each table header and data cell into one line
                    line = []
                    for header in row.findAll('th'):
                        if ' ' in header.text:
                            line.append('')
                        else:
                            line.append(header.text)
                    for cell in row.findAll('td'):
                        if ' ' in cell.text:
                            line.append('')
                        else:
                            line.append(cell.text)
                    content.append(line)
            # Open a new csv file and write each line to the file
            with open(CSV_PATH + filename, 'wb') as csv_file:
                wr = csv.writer(csv_file)
                for line in content:
                    wr.writerow(line)
        except Exception as e:
            print(e)
            pass

SuperSpider:

import urlparse

from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor

from DataSpiders import write_html
from DataSpiders.ScrapingItems import PageItem, BaseItem

ALLOWED_FILE_TYPES = ('.pdf', '.csv', '.xls', '.xlsx')


class SuperSpider:
    def __init__(self):
        pass

    def url_join(self, urls, response):
        '''
        Join URL with response
        :param urls:
        :param response:
        :return:
        '''
        joined_urls = []
        for url in urls:
            joined_urls.append(response.urljoin(url))

        return joined_urls

    def parse_general_page_info(self, response, HTML_PATH):
        page_item = PageItem()
        page_item["url"] = response.url
        # extract response body
        if 'jsp' in response.url:
            url = response.url.split('.jsp')
            write_html(url[0], response.body, HTML_PATH)
        elif '?' in response.url:
            url = response.url.split('?')
            write_html(url[0], response.body, HTML_PATH)
        else:
            write_html(response.url, response.body, HTML_PATH)
        # Search for files that contain any allowed file type
        found_files = []
        domain = response.url.split('/')[2]
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith(ALLOWED_FILE_TYPES):
                link = urlparse.urljoin(domain, link)
                found_files.append(link)
        # extract all refering links
        extractor = LxmlLinkExtractor()
        linklist = []
        for link in extractor.extract_links(response):
            # extract links which contain a file in url and add those to 'found_files' for downloading
            if '?imgUid' in link.url:
                fullpath = link.url
                path = fullpath.split('.de')[1]
                found_files.append(urlparse.urljoin(domain, path))
            else:
                linklist.append(link.url)
        page_item["links"] = linklist
        # add all found files to the page item
        page_item["file_urls"] = self.url_join(found_files, response)
        # extract page title
        page_item["title"] = response.css('title::text').extract()

        # extract all image urls
        relative_img_urls = response.css("img::attr(src)").extract()
        page_item["image_urls"] = self.url_join(relative_img_urls, response)

        return page_item

    def parse_base_page_information(self, response):
        baseItem = BaseItem()
        baseItem["url"] = response.url
        # extract page title
        baseItem["title"] = response.css('title::text').extract()
        baseItem["crawling_date_time"] = response.headers['Date']
        # extract description tags
        baseItem["description"] = response.xpath('//meta[@name="description"]/@content').extract()
        baseItem["og_description"] = response.xpath('//meta[@name="og:description"]/@content').extract()
        baseItem['headlines'] = response.xpath('//h1/text()').extract()
        return baseItem

ScrapingItems:

from scrapy import Item, Field


class PageItem(Item):
    url = Field()
    title = Field()
    image_urls = Field()
    file_urls = Field()
    links = Field()


class BaseItem(Item):
    url = Field()
    title = Field()
    crawling_date_time = Field()
    description = Field()
    og_description = Field()
    headlines = Field()


class TrierDeItem(BaseItem):
    news_title = Field()
    tag = Field()
    topic = Field()

Multi CSV Pipeline:

# Imports assumed by this snippet (Scrapy 1.x / Python 2.7); they are not shown in the original post
from scrapy import signals
from scrapy.exporters import CsvItemExporter
from scrapy.xlib.pydispatch import dispatcher


class MultiCSVItemPipeline(object):
    CSVPath = "csv_data/"
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open(self.CSVPath + name + '.csv', 'ab')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            self.exporters[what].export_item(item)
        return item


def item_type(item):
    '''
    Returns the scraping item name
    :param item:
    :return:
    '''
    return type(item).__name__.replace('Item', '').lower()
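
For illustration (this example is not part of the original post), item_type() maps each item class to the lowercase key expected in SaveTypes:

# Hypothetical usage, assuming the items defined in ScrapingItems above are imported
print(item_type(PageItem()))     # -> 'page'
print(item_type(TrierDeItem()))  # -> 'trierde'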

I haven't found a solution yet, but I have tried several things that all failed:

  • Yielding a list of items, which does not work with Scrapy
  • Yielding only one item, for both page_item and trier_item
  • Creating two separate parse methods
  • Removing all SaveTypes except 'trierde'; the spider then didn't write anything at all

So, given the options I have already tried, I believe there is some error in the pipeline itself... I'd appreciate any help.

Additional information: before changing the pipeline to MultiCSV, I was able to save every item to csv.
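
For reference, the question doesn't show how that earlier export was configured; a common single-file setup with Scrapy 1.x's built-in feed export looks roughly like this (a minimal sketch; the output file name is an assumption):

# settings.py -- built-in feed export, writes all yielded items into a single CSV
FEED_FORMAT = 'csv'
FEED_URI = 'items.csv'  # output path chosen for illustration only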

1 Answer:

Answer (score: 1):

After I couldn't solve the problem with the Scrapy exporters, I decided to write my own exporter.

Below is the code for everyone who wants to export multiple different items from one or more spiders into separate csv files. It has worked for me so far, but I'm still checking the code for errors. If you have ideas for improvement, feel free to reply.

# Imports assumed by this snippet (Python 2.7); not shown in the original post.
# item_type() is the helper function defined in the question above.
import csv
import logging
import os
import types

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher


class MultiCSVItemPipeline(object):
    # Subfolder path, where the csv files are stored
    CSVPath = "csv_data/"
    # All allowed items
    SaveTypes = ['page', 'base', 'trierde', 'triermitgestalten', 'teleport', 'lokalocomment', 'lokalo', 'lokalonews']
    # List for already checked csv headers
    CheckedHeaders = []

    def __init__(self):
        import sys
        reload(sys)
        sys.setdefaultencoding('utf8')
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        # Check if the csv files exist and create new ones if not
        for file in set(self.SaveTypes):
            f = open(self.CSVPath + file + '.csv', 'a+')
            f.close()

    def spider_closed(self, spider):
        #  not needed anymore
        # [e.finish_exporting() for e in self.exporters.values()]
        # [f.close() for f in self.files.values()]
        pass

    def process_item(self, item, spider):
        what = item_type(item)
        if what in set(self.SaveTypes):
            try:
                # Check whether the csv file already has a header, but only for item types that haven't been checked yet
                if what not in self.CheckedHeaders:
                    self.check_header(what, item)
                self.write_item_to_row(item, what)
            except Exception as e:
                logging.error("########################################################")
                logging.error("Error writing to " + what + ".csv file ")
                logging.error("Error Message: " + e.message)
                logging.error("Error Reason: " + e.reason)
                logging.error("Error Object: " + e.object)
                logging.error("########################################################")
        return item

    def write_item_to_row(self, item, what):
        """
        Write a single item to a row in csv file
        :param item:
        :param what:
        :return:
        """
        ofile = open(self.CSVPath + what + '.csv', "ab")
        writer = csv.writer(ofile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        item_dict = item.__dict__['_values']
        row = []
        for k in item_dict:
            d = item_dict[k]
            # If the value is not a list, join it into a string; otherwise join the list with commas. Strip tabs/newlines and encode as utf-8
            if not isinstance(d, types.ListType):
                value = ''.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            else:
                value = ','.join(item_dict[k]).replace('\t', '').replace('\n', '').encode('utf8')
            row.append(value)
        writer.writerow(row)
        ofile.close()

    def check_header(self, what, item):
        """
        Check if the file contains header elements and create if missing
        :param what:
        :param item:
        :return:
        """
        try:
            with open(self.CSVPath + what + '.csv', 'ab+') as csvfile:
                writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                item_dict = item.__dict__['_values']
                # If file is empty, create new csv header
                if os.stat(self.CSVPath + what + '.csv').st_size == 0:
                    self.write_csv_header(item_dict, writer)
                else:
                    # Read first row and check header elements
                    read_csv = csv.reader(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
                    first_row = read_csv.next()
                    # if not all headers are set in the csv file, print warning
                    if not self.check_key_in_csv_header(item_dict, first_row):
                        # TODO: Add missing header to the csv file
                        logging.warning("Wrong headers for file " + what + ".csv")
                self.CheckedHeaders.append(what)
                csvfile.close()
                return True
        except Exception as e:
            logging.error(e.message)
            return False

    @staticmethod
    def write_csv_header(item_dict, writer):
        """
        Write header of a csv file.
        The header is written from the keys of the scrapy item.
        :param item_dict:
        :param writer:
        :return:
        """
        first_row = []
        for k in item_dict:
            # Join each Key to a string, delete delimiters and encode to utf-8
            value = ''.join(k).replace('\t', '').replace('\n', '').encode('utf8')
            first_row.append(value)
        writer.writerow(first_row)

    @staticmethod
    def check_key_in_csv_header(item_dict, row):
        """
        Check, for each item key, if it's contained in the first line of the csv
        k (key) stands for each dictionary key of the scrapy item.
        :param item_dict:
        :param row:
        :return:
        """
        for k in item_dict:
            if k not in row:
                return False
        return True
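
For completeness, a pipeline like this only runs if it is registered in the project settings. A minimal sketch, assuming the pipeline lives in a module such as DataSpiders.pipelines (the module path is not given in the post):

# settings.py -- register the custom pipeline (module path assumed)
ITEM_PIPELINES = {
    'DataSpiders.pipelines.MultiCSVItemPipeline': 300,
}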