Scrapy Pipeline - CSV输出 - 输出网页网址

时间:2016-08-18 09:49:53

标签: csv scrapy pipeline

我正在使用管道将scrapy项目中的数据导出到csv文件中。每行从单个scrapy对象获取数据,加上提取日期。

我想知道如何在每行末尾添加提取项目的URL。

在这里你可以看到我的 pipeline 的代码:

def open_spider(self, spider):
    """Open the per-spider output CSV and, for the details spider, write the header.

    Side effects: sets ``self.file`` (a ``csv.writer``) and ``self.file_handle``
    (the underlying file object, kept so it can be flushed/closed later, e.g.
    in a ``close_spider`` method — the original discarded the handle, leaking it).
    """
    if spider.name == "LandSalesLinks":
        # Single-column link list written to the working directory.
        self.file_handle = open('links.csv', 'w')
        self.file = csv.writer(self.file_handle, quoting=csv.QUOTE_NONE)
    elif spider.name == "LandSalesDetails":
        # Dated file name, e.g. "2016-8-18 details.csv" (no zero padding,
        # matching the original str(now.month)/str(now.day) formatting).
        now = datetime.now()
        filepath = u"E:/Dropbox/Αγορά Ακινήτων/Πωλήσεις Γης/Αρχεία προς ένταξη/"
        filename = "{0}-{1}-{2} details.csv".format(now.year, now.month, now.day)
        # NOTE(review): QUOTE_NONE with no escapechar makes csv raise an Error
        # if a field ever contains the ';' delimiter — confirm fields never do.
        self.file_handle = open(filepath + filename, 'w')
        self.file = csv.writer(self.file_handle, delimiter=';',
                               quoting=csv.QUOTE_NONE)
        # Header row; process_item must append fields in exactly this order.
        header = [
            "extraction_date",
            "regionA",
            "regionB",
            "regionC",
            "regionD",
            "location_name",
            "category",
            "area",
            "price",
            "city_plan",
            "structure_factor",
            "coverage_factor",
            "facade_length",
            "facade_count",
            "airy",
            "slope",
            "artio",
            "oikodomisimo",
            "me_adia",
            "ktizei",
            "availability",
            "availability_from",
            "antiparoxi",
            "view",
            "dist_from_sea",
            "paling",
            "supplies",
            "drilling",
            "with_building",
            "corner_plot",
            "mesites",
            "epaggelmatiki_xrisi",
            "dimensions",
            "contains"
        ]
        self.file.writerow(header)

def process_item(self, item, spider):
    """Write one CSV row for each scraped item and pass the item on.

    - ``LandSalesLinks``: a single-column row holding the absolute spec-page
      URL built from the scraped relative link.
    - ``LandSalesDetails``: the extraction date, the four region levels and
      the remaining detail fields, in the same column order as the header
      written by ``open_spider``.

    Returns the item unchanged so later pipeline stages still receive it.
    """
    if spider.name == "LandSalesLinks":
        # Single-column row: absolute URL of the item's spec page.
        row = ["http://www.xe.gr" + str(item['link'][0]) + "?mode=spec"]
        self.file.writerow(row)
        return item

    elif spider.name == "LandSalesDetails":
        row = []
        # Extraction date in d/m/Y with no zero padding (original format).
        now = datetime.now()
        row.append(str(now.day) + "/" + str(now.month) + "/" + str(now.year))

        # The scraped region string looks like "A > B > C > D"; split into up
        # to four levels and pad missing levels with "" (replaces four
        # copy-pasted try/except IndexError blocks).
        reglist = item['region'][0].split(" > ")
        for level in range(4):
            if level < len(reglist):
                # Regions: strip first, then encode (as in the original).
                row.append(reglist[level].strip().encode('utf-8'))
            else:
                row.append("")

        # Remaining fields, in the exact order of the header row. Each
        # scraped value is a (possibly empty) list; empty lists become "".
        detail_fields = (
            'location_name', 'category', 'area', 'price', 'city_plan',
            'structure_factor', 'coverage_factor', 'facade_length',
            'facade_count', 'airy', 'slope', 'artio', 'oikodomisimo',
            'me_adia', 'ktizei', 'availability', 'availability_from',
            'antiparoxi', 'view', 'dist_from_sea', 'paling', 'supplies',
            'drilling', 'with_building', 'corner_plot', 'mesites',
            'epaggelmatiki_xrisi', 'dimensions', 'contains',
        )
        for key in detail_fields:
            value = item[key]
            # Details: encode first, then strip (original append2 order).
            row.append(value[0].encode('utf-8').strip() if value else "")

        self.file.writerow(row)
        return item

1 个答案:

答案 0 :(得分:0)

您只需在项目中添加 url 字段,在爬虫的 parse 方法中设置它:

    def parse(self, response):
        item = MyItem()
        item['url'] = response.url
        yield item

然后在管道中使用它:

def process_item(self, item, spider):
    """Illustrative pipeline step: read the item's URL, then strip the field.

    Pulls ``item['url']`` into a row, removes the field from the item so it
    does not appear in later output, and passes the item on.
    """
    row = [item['url']]
    # The URL is only wanted in the CSV row, not on the item itself.
    del item['url']
    return item

并在管道中:

// NOTE(review): this JavaScript fragment appears unrelated to the Scrapy/Python
// answer above — likely a paste or scraping artifact, and possibly truncated.
// As written it builds a 2-D array: a header row from the keys of the first
// result object, followed by one [x, y, name] row per element of `result`.
const arr = [Object.keys(result[0])]
  .concat(result.map(({x, y, name}) => [x, y, name]))