Adding data from multiple scrapers to a CSV file with the same header

Time: 2017-09-10 17:40:01

Tags: python python-2.7 csv export-to-csv

I have two scrapers that produce output in the same format: Link, Description. I can get each of them to write its own CSV file, but I can't get them to write into the same CSV file; it ends up containing the data from only one of the scrapers.

I tried opening the file with 'a' (append) when exporting the data, but the file still only reflects the data from the first scraper.

My question is:

How can I append the data from both scrapers to the same CSV file, so that each scraper's output is added as new rows?

Here is an example of the structure I want in the CSV:

HEADER: [Link, Description]
ROW 1:  [Link from scraper 1, Description from scraper 1]
ROW 2:  [Link from scraper 1, Description from scraper 1]
ROW 3:  [Link from scraper 2, Description from scraper 2]
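
To make it concrete, here is a minimal sketch of the pattern I am after (Python 2.7, per the tags; the append_rows helper, the filename and the sample rows are hypothetical placeholders):

import csv
import os

def append_rows(filename, rows):
    # write the header only when the file doesn't exist yet or is empty
    write_header = not os.path.exists(filename) or os.stat(filename).st_size == 0
    with open(filename, 'ab') as f:  # 'ab' for the csv module on Python 2
        writer = csv.writer(f)
        if write_header:
            writer.writerow(["Link", "Description"])
        writer.writerows(rows)

# each scraper would call this with its own rows
append_rows('jobs.csv', [("http://example.com/1", "Job from scraper 1")])
append_rows('jobs.csv', [("http://example.com/2", "Job from scraper 2")])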

Scraper code 1

import csv
import requests
from bs4 import BeautifulSoup

outfile = open('DeloitteImplementTest.csv','a')
writer = csv.writer(outfile)
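# note: this header row is appended on every run, since mode 'a' never truncates the file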
writer.writerow(["job_link", "job_desc"])

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("View Position","").encode('utf-8').strip()
    writer.writerow([item_link, item_text])
    print(item_link, item_text)
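
# note: 'outfile' is never closed here; rows sit in the write buffer until the
# script exits, and an explicit outfile.close() would guarantee they are flushed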

Scraper code 2

import csv
import requests
from bs4 import BeautifulSoup

outfile = open('DeloitteImplementTest.csv','a')
writer = csv.writer(outfile)
writer.writerow(["job_link", "job_desc"])

res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "Graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'COPENHAGEN' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("View Position","").encode('utf-8').strip()
                writer.writerow([item_link, item_text])
                print(item_link, item_text)

Edited code

#!/usr/bin/env python

import requests
from bs4 import BeautifulSoup
import csv
import os

class createCSV:
    def __init__(self, filename):
        try:
            self.csvFile = open(filename,'ab')
            headers = ['Link','Description']
            self.writer = csv.DictWriter(self.csvFile, delimiter='\t', fieldnames=headers)

            if os.stat(filename).st_size == 0:  # write header only once
                self.writer.writeheader()       
        except Exception, error:
            print error

    def write_row(self,link,desc):
        self.writer.writerow({'Link':link, 'Description':desc})

    def __del__(self):
        self.csvFile.close()

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")

# here we create "test10.csv", which
# we will use to append values to
outfile = createCSV('test10.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("View Position","").encode('utf-8').strip()
    # append values to "test10.csv"
    outfile.write_row(item_link, item_text)

# Remember that for the second scraper to write to the same .csv file
# as the first one, you need to use the same 'createCSV' object - which
# in this case is "outfile".


res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "Graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'COPENHAGEN' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("View Position","").encode('utf-8').strip()
                # we use the same 'createCSV' object
                outfile.write_row(item_link, item_text)

Latest code

import requests
from bs4 import BeautifulSoup
import csv
import os

class createCSV:
    def __init__(self, filename):
        try:
            self.csvFile = open(filename,'ab')
            headers = ['Link','Description']
            self.writer = csv.DictWriter(self.csvFile, delimiter='\t', fieldnames=headers)

            if os.stat(filename).st_size == 0:  # write header only once
                self.writer.writeheader()       
        except Exception, error:
            print error
    def write_row(self,link,desc):
        self.writer.writerow({'Link':link, 'Description':desc})

    def __del__(self):
        self.csvFile.close()

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")
outfile = createCSV('TotalOutput2.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("View Position","").encode('utf-8').strip()
    outfile.write_row(item_link, item_text)

res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")
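# note: this creates a second createCSV object for the same file, rather than
# reusing the 'outfile' object from above as the edited code recommends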
outfile = createCSV('TotalOutput2.csv')

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "Graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'COPENHAGEN' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("View Position","").encode('utf-8').strip()
                outfile.write_row(item_link, item_text)

1 answer:

Answer 0 (score: 0):

A simple solution would be to create a class to handle the csv appending process. The class looks like this:

import csv
import os

class createCSV:
    def __init__(self, filename):
        try:
            self.csvFile = open(filename,'ab')
            headers = ['Link','Description']
            self.writer = csv.DictWriter(self.csvFile, delimiter='\t', fieldnames=headers)

            if os.stat(filename).st_size == 0:  # write header only once
                self.writer.writeheader()
        except Exception, error:
            print error

    def write_row(self,link,desc):
        self.writer.writerow({'Link':link, 'Description':desc})

    def __del__(self):
        self.csvFile.close()

As you can see, every time we want to append new items to the csv file we check the file's size via st_size, so the csv header is written only once, when the file is still empty.

UPDATE

Now you can combine it with your scrapers:

#!/usr/bin/env python

import requests
from bs4 import BeautifulSoup
import csv
import os

class createCSV:
    def __init__(self, filename):
        try:
            self.csvFile = open(filename,'ab')
            headers = ['Link','Description']
            self.writer = csv.DictWriter(self.csvFile, delimiter='\t', fieldnames=headers)

            if os.stat(filename).st_size == 0:  # write header only once
                self.writer.writeheader()
        except Exception, error:
            print error

    def write_row(self,link,desc):
        self.writer.writerow({'Link':link, 'Description':desc})

    def __del__(self):
        self.csvFile.close()

res = requests.get("http://deloittedk.easycruit.com/?_sp=136ecff9b65625bf.1504382903200&icid=top_").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")

# here we create "test10.csv", which
# we will use to append values to
outfile = createCSV('test10.csv')

for link in links:
    item_link = link.get("href").strip()
    item_text = link.text.replace("View Position","").encode('utf-8').strip()
    # append values to "test10.csv"
    outfile.write_row(item_link, item_text)

# Remember that for the second scraper to write to the same .csv file
# as the first one, you need to use the same 'createCSV' object - which
# in this case is "outfile".

res = requests.get("http://implementconsultinggroup.com/career/#/6257").text
soup = BeautifulSoup(res,"lxml")
links = soup.find_all("a")

for li in soup.find('ul', class_='list-articles list').find_all('li'):
    level = li.find_all('dd', {'class': 'author'})[1].get_text()
    if "Graduate" in level:
        links = li.find_all(href=True)
        for link in links:
            if "career" in link.get("href") and 'COPENHAGEN' in link.text:
                item_link = link.get("href").strip()
                item_text = link.text.replace("View Position","").encode('utf-8').strip()
                # we use the same 'createCSV' object
                outfile.write_row(item_link, item_text)
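
One caveat: relying on __del__ to close the file means buffered rows are only flushed whenever the object happens to be garbage collected. A sketch of a variant with an explicit close() (same assumptions as above, with a hypothetical sample row):

import csv
import os

class createCSV:
    def __init__(self, filename):
        self.csvFile = open(filename, 'ab')
        headers = ['Link', 'Description']
        self.writer = csv.DictWriter(self.csvFile, delimiter='\t', fieldnames=headers)
        if os.stat(filename).st_size == 0:  # write header only once
            self.writer.writeheader()

    def write_row(self, link, desc):
        self.writer.writerow({'Link': link, 'Description': desc})

    def close(self):
        self.csvFile.close()  # flushes any buffered rows to disk

outfile = createCSV('test10.csv')
try:
    # both scrapers would write through this same object
    outfile.write_row('http://example.com/job', 'Example job')
finally:
    outfile.close()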