首先,我是Python的新手。我正在尝试将多个CSV文件中的数据合并为一个CSV。以下是这些CSV文件的格式:
file1.csv
Country of Residence,2014-04,2015-04
NORTH AMERICA ,"5,514","6,160"
Canada ,"2,417","2,864"
U.S.A. ,"3,097","3,296"
LATIN AMERICA & THE CARIBBEAN ,281,293
WESTERN EUROPE ,"37,369","34,964"
Austria ,893,666
Belgium ,867,995
file2.csv
Country of Residence,2014-11,2015-11
LATIN AMERICA & THE CARIBBEAN ,373,418
Argentina ,47,50
Brazil ,68,122
Chille ,24,30
Colombia ,31,25
Others ,203,191
WESTERN EUROPE-OTHERS ,1330,1367
Croatia ,77,72
Greece ,408,452
Ireland ,428,343
Finland ,149,178
Portugal ,211,261
Others ,57,61
在最终的CSV中,我希望有一个不重复的标题列表,例如:
Country of Residence,2014-04,2015-04,2014-05,2015-05,...,2014-11,2015-11
NORTH AMERICA ,"5,514","6,160",NaN,NaN,...
Portugal,NaN,NaN,NaN,NaN,...,211,261
此外,我希望国家/地区列表也是唯一的,这样我就可以在读取各个CSV文件时把数字填入对应的位置。
在下面的代码中,我已经得到了不重复的列标题,但我不知道如何让Country列保持唯一,并根据国家/地区和年月填入相应的数字。
非常感谢任何帮助。
import csv
import glob
import os


def read_header(filename):
    """Return the first row of the CSV file *filename* as a list of strings.

    An empty file yields an empty list instead of raising.
    """
    # 'utf-8-sig' drops a leading UTF-8 byte-order mark while decoding, so the
    # first column label needs no manual clean-up; newline='' is what the csv
    # module expects for files it reads.
    with open(filename, 'r', encoding='utf-8-sig', newline='') as f:
        return next(csv.reader(f), [])


def unique_headers(hdrList):
    """Return every column label found in *hdrList* once, in first-seen order.

    *hdrList* is a list of ``(length, header_row)`` tuples.
    """
    hdrs = []
    for _, hdr in hdrList:
        for name in hdr:
            if name not in hdrs:
                hdrs.append(name)
    return hdrs


hdrList = []
for filename in glob.iglob(os.path.join('/Documents/stats/csv', '*.csv')):
    hdr = read_header(filename)
    if hdr:
        hdrList.append((len(hdr), hdr))
# Tuples sort by header length first, then by header content.
hdrList.sort()
hdrs = unique_headers(hdrList)
# One empty cell per output column (presumably a blank row template).
template = [''] * len(hdrs)
答案 0 :(得分:0)
此代码可让您走上正轨。注意:它是为Python 3编写的。
import csv
import glob
import os
import sys
class CountryData:
    """The two figures recorded for a single country in one input file."""

    def __init__(self, val1, val2):
        # The meaning of the two figures is not documented; they are stored
        # exactly as received.
        self.val1, self.val2 = val1, val2
class ResidenceData:
    """Everything collected for one period of residence.

    A new instance starts with empty start/end labels and an empty
    ``countries`` dictionary.
    """

    def __init__(self):
        # Each instance gets its own dictionary.
        self.countries = {}
        self.end_date = ""
        self.start_date = ""
def load_residence_data(filename):
    """Read one CSV file and return its contents as a ``ResidenceData``.

    The row whose first cell is ``Country of Residence`` supplies the two
    column labels. Every other non-empty row is unpacked as
    ``country, value, value``; the country name is stripped of surrounding
    whitespace before it is used as a key.

    Raises:
        ValueError: if a data row does not have exactly three cells.
    """
    residence_data = ResidenceData()
    # 'utf-8-sig' removes a leading UTF-8 byte-order mark while decoding, so
    # the header cell compares equal without manual clean-up; newline='' is
    # what the csv module expects for files it reads.
    with open(filename, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank line: nothing to record.
                continue
            if row[0] == 'Country of Residence':
                residence_data.start_date = row[1]
                residence_data.end_date = row[2]
            else:
                country, val1, val2 = row
                # NOTE: a country name that occurs twice in one file keeps
                # only the values of its last row.
                residence_data.countries[country.strip()] = CountryData(val1, val2)
    return residence_data


def write_merged_csv(out, residence_data_list, countries):
    """Write the merged table to the text stream *out* as CSV.

    The header row holds ``Country of Residence`` followed by each entry's
    start and end labels. One row per country follows, in sorted order; an
    entry that has no data for a country contributes ``NaN,NaN``.
    """
    # csv.writer quotes cells that contain the delimiter, so a value such as
    # "5,514" stays one cell instead of splitting into two columns.
    writer = csv.writer(out, lineterminator='\n')

    header = ['Country of Residence']
    for data in residence_data_list:
        header += [data.start_date, data.end_date]
    writer.writerow(header)

    for country in sorted(countries):
        row = [country]
        for data in residence_data_list:
            country_data = data.countries.get(country)
            if country_data is None:
                row += ['NaN', 'NaN']
            else:
                row += [country_data.val1, country_data.val2]
        writer.writerow(row)


residence_data_list = []
countries = set()
# glob yields paths in arbitrary order; sorting keeps the column order stable
# from run to run.
for filename in sorted(glob.iglob(os.path.join('/Documents/stats/csv', '*.csv'))):
    residence_data = load_residence_data(filename)
    residence_data_list.append(residence_data)
    countries.update(residence_data.countries)

write_merged_csv(sys.stdout, residence_data_list, countries)
结果:
Country of Residence,2014-04,2015-04,2014-11,2015-11
Argentina,NaN,NaN,47,50
Austria,893,666,NaN,NaN
Belgium,867,995,NaN,NaN
Brazil,NaN,NaN,68,122
Canada,2,417,2,864,NaN,NaN
Chille,NaN,NaN,24,30
Colombia,NaN,NaN,31,25
Croatia,NaN,NaN,77,72
Finland,NaN,NaN,149,178
Greece,NaN,NaN,408,452
Ireland,NaN,NaN,428,343
LATIN AMERICA & THE CARIBBEAN,281,293,373,418
NORTH AMERICA,5,514,6,160,NaN,NaN
Others,NaN,NaN,57,61
Portugal,NaN,NaN,211,261
U.S.A.,3,097,3,296,NaN,NaN
WESTERN EUROPE,37,369,34,964,NaN,NaN
WESTERN EUROPE-OTHERS,NaN,NaN,1330,1367
答案 1 :(得分:0)
如果你不关心底层逻辑,你可以使用Pandas来做到这一点:
import pandas as pd
def merge_country_csvs(file_list):
    """Merge per-period CSV files into one DataFrame indexed by country.

    Each file is read with its first column as the index. The frames are then
    joined side by side, so a country missing from a file gets NaN in that
    file's columns.

    Args:
        file_list: paths (or file-like objects) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame whose index is named ``Country of Residence``.
    """
    dfs = []
    for path in file_list:
        # thousands=',' turns quoted cells such as "5,514" into the number
        # 5514 instead of leaving them as strings.
        df = pd.read_csv(filepath_or_buffer=path, sep=',', index_col=0, thousands=',')
        # Country names may carry trailing spaces ("Canada "); strip them so
        # the same country lines up across files.
        df.index = df.index.astype(str).str.strip()
        # Joining along columns needs unique index labels; when a label
        # repeats within one file, only its last row is kept.
        df = df[~df.index.duplicated(keep='last')]
        dfs.append(df)
    result_df = pd.concat(dfs, axis=1)
    result_df.index.name = 'Country of Residence'
    return result_df


if __name__ == '__main__':
    file_list = ['file1.csv', 'file2.csv']
    result_df = merge_country_csvs(file_list)
    result_df.to_csv('result.csv')