处理分割数据帧时的Pandas KeyError

时间:2017-03-04 05:38:04

标签: python pandas

我想对一个被分成若干块的pandas数据帧执行一些操作。分割数据帧之后,我尝试遍历这些块,但在第一次迭代正常运行之后,我得到了一个错误(见下文)。我已经查阅了一些类似的问题(1、2),但它们并没有完全解决我的问题。请帮我解决这个问题,因为我还没有完全理解它。

import pandas as pd

# Sample rows, one tuple per trip: (Origin, Destination, O_Code, D_code).
# Every field, including the two codes, is stored as a string.
tupList = [
    ('Eisenstadt', 'Paris', '1', '2'),
    ('London', 'Berlin', '1', '3'),
    ('Berlin', 'stuttgat', '1', '4'),
    ('Liverpool', 'Southampton', '1', '5'),
    ('Tirana', 'Blackpool', '1', '6'),
    ('blackpool', 'tirana', '1', '7'),
    ('Paris', 'Lyon', '1', '8'),
    ('Manchester', 'Nice', '1', '10'),
    ('Orleans', 'Madrid', '1', '12'),
    ('Lisbon', 'Stockholm', '1', '12'),
]

# Build the frame; no index is passed, so rows get the default labels 0..9.
cities = pd.DataFrame(
    data=tupList,
    columns=['Origin', 'Destination', 'O_Code', 'D_code'],
)


def splitDataFrameIntoSmaller(df, chunkSize=3):
    """Split ``df`` row-wise into consecutive chunks of at most ``chunkSize`` rows.

    Every chunk except possibly the last holds exactly ``chunkSize`` rows; the
    last one holds the remainder. When ``len(df)`` is an exact multiple of
    ``chunkSize`` no extra empty chunk is appended. An empty ``df`` still
    yields a single empty chunk, so the result is never an empty list.

    Each chunk is a positional slice of ``df`` and therefore keeps the
    original row labels (the second chunk of a default-indexed frame is
    labelled ``chunkSize .. 2*chunkSize-1``, not ``0..``). Use ``.iloc`` or
    ``reset_index(drop=True)`` on a chunk for 0-based positional access.

    Args:
        df: The DataFrame to split.
        chunkSize: Maximum number of rows per chunk; must be >= 1.

    Returns:
        A list of DataFrame slices, in row order.

    Raises:
        ValueError: If ``chunkSize`` is smaller than 1.
    """
    if chunkSize < 1:
        raise ValueError("chunkSize must be a positive integer")

    # Ceiling division: ``len(df) // chunkSize + 1`` would add a spurious
    # empty chunk whenever the length divides evenly.
    numberChunks = max(1, -(-len(df) // chunkSize))
    return [df[i * chunkSize:(i + 1) * chunkSize] for i in range(numberChunks)]

# Split the sample frame using the default chunk size of 3 rows.
citiesChunks = splitDataFrameIntoSmaller(cities)

for ind, cc in enumerate(citiesChunks):
    # Add two placeholder columns, initialised to 0, to the current chunk.
    cc["distance"] = 0
    cc["time"] = 0

    # NOTE: xrange exists only in Python 2; i runs over positions 0..len(cc)-1.
    for i in xrange(len(cc)):
        # cc['Origin'][i] looks i up as a row *label*, not as a position.
        # Each chunk keeps the labels of the rows it was sliced from, so only
        # the first chunk has a label 0; on the second chunk this line raises
        # the KeyError shown in the traceback below.
        al = cc['Origin'][i]
        bl = cc['Destination'][i]
        '...' #truncated to make it readable

    # Every iteration writes to the same path with the default write mode,
    # so each chunk replaces the previous one's output in out.csv.
    cc.to_csv('out.csv', sep=',', encoding='utf-8')


Traceback (most recent call last):
  File ..., line 39, in <module>
    al = cc['Origin'][i]
  File ..., line 603, in __getitem__
    result = self.index.get_value(self, key)
  File ..., line 2169, in get_value
    tz=getattr(series.dtype, 'tz', None))
  File "pandas\index.pyx", line 98, in pandas.index.IndexEngine.get_value (pandas\index.c:3557)
  File "pandas\index.pyx", line 106, in pandas.index.IndexEngine.get_value (pandas\index.c:3240)
  File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:4279)
  File "pandas\src\hashtable_class_helper.pxi", line 404, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:8564)
  File "pandas\src\hashtable_class_helper.pxi", line 410, in pandas.hashtable.Int64HashTable.get_item (pandas\hashtable.c:8508)
KeyError: 0L

1 个答案:

答案 0 :(得分:1)

您可以先对索引值做整除(floor division),然后使用列表推导式(list comprehension)——遍历索引的唯一值(unique)并用loc选取对应的行,最后用reset_index去掉重复的索引:

# Floor-divide every row label by the chunk size (3) so that each run of three
# consecutive rows shares one label (0, 0, 0, 1, 1, 1, ...). This relies on
# the index still being the default 0..n-1 integers.
cities.index = cities.index // 3
print (cities)
       Origin  Destination O_Code D_code
0  Eisenstadt        Paris      1      2
0      London       Berlin      1      3
0      Berlin     stuttgat      1      4
1   Liverpool  Southampton      1      5
1      Tirana    Blackpool      1      6
1   blackpool       tirana      1      7
2       Paris         Lyon      1      8
2  Manchester         Nice      1     10
2     Orleans       Madrid      1     12
3      Lisbon    Stockholm      1     12

# Build one DataFrame per distinct label. Passing the label inside a list
# (.loc[[x]]) keeps the result a DataFrame even when only one row matches,
# and reset_index(drop=True) replaces the repeated label with a fresh 0-based
# index in every chunk.
citiesChunks = [cities.loc[[x]].reset_index(drop=True) for x in cities.index.unique()]
#print (citiesChunks)

# Show the first chunk; its index now starts at 0.
print (citiesChunks[0])
       Origin Destination O_Code D_code
0  Eisenstadt       Paris      1      2
1      London      Berlin      1      3
2      Berlin    stuttgat      1      4

最后,如果需要在DataFrame中逐行循环,可以使用iterrows:

# Write the header row to the file first: an empty frame that has the final
# column layout produces a file containing only the column names.
cols = ['Origin', 'Destination', 'O_Code', 'D_code', 'distance', 'time']
df = pd.DataFrame(columns=cols)
df.to_csv('out.csv', encoding='utf-8', index=False)

for ind, cc in enumerate(citiesChunks):
    # Add two placeholder columns, initialised to 0, to the current chunk.
    cc["distance"] = 0
    cc["time"] = 0

    # iterrows yields (row label, row as Series); the label i is what .loc
    # expects, so the lookups below work regardless of how the chunk is indexed.
    for i, val in cc.iterrows():
        al = cc.loc[i, 'Origin']
        bl = cc.loc[i, 'Destination']
        '...' #truncated to make it readable

    # Append this chunk's rows below the header written above: mode='a'
    # appends instead of overwriting, and the header and index are left out.
    cc.to_csv('out.csv', encoding='utf-8', mode='a', header=None, index=False)
    # With no path given, to_csv returns the CSV text, which is printed here
    # (this time including the header and the index column).
    print (cc.to_csv(encoding='utf-8'))

,Origin,Destination,O_Code,D_code,distance,time
0,Eisenstadt,Paris,1,2,0,0
1,London,Berlin,1,3,0,0
2,Berlin,stuttgat,1,4,0,0

,Origin,Destination,O_Code,D_code,distance,time
0,Liverpool,Southampton,1,5,0,0
1,Tirana,Blackpool,1,6,0,0
2,blackpool,tirana,1,7,0,0

,Origin,Destination,O_Code,D_code,distance,time
0,Paris,Lyon,1,8,0,0
1,Manchester,Nice,1,10,0,0
2,Orleans,Madrid,1,12,0,0

,Origin,Destination,O_Code,D_code,distance,time
0,Lisbon,Stockholm,1,12,0,0