我正在尝试用以下代码把 489 个 CSV 文件读入同一个 pandas DataFrame:
symbols = ['489 symbols']


def data(symbols):
    """Build a daily price table with one column per symbol.

    For every entry in ``symbols`` the file
    ``/home/furqan/Desktop/USSP/<symbol>.csv`` is read, its ``Price``
    column is renamed to the symbol, and the result is left-joined onto a
    daily calendar running from the module-level ``start_date`` to
    ``end_date``.  Missing values are forward-filled, then back-filled.

    NOTE(review): ``DataFrame.join`` matches on index labels, so a CSV whose
    ``Date`` column repeats a value would contribute one output row per
    repeat -- worth checking if the result has more rows than calendar days.
    """
    calendar = pd.date_range(start_date, end_date)
    prices = pd.DataFrame(index=calendar)
    for ticker in symbols:
        csv_path = '/home/furqan/Desktop/USSP/{}.csv'.format(str(ticker))
        column = pd.read_csv(
            csv_path,
            usecols=['Date', 'Price'],
            parse_dates=True,
            index_col='Date',
            na_values=['nan'],
        ).rename(columns={'Price': ticker})
        prices = prices.join(column)
    # Forward-fill first so gaps take the last known price; the back-fill
    # only has leading gaps left to cover.
    return prices.fillna(method='ffill').fillna(method='bfill')
# Calendar bounds; data() reads these two module-level names when it builds
# its date range.
start_date = '2006-01-01'
end_date = '2016-12-31'
data_frame = data(symbols)
# Label-based slice of the rows dated within 2006, keeping every column.
test = data_frame.loc['2006-01-01':'2006-12-31',:]
我预期 test 数据框的形状是 (365, 489),但实际得到的形状却是 (9874, 489),我找不出原因。
如果我不用全部 489 个元素,而只用 1 个元素(也就是只读取一个 csv 文件而不是 489 个),就能得到正确的形状 (365, 1)。
答案 0 :(得分:1)
以下代码可以正常工作:
def data(symbols, data_dir='/home/furqan/Desktop/USSP', start=None, end=None):
    """Build a daily price table with one column per symbol.

    Each ``<data_dir>/<symbol>.csv`` must provide ``Date`` and ``Price``
    columns.  Rows that repeat a date are dropped (the first occurrence is
    kept) before the left join, so the result always has exactly one row per
    calendar day.

    Args:
        symbols: Iterable of symbol names; each becomes a column label.
        data_dir: Directory holding the CSV files.
        start: First calendar day; defaults to the module-level ``start_date``.
        end: Last calendar day; defaults to the module-level ``end_date``.

    Returns:
        DataFrame indexed by every day from ``start`` to ``end``, with gaps
        forward-filled and any leading gaps back-filled.
    """
    if start is None:
        start = start_date
    if end is None:
        end = end_date
    dates = pd.date_range(start, end)
    df = pd.DataFrame(index=dates)
    for symbol in symbols:
        df_temp = pd.read_csv(
            '{}/{}.csv'.format(data_dir, symbol),
            usecols=['Date', 'Price'],
            parse_dates=True,
            index_col='Date',
            na_values=['nan'],
        )
        df_temp = df_temp.rename(columns={'Price': symbol})
        # A repeated date would make the join emit one row per repeat and
        # inflate the row count, so keep only the first row for each date.
        df_temp = df_temp.loc[~df_temp.index.duplicated(keep='first')]
        df = df.join(df_temp)
    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    return df.ffill().bfill()
不过我仍然没弄清楚这些重复的日期是从哪里来的。
答案 1 :(得分:0)
我认为您需要在每次循环中把 DataFrame 追加到一个列表里,最后再用 `concat` 一次性合并:
symbols = ['489 symbols']


def data(symbols, data_dir='/home/furqan/Desktop/USSP'):
    """Read one price column per symbol and align them side by side.

    Each ``<data_dir>/<symbol>.csv`` must provide ``Date`` and ``Price``
    columns.  The frames are collected in a list and combined with a single
    column-wise ``pd.concat`` instead of a join per file.

    Args:
        symbols: Iterable of symbol names; each becomes a column label.
        data_dir: Directory holding the CSV files.

    Returns:
        DataFrame indexed by the sorted union of all dates found in the
        files, forward-filled and then back-filled.  An empty DataFrame is
        returned when ``symbols`` is empty.
    """
    dfs = []
    for symbol in symbols:
        df_temp = pd.read_csv(
            '{}/{}.csv'.format(data_dir, symbol),
            usecols=['Date', 'Price'],
            parse_dates=True,
            index_col='Date',
            na_values=['nan'],
        )
        df_temp = df_temp.rename(columns={'Price': symbol})
        # Column-wise concat cannot align an index with repeated labels, so
        # keep only the first row for each date.
        df_temp = df_temp.loc[~df_temp.index.duplicated(keep='first')]
        dfs.append(df_temp)
    if not dfs:
        # pd.concat raises on an empty list.
        return pd.DataFrame()
    # Fill only after alignment: dates missing from one file but present in
    # another first show up as NaN in the combined frame.  Sorting makes the
    # forward fill follow chronological order.
    return pd.concat(dfs, axis=1).sort_index().ffill().bfill()
# NOTE: the data() defined for this snippet never reads start_date/end_date;
# only the .loc slice below restricts the dates.
start_date = '2006-01-01'
end_date = '2016-12-31'
data_frame = data(symbols)
# Label-based slice of the rows dated within 2006, keeping every column.
test = data_frame.loc['2006-01-01':'2006-12-31',:]
如有必要,还可以用 `date_range` 生成完整的日期索引,再通过 `reindex` 对齐到该索引:
symbols = ['489 symbols']


def data(symbols, data_dir='/home/furqan/Desktop/USSP', start=None, end=None):
    """Read one price column per symbol and align them to a daily calendar.

    Each ``<data_dir>/<symbol>.csv`` must provide ``Date`` and ``Price``
    columns.  The frames are combined with one column-wise ``pd.concat`` and
    then reindexed to every day between ``start`` and ``end``.

    Args:
        symbols: Iterable of symbol names; each becomes a column label.
        data_dir: Directory holding the CSV files.
        start: First calendar day; defaults to the module-level ``start_date``.
        end: Last calendar day; defaults to the module-level ``end_date``.

    Returns:
        DataFrame with exactly one row per calendar day, forward-filled and
        then back-filled.  With no symbols the frame has the calendar index
        and no columns.
    """
    if start is None:
        start = start_date
    if end is None:
        end = end_date
    dates = pd.date_range(start, end)
    dfs = []
    for symbol in symbols:
        df_temp = pd.read_csv(
            '{}/{}.csv'.format(data_dir, symbol),
            usecols=['Date', 'Price'],
            parse_dates=True,
            index_col='Date',
            na_values=['nan'],
        )
        df_temp = df_temp.rename(columns={'Price': symbol})
        # Column-wise concat and reindex both need unique index labels, so
        # keep only the first row for each date.
        df_temp = df_temp.loc[~df_temp.index.duplicated(keep='first')]
        dfs.append(df_temp)
    if not dfs:
        # pd.concat raises on an empty list.
        return pd.DataFrame(index=dates)
    # reindex(..., method='ffill') only fills labels that are new to the
    # index; NaN already present after concat would stay and then be
    # back-filled from *later* prices.  Reindex plainly and fill the values
    # afterwards so every gap takes the last known price.
    return pd.concat(dfs, axis=1).reindex(dates).ffill().bfill()
# Calendar bounds; data() reads these two module-level names when it builds
# the date range used for reindexing.
start_date = '2006-01-01'
end_date = '2016-12-31'
data_frame = data(symbols)
# Label-based slice of the rows dated within 2006, keeping every column.
test = data_frame.loc['2006-01-01':'2006-12-31',:]