import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv', sep=',', delimiter=None) # select 1 file in the directory
datasets_cols = ['transactionID','gvkey','companyName']
df= dataset.transactionID
df.shape
df.loc[df.duplicated()]
This returns the duplicates in the selected file, showing the row numbers and the transactionID. So this part works correctly.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
    return(df_result)
This is where I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")

for file in file_list:
    dataset = pd.read_csv(file)
    df = dataset.transactionID
    duplicated = df.loc[df.duplicated()]
    if duplicated.empty == False:
        print(file)
        print(duplicated)
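If the goal is to collect the duplicates from all files into a single DataFrame rather than printing them, one possible sketch is to append each file's duplicates to a list and concatenate at the end. This assumes every file has a transactionID column, and the source_file column name is only illustrative:

all_duplicates = []
for file in file_list:
    dataset = pd.read_csv(file)
    df = dataset.transactionID
    duplicated = df.loc[df.duplicated()]
    if not duplicated.empty:
        # source_file is an illustrative column name, not from the original code
        all_duplicates.append(duplicated.to_frame().assign(source_file=file))

# one DataFrame with every duplicated transactionID and the file it came from
# (falls back to an empty DataFrame if no duplicates were found)
df_result = pd.concat(all_duplicates, ignore_index=True) if all_duplicates else pd.DataFrame()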
Answer 0 (score: 0)
Take a look at the glob module.
import pandas as pd
import glob

def your_function(file):
    # put your df processing logic here
    return df_result
Step 1 - Create a list of the files in the directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
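As a side note (not part of the original answer), os.path.join builds the pattern with the separator for you, so the trailing slash cannot be forgotten; a minimal equivalent sketch:

import glob
import os

target_directory = r'Path/to/your/dir'
# os.path.join inserts the path separator, so no manual slash is needed
file_list = glob.glob(os.path.join(target_directory, "*.csv"))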
Step 2 - Loop through the files in the list
for file in file_list: # Loop files
    df_result = your_function(file) # Put your logic into a separate function
    new_filename = file.replace('.csv', '_processed.csv')
    df_result.to_csv(new_filename, index = False)
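To connect this back to the question, your_function could simply return the duplicated rows of each file. This is a rough sketch under the assumption that every CSV has a transactionID column; it is not part of the original answer:

def your_function(file):
    # read one daily delta file and keep only the rows whose
    # transactionID already appeared earlier in the same file
    dataset = pd.read_csv(file)
    df_result = dataset.loc[dataset.transactionID.duplicated()]
    return df_result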
Notes
If you include your code in your question, showing that you have tried to do this yourself, your question will be answered within seconds.