I have three sets of 140+ data files laid out in a strange way, and I can't figure out how to rearrange them with Python. The files are arranged with 4 rows at the top, then 4 blank rows, followed by 5 columns. There are no headers; rows 1 and 2 are a single column, row 3 is 2 columns, and row 4 is garbage. The first three rows are my dataset identifiers. Each dataset has multiple records. For example:
xx4 <--ID
070414 <--DateStrong
5.6 10 <--Force Ratio
Sample Rate: 50/s <--Garbage
220.68 0.14 17.80 92.20
220.80 0.02 9.40 9.40
224.32 0.14 14.60 72.20
227.08 0.14 26.60 130.60
227.78 0.08 19.60 62.00
228.04 0.18 40.40 257.20
231.22 0.12 14.00 61.20
I am trying to arrange the set as:
xx4, 070414, 5.6, 10, 220.68, 0.14, 17.80, 92.20
xx4, 070414, 5.6, 10, 220.80, 0.02, 9.40, 9.40
xx4, 070414, 5.6, 10, 224.32, 0.14, 14.60, 72.20
My current working code is:
import os
import sys
import csv
import pandas as pd
import numpy as np
import itertools as it
import benFuncts.BenFuncts as bf #My own functions
import matplotlib.pyplot as plt
ID = []
ID_dict = {}
DATE = []
FORCE = []
RATIO = []
TIME = []
DURR = []
pF = []
TOF = []
ED7 = []
ED6 = []
ED5 = []
ED4 = []
h = 'DATE', 'DAYNUM', 'RATIO', 'CRIT', 'TOTRESP', 'CRITRESP', 'PELLETS', 'AVG_PF', 'AVG_TOF'
Crit = {}
MastList = []
rd_files = [] # List of file strings
# Makes the main file path in this case:
# /Users/benlibman/Desktop/EffortDemandTests/EffortDemandPyTests/
path = str(os.getcwd()) + '/'
# List of files in the working directory (see path above)
mainDir = os.listdir(str(os.getcwd()) + '/')
# Pulls the list files from the mainDir (above)
ID = [i for i in mainDir if len(i) <= 3 and 'ED' in i]
# f_Out = csv.writer(open('MainFile', 'wa'), delimiter=',', quoting=csv.QUOTE_NONE)
# f_Out = open('MainFile', 'wa')
f_In = csv.reader(open('ED7', 'rb'), delimiter='\t')
def mkPath():
    for row in f_In:
        for i in row:
            if len(i) > 1:
                rd_files.append(path + str(i))
mP = mkPath()
# pdmF = pd.read_csv('MainFile', sep='\t', engine='python')
# with open('ED7120214', 'r') as f:
df = pd.read_csv(open('ED7120214', 'r'), sep='\t', skiprows=5,
                 usecols=(0, 1, 2, 3), names=('TIME', 'DURR', 'pF', 'TOF'))
frCR = pd.read_csv(open('ED7120214', 'r'), sep=' ', skiprows=(0, 1, 3),
                   skipfooter=len(df), engine='python', index_col=False,
                   names=('FORCE', 'RATIO'))
date_index = pd.read_csv(open('ED7120214', 'r'), squeeze=True, sep=' ',
                         skiprows=(0, 2, 3), skipfooter=len(df), engine='python',
                         index_col=False, names=('DATE', 'NaN'))
id_index = pd.read_csv(open('ED7120214', 'r'), squeeze=True, sep=' ',
                       skiprows=(1, 2, 3), skipfooter=len(df), engine='python',
                       index_col=False, names=('ID', 'NaN'))
pDF = pd.DataFrame(df)
for row in pDF.TIME:
    TIME.append(row)
for row in pDF.DURR:
    DURR.append(row)
for row in pDF.pF:
    pF.append(row)
for row in pDF.TOF:
    TOF.append(row)
print pDF.pF.mean()
FORCE.append(frCR.FORCE)
RATIO.append(frCR.RATIO)
DATE.append(list(date_index.DATE))
ID_dict.update(id_index.ID)
DATE = [str(i).strip('[]') for i in DATE]
# ED7.append(FORCE)
# ED7.append(DATE)
# ED7.append(RATIO)
ED7.append(TIME)
ED7.append(DURR)
ED7.append(pF)
ED7.append(TOF)
Dt = bf.addCol(range(len(TIME)), DATE)
with open('MainFile', 'wa') as mf:
    pDF.to_csv(mf, header=True, index_names=True, names=(
        'DATE', 'DAYNUM', 'TIME', 'DURR', 'pF', 'TOF'))
Answer (score: 0):
If all you want to do is reformat the data and write it back out to a file, this should work for the file layout in your example:
with open('data.txt') as in_file, open('new.txt', 'w') as out_file:
    # get the dataset identifiers
    ID = in_file.next().strip()
    date_strong = in_file.next().strip()
    force_ratio = in_file.next().strip()
    force_ratio1, force_ratio2 = force_ratio.split()
    in_file.next()  # garbage line
    # example data has two blank lines
    in_file.next()
    in_file.next()
    dataset_id = (ID, date_strong, force_ratio1, force_ratio2)
    # iterate over the records
    for line in in_file:
        # prepend the dataset id
        record = list(dataset_id)
        record.extend(line.split())
        # write to the new file
        out_file.write(','.join(record) + '\n')
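If you need to do the same thing for all of your 140+ files and collect everything into one output, here is a minimal sketch along the same lines. The 'ED*' glob pattern, the length filter, and the 'MainFile.csv' output name are my assumptions based on the file names in your code (e.g. 'ED7120214' and the 3-character list files like 'ED7'); adjust them to match your actual data files.

import glob
import os

def reformat_file(in_path, out_file):
    """Prepend the identifier rows of one data file to each of its records."""
    with open(in_path) as in_file:
        file_id = next(in_file).strip()        # e.g. xx4
        date_strong = next(in_file).strip()    # e.g. 070414
        force, ratio = next(in_file).split()   # e.g. 5.6 and 10
        next(in_file)                          # garbage line (Sample Rate: ...)
        dataset_id = (file_id, date_strong, force, ratio)
        for line in in_file:
            if not line.strip():               # skip the blank rows
                continue
            out_file.write(','.join(dataset_id + tuple(line.split())) + '\n')

# Collect every reformatted record into a single file.
# The length filter is the inverse of the one you use to find the
# 3-character list files, so only the longer data-file names are picked up.
data_files = [p for p in sorted(glob.glob('ED*'))
              if len(os.path.basename(p)) > 3]
with open('MainFile.csv', 'w') as out_file:
    for p in data_files:
        reformat_file(p, out_file)

Once 'MainFile.csv' exists, you can read it back with pd.read_csv and carry on with the pandas part of your analysis from there.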