I have three sets of 140+ data files laid out in a strange way, and I can't figure out how to rearrange them with Python. The files are arranged with 4 rows at the top, then 4 blank rows, followed by 5 columns. There are no headers; rows 1 and 2 are a single column, row 3 is 2 columns, and row 4 is garbage. The first three rows are my dataset identifiers. Each dataset has multiple records. For example:
xx4 <--ID
070414 <--DateStrong
5.6 10 <--Force Ratio
Sample Rate: 50/s <--Garbage
220.68 0.14 17.80 92.20
220.80 0.02 9.40 9.40
224.32 0.14 14.60 72.20
227.08 0.14 26.60 130.60
227.78 0.08 19.60 62.00
228.04 0.18 40.40 257.20
231.22 0.12 14.00 61.20
I am trying to arrange the set as:
xx4, 070414, 5.6, 10, 220.68, 0.14, 17.80, 92.20
xx4, 070414, 5.6, 10, 220.80, 0.02, 9.40, 9.40
xx4, 070414, 5.6, 10, 224.32, 0.14, 14.60, 72.20
My current working code is:
import os
import sys
import csv
import pandas as pd
import numpy as np
import itertools as it
import benFuncts.BenFuncts as bf #My own functions
import matplotlib.pyplot as plt
ID = []
ID_dict = {}
DATE = []
FORCE = []
RATIO = []
TIME = []
DURR = []
pF = []
TOF = []
ED7 = []
ED6 = []
ED5 = []
ED4 = []
h = 'DATE', 'DAYNUM', 'RATIO', 'CRIT', 'TOTRESP', 'CRITRESP', 'PELLETS', 'AVG_PF', 'AVG_TOF'
Crit = {}
MastList = []
rd_files = [] # List of file strings
# Makes the main file path in this case:
# /Users/benlibman/Desktop/EffortDemandTests/EffortDemandPyTests/
path = str(os.getcwd()) + '/'
# List of files in the working directory (see path above)
mainDir = os.listdir(str(os.getcwd()) + '/')
# Pulls the list files from the mainDir (above)
ID = [i for i in mainDir if len(i) <= 3 and 'ED' in i]
# f_Out = csv.writer(open('MainFile', 'wa'), delimiter=',', quoting=csv.QUOTE_NONE)
# f_Out = open('MainFile', 'wa')
f_In = csv.reader(open('ED7', 'rb'), delimiter='\t')
def mkPath():
    for row in f_In:
        for i in row:
            if len(i) > 1:
                rd_files.append(path + str(i))
mP = mkPath()
# pdmF = pd.read_csv('MainFile', sep='\t', engine='python')
# with open('ED7120214', 'r') as f:
df = pd.read_csv(open('ED7120214', 'r'), sep='\t', skiprows=5,
                 usecols=(0, 1, 2, 3), names=('TIME', 'DURR', 'pF', 'TOF'))
frCR = pd.read_csv(open('ED7120214', 'r'), sep=' ', skiprows=(0, 1, 3),
                   skipfooter=len(df), engine='python', index_col=False,
                   names=('FORCE', 'RATIO'))
date_index = pd.read_csv(open('ED7120214', 'r'), squeeze=True, sep=' ',
                         skiprows=(0, 2, 3), skipfooter=len(df), engine='python',
                         index_col=False, names=('DATE', 'NaN'))
id_index = pd.read_csv(open('ED7120214', 'r'), squeeze=True, sep=' ',
                       skiprows=(1, 2, 3), skipfooter=len(df), engine='python',
                       index_col=False, names=('ID', 'NaN'))
pDF = pd.DataFrame(df)
for row in pDF.TIME:
    TIME.append(row)
for row in pDF.DURR:
    DURR.append(row)
for row in pDF.pF:
    pF.append(row)
for row in pDF.TOF:
    TOF.append(row)
print pDF.pF.mean()
FORCE.append(frCR.FORCE)
RATIO.append(frCR.RATIO)
DATE.append(list(date_index.DATE))
ID_dict.update(id_index.ID)
DATE = [str(i).strip('[]') for i in DATE]
# ED7.append(FORCE)
# ED7.append(DATE)
# ED7.append(RATIO)
ED7.append(TIME)
ED7.append(DURR)
ED7.append(pF)
ED7.append(TOF)
Dt = bf.addCol(range(len(TIME)), DATE)
with open('MainFile', 'wa') as mf:
    pDF.to_csv(mf, header=True, index_names=True, names=(
        'DATE', 'DAYNUM', 'TIME', 'DURR', 'pF', 'TOF'))
Answer (score: 0):
If all you want to do is reformat the data and write it back out to a file, this should work for the file layout in your example:
with open('data.txt') as in_file, open('new.txt', 'w') as out_file:
    # get the dataset identifiers
    ID = in_file.next().strip()
    date_strong = in_file.next().strip()
    force_ratio = in_file.next().strip()
    force_ratio1, force_ratio2 = force_ratio.split()
    in_file.next()  # garbage line
    # example data has two blank lines
    in_file.next()
    in_file.next()
    dataset_id = (ID, date_strong, force_ratio1, force_ratio2)
    # iterate over the records
    for line in in_file:
        # prepend the dataset id
        record = list(dataset_id)
        record.extend(line.split())
        # write to the new file
        out_file.write(','.join(record) + '\n')
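If you need to do the same thing for all of your 140+ files and collect everything into one output, here is a minimal sketch along the same lines. The 'ED*' glob pattern, the length filter, and the 'MainFile.csv' output name are my assumptions based on the file names in your code (e.g. 'ED7120214' and the 3-character list files like 'ED7'); adjust them to match your actual data files.

import glob
import os

def reformat_file(in_path, out_file):
    """Prepend the identifier rows of one data file to each of its records."""
    with open(in_path) as in_file:
        file_id = next(in_file).strip()        # e.g. xx4
        date_strong = next(in_file).strip()    # e.g. 070414
        force, ratio = next(in_file).split()   # e.g. 5.6 and 10
        next(in_file)                          # garbage line (Sample Rate: ...)
        dataset_id = (file_id, date_strong, force, ratio)
        for line in in_file:
            if not line.strip():               # skip the blank rows
                continue
            out_file.write(','.join(dataset_id + tuple(line.split())) + '\n')

# Collect every reformatted record into a single file.
# The length filter is the inverse of the one you use to find the
# 3-character list files, so only the longer data-file names are picked up.
data_files = [p for p in sorted(glob.glob('ED*'))
              if len(os.path.basename(p)) > 3]
with open('MainFile.csv', 'w') as out_file:
    for p in data_files:
        reformat_file(p, out_file)

Once 'MainFile.csv' exists, you can read it back with pd.read_csv and carry on with the pandas part of your analysis from there.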