错误的数据集布局

时间:2014-12-21 06:30:54

标签: python python-2.7 ipython ipython-notebook

我有三组 140 多个数据集,它们的排列方式很奇怪,我不知道如何用 Python 重新整理它们。每个文件的开头是 4 行头部信息,接着是 4 个空行,之后是 5 列数据。文件没有标题行:第 1 行和第 2 行各只有一列,第 3 行有两列,第 4 行是无用内容。前三行是数据集的标识符。每个数据集都包含多条记录。例如:

xx4 <--ID
070414 <--DateStrong
5.6 10 <--Force Ratio
Sample Rate:  50/s <--Garbage


220.68   0.14   17.80   92.20
220.80   0.02    9.40    9.40
224.32   0.14   14.60   72.20
227.08   0.14   26.60   130.60
227.78   0.08   19.60   62.00
228.04   0.18   40.40   257.20
231.22   0.12   14.00   61.20

我想把这个数据集重新排列成如下形式:

xx4, 070414, 5.6, 10, 220.68, 0.14, 17.80, 92.20
xx4, 070414, 5.6, 10, 220.80, 0.02, 9.40, 9.40
xx4, 070414, 5.6, 10, 224.32, 0.14, 14.60, 72.20

我目前的工作代码是:

import os
import sys
import csv
import pandas as pd
import numpy as np
import itertools as it
import benFuncts.BenFuncts as bf #My own functions
import matplotlib.pyplot as plt

# --- Module-level accumulators -------------------------------------------
# All of these start empty; the script further down fills some of them.
ID = list()

ID_dict = dict()
DATE, FORCE, RATIO = [], [], []

# One list per record column.
TIME, DURR, pF, TOF = [], [], [], []

# One container per ED* name.
ED7, ED6, ED5, ED4 = [], [], [], []

# Tuple of column labels (presumably meant as an output header -- it is not
# referenced anywhere else in the visible script).
h = (
    'DATE', 'DAYNUM', 'RATIO', 'CRIT', 'TOTRESP',
    'CRITRESP', 'PELLETS', 'AVG_PF', 'AVG_TOF',
)

Crit = dict()


MastList = list()
rd_files = list()  # Path strings, filled in by mkPath()


# Current working directory with a trailing slash, in this case:
# /Users/benlibman/Desktop/EffortDemandTests/EffortDemandPyTests/
path = '%s/' % os.getcwd()

# Every entry found in the working directory (see path above).
mainDir = os.listdir(path)

# Keep only the entries of at most three characters that contain 'ED'.
ID = [entry for entry in mainDir if 'ED' in entry and len(entry) <= 3]

# Earlier, abandoned attempts at opening the output file:
# f_Out = csv.writer(open('MainFile', 'wa'), delimiter=',')
# f_Out = open('MainFile', 'wa')
# , quoting=csv.QUOTE_NONE)

# Tab-delimited reader over the file 'ED7'; it is consumed later by mkPath(),
# so the underlying handle is intentionally left open here.
f_In = csv.reader(open('ED7',  'rb'), delimiter='\t')


def mkPath():
    """Append one path string to ``rd_files`` per usable cell of ``f_In``.

    Every cell of every row yielded by the module-level reader ``f_In`` is
    inspected; cells whose length is 0 or 1 are ignored. Each remaining
    cell is converted to ``str`` and prefixed with the module-level
    ``path`` before being appended to ``rd_files``.

    Returns nothing; the function works purely by side effect and
    consumes ``f_In`` in the process.
    """
    rd_files.extend(
        path + str(cell)
        for record in f_In
        for cell in record
        if len(cell) > 1
    )

# mkPath() works by side effect (it fills rd_files) and returns None, so mP
# is always None; the name is kept only because the original defined it.
mP = mkPath()


# pdmF = pd.read_csv('MainFile', sep='\t', engine='python')

# Every read below targets the same raw data file. Passing the file name
# (rather than an open() handle) lets pandas open AND close the file itself,
# so no file handles are leaked.
data_file = 'ED7120214'

# Record section: skip the first five lines and keep the first four
# tab-separated columns.
df = pd.read_csv(data_file, sep='\t', skiprows=5, usecols=(0, 1, 2, 3),
                 names=('TIME', 'DURR', 'pF', 'TOF'))

# Header section. Each read below skips three of the first four lines and
# uses skipfooter=len(df) to drop as many trailing lines as there are
# records, leaving only the wanted header line(s). skipfooter needs the
# python engine.
frCR = pd.read_csv(data_file, sep=' ', skiprows=(0, 1, 3),
                   skipfooter=len(df), engine='python', index_col=False,
                   names=('FORCE', 'RATIO'))

date_index = pd.read_csv(data_file, squeeze=True, sep=' ',
                         skiprows=(0, 2, 3), skipfooter=len(df),
                         engine='python', index_col=False,
                         names=('DATE', 'NaN'))

id_index = pd.read_csv(data_file, squeeze=True, sep=' ',
                       skiprows=(1, 2, 3), skipfooter=len(df),
                       engine='python', index_col=False,
                       names=('ID', 'NaN'))


pDF = pd.DataFrame(df)

# Copy each record column into its module-level list.
TIME.extend(pDF.TIME)
DURR.extend(pDF.DURR)
pF.extend(pDF.pF)
TOF.extend(pDF.TOF)

# Parenthesised so the line is valid under both Python 2 and Python 3.
print(pDF.pF.mean())

# The whole Series is appended as a single element of each list.
FORCE.append(frCR.FORCE)
RATIO.append(frCR.RATIO)

DATE.append(list(date_index.DATE))
# dict.update() treats the Series as a mapping: index label -> ID value.
ID_dict.update(id_index.ID)

# Turn each collected list into its string form without the surrounding
# square brackets.
DATE = [str(entry).strip('[]') for entry in DATE]

# ED7.append(FORCE)
# ED7.append(DATE)
# ED7.append(RATIO)
ED7.extend([TIME, DURR, pF, TOF])

# NOTE(review): bf.addCol comes from the author's own benFuncts module; its
# behaviour is not visible here -- confirm what it returns.
Dt = bf.addCol(range(len(TIME)), DATE)


# 'wa' is not a valid file mode (Python 3 rejects it with ValueError); plain
# 'w' creates/truncates the output file.
with open('MainFile', 'w') as mf:
    # The original also passed index_names=True and a six-entry names=(...)
    # tuple. Neither is a documented DataFrame.to_csv parameter (current
    # pandas raises TypeError on unknown keywords), so they are dropped; the
    # header is written from the frame's own column names.
    pDF.to_csv(mf, header=True)

1 个答案:

答案 0 :(得分:0)

如果你要做的就是重新格式化数据并将其写回文件,这应该适用于你的例子中的文件格式:

def reformat_dataset(in_path, out_path):
    """Flatten one raw dataset file into comma-separated records.

    The input file is expected to start with four header lines:

    1. the dataset ID,
    2. the date string,
    3. two whitespace-separated values (force and ratio),
    4. a garbage line, which is discarded.

    Every following non-blank line is treated as a record: it is split on
    whitespace, prefixed with the four identifier values and written to
    ``out_path`` as one comma-joined line. Blank lines (the gap after the
    header, however many lines it has, and any trailing blank lines) are
    skipped, so they never produce identifier-only rows.

    Args:
        in_path: path of the raw dataset file to read.
        out_path: path of the file to write; it is created or truncated.

    Raises:
        StopIteration: if the input has fewer than four lines.
        ValueError: if the third line does not hold exactly two values.
    """
    with open(in_path) as in_file, open(out_path, 'w') as out_file:
        # next(f) instead of f.next(): works on Python 2.6+ and Python 3.
        dataset_name = next(in_file).strip()
        date_string = next(in_file).strip()
        force, ratio = next(in_file).split()
        next(in_file)  # Garbage line

        dataset_id = [dataset_name, date_string, force, ratio]

        for line in in_file:
            fields = line.split()
            if not fields:
                # Blank separator or trailing line -- nothing to write.
                continue
            # Prepend the dataset id and write the record to the new file.
            out_file.write(','.join(dataset_id + fields) + '\n')


if __name__ == '__main__':
    reformat_dataset('data.txt', 'new.txt')