我很难把 csv 文件整理成便于在 pandas DataFrame 中处理的格式。我正在使用 UMA ADL FALL 摔倒数据集(https://figshare.com/articles/UMA_ADL_FALL_Dataset_zip/4214283)来训练一个用于检测人员摔倒的 RNN 模型,但无论是使用 Python 自带的 csv 读取器,还是使用更智能的 clevercsv 模块,都很难清理这种格式。
这是用于遍历文件并将其合并为数据框的代码:
import os
import zipfile

import clevercsv as csv
import pandas as pd
# Collect the parsed rows of every CSV file found in `directory`, one list of
# rows per file, then hand the collection to pandas.
csv_list = []
directory = r"C:\Users\20191678\OneDrive - TU Eindhoven\Engineering Design"
for filename in os.listdir(directory):
    # Test the extension rather than a substring so that names which merely
    # contain ".csv" (e.g. "data.csv.bak") are not picked up.
    if not filename.endswith(".csv"):
        continue
    # os.listdir() yields bare file names; join them with the directory so the
    # file is opened from there and not from the current working directory.
    path = os.path.join(directory, filename)
    with open(path, "r", newline="") as fp:
        # Let clevercsv guess the dialect from the full file content, then
        # rewind so the reader starts at the first line again.
        dialect = csv.Sniffer().sniff(fp.read(), verbose=True)
        fp.seek(0)
        reader = csv.reader(fp, dialect)
        rows = list(reader)
    csv_list.append(rows)
df = pd.DataFrame(csv_list)
如果有人能花时间解决这个问题并生成一个结构化的数据框,那就太好了!也欢迎提出其他清理方法。
csv文件代码本身:
% Universidad de Malaga - ETSI de Telecomunicacion (Spain)
% Date: 2017-04-14_23:38:23
% ID: Subject_01_ADL_Aplausing_1
% Name: Subject_01
% Age: 67
% Height(cm): 156
% Weight(Kg): 76
% Gender: F
% Type of Movement: ADL
% Type of Movement: FALSE
% Description of the movement: Aplausing
% Trial: 1
% Number of Sensors: 5
% Used Smartphone: LGE-lge-LG-H815-5.1
% Smartphone's Accelerometer: LGE Accelerometer - Vendor: BOSCH
% --> Version: 1
% --> Min - Max Delay: 5000us - 65535000us
% --> Maximum Range: 16.000000263891405 G
% --> Resolution: 1.2136514986004396E-4 G
% SensorTag's Accelerometer: MPU-9250 MEMS MotionTracking Device - Invensense
% --> Maximum Range: 16 G
% --> Resolution: 0.00024 G
% MAC Address; Sensor_ID; Position; Device Model
%f8:95:c7:f3:ba:82; 0; RIGHTPOCKET; lge-LG-H815-5.1
%C4:BE:84:71:A5:02; 2; WAIST; SensorTag
%C4:BE:84:70:0E:80; 3; WRIST; SensorTag
%B0:B4:48:B8:77:03; 4; ANKLE; SensorTag
%C4:BE:84:70:64:8A; 1; CHEST; SensorTag
% Sensor_Type:
% Accelerometer = 0
% Gyroscope = 1
% Magnetometer = 2
% TimeStamp; Sample No; X-Axis; Y-Axis; Z-Axis; Sensor Type; Sensor ID;
102;1;-0.1387496441602707;0.8868721723556519;0.3310287296772003;0;0
102;2;-0.1381397247314453;0.8865065574645996;0.3323715031147003;0;0
102;3;-0.1348443180322647;0.8895576596260071;0.3311501145362854;0;0
102;4;-0.1402153074741364;0.8866279125213623;0.3337142467498779;0;0
102;5;-0.1391168385744095;0.8862622380256653;0.3345684409141541;0;0
102;6;-0.138628289103508;0.8871164321899414;0.3346897959709168;0;0
102;7;-0.1367969810962677;0.8880935311317444;0.3412821888923645;0;0
102;8;-0.138628289103508;0.8883378505706787;0.3398165106773377;0;0
102;9;-0.1409481465816498;0.8901675939559937;0.3401837050914764;0;0
102;10;-0.1418023407459259;0.8891920447349548;0.3418920934200287;0;0
102;11;-0.1430221647024155;0.8882149457931519;0.3420134484767914;0;0
103;12;-0.143510714173317;0.8880935311317444;0.3422577381134033;0;0
103;13;-0.1439992785453796;0.8838210105895996;0.3379867672920227;0;0
103;14;-0.1431450843811035;0.8795484900474548;0.3353012502193451;0;0
103;15;-0.1438763588666916;0.8766187429428101;0.3331027626991272;0;0
103;16;-0.1429008096456528;0.8790599703788757;0.3321272134780884;0;0
103;17;-0.142656534910202;0.8779615163803101;0.3343241512775421;0;0
103;18;-0.1409481465816498;0.8801584243774414;0.3348127007484436;0;0
103;19;-0.1429008096456528;0.8816241025924683;0.3376195728778839;0;0
103;20;-0.1457076668739319;0.8821110725402832;0.3385966718196869;0;0
109;21;-0.1441206336021423;0.8832111358642578;0.3412821888923645;0;0
115;22;-0.1387496441602707;0.8832111358642578;0.3404279947280884;0;0
115;23;-0.1391168385744095;0.8822340369224548;0.3404279947280884;0;0
121;24;-0.1375298053026199;0.8843095898628235;0.3399394154548645;0;0
126;25;-0.1369199007749558;0.8868721723556519;0.337375283241272;0;0
133;26;-0.1375298053026199;0.8854080438613892;0.331394374370575;0;0
答案 0 :(得分:1)
类似下面这样的代码应该可以帮你起步。
from pprint import pprint
def try_number(s):
    """Convert the string *s* to a number when possible.

    An integer literal is returned as ``int``; anything else that ``float()``
    accepts is returned as ``float``. This includes exponent notation without
    a decimal point such as ``"5E-4"``, and also ``"nan"`` / ``"inf"``.
    Strings that are not numeric are returned unchanged.
    """
    try:
        return int(s, 10)
    except ValueError:
        pass
    # Not a plain integer: fall back to float so that values are not left as
    # strings merely because they lack a "." character.
    try:
        return float(s)
    except ValueError:
        return s
def read_umafall(fp):
    """Split the lines of an UMAFall recording into three parts.

    *fp* is any iterable of text lines. Each line is stripped and then
    classified:

    * lines starting with ``%`` that contain ``": "`` are stored in
      ``metadata`` as key/value pairs (a repeated key keeps the last value);
    * other lines starting with ``%`` are kept verbatim in ``header_lines``;
    * remaining lines containing ``;`` are split on it, each field is passed
      through ``try_number``, and the row is appended to ``data``;
    * any other non-empty line is printed with a ``???`` prefix.

    Returns a dict with the keys ``header_lines``, ``metadata`` and ``data``.
    """
    metadata, header_lines, data = {}, [], []
    for raw in fp:
        line = raw.strip()
        if not line:
            continue
        if line.startswith("%"):
            if ": " not in line:
                header_lines.append(line)
                continue
            # Drop the leading "%" and split on the first ": " only, so
            # values may themselves contain ": ".
            key, _, value = line[1:].partition(": ")
            metadata[key.strip()] = value
            continue
        if ";" in line:
            data.append(list(map(try_number, line.split(";"))))
        else:
            print("???", line)
    return dict(header_lines=header_lines, metadata=metadata, data=data)
# Parse one recording and show the metadata, the remaining header lines and
# the first ten data rows.
sample_path = "UMAFall_Subject_01_ADL_HandsUp_2_2017-04-14_23-33-21.csv"
with open(sample_path, "r") as fp:
    result = read_umafall(fp)

pprint(result["metadata"])
pprint(result["header_lines"])
first_rows = result["data"][:10]
pprint(first_rows)
输出示例如下:
{'--> Maximum Range': '16 G',
'--> Min - Max Delay': '5000us - 65535000us',
'--> Resolution': '0.00024 G',
'--> Version': '1',
'Age': '67',
'Date': '2017-04-14_23:33:21',
'Description of the movement': 'HandsUp',
'Gender': 'F',
'Height(cm)': '156',
'ID': 'Subject_01_ADL_HandsUp_2',
'Name': 'Subject_01',
'Number of Sensors': '5',
"SensorTag's Accelerometer": 'MPU-9250 MEMS MotionTracking Device - '
'Invensense',
"Smartphone's Accelerometer": 'LGE Accelerometer - Vendor: BOSCH',
'Trial': '2',
'Type of Movement': 'FALSE',
'Used Smartphone': 'LGE-lge-LG-H815-5.1',
'Weight(Kg)': '76'}
['% Universidad de Malaga - ETSI de Telecomunicacion (Spain)',
'% MAC Address; Sensor_ID; Position; Device Model',
'%f8:95:c7:f3:ba:82; 0; RIGHTPOCKET; lge-LG-H815-5.1',
'%C4:BE:84:71:A5:02; 2; WAIST; SensorTag',
'%C4:BE:84:70:0E:80; 3; WRIST; SensorTag',
'%B0:B4:48:B8:77:03; 4; ANKLE; SensorTag',
'%C4:BE:84:70:64:8A; 1; CHEST; SensorTag',
'% Sensor_Type:',
'% Accelerometer = 0',
'% Gyroscope = 1',
'% Magnetometer = 2',
'% TimeStamp; Sample No; X-Axis; Y-Axis; Z-Axis; Sensor Type; Sensor ID;']
[[371, 1, -0.01265575457364321, 0.9133599400520325, -0.1938552260398865, 0, 0],
[371, 2, -0.01839394308626652, 0.9126286506652832, -0.1926354020833969, 0, 0],
[371, 3, -0.01802674867212772, 0.9129943251609802, -0.1948323398828507, 0, 0],
[371, 4, -0.02352065965533257, 0.9167782664299011, -0.1969063729047775, 0, 0],
[371, 5, -0.02315346524119377, 0.9209294319152832, -0.2019117176532745, 0, 0],
[371, 6, -0.01888094283640385, 0.9211721420288086, -0.203375831246376, 0, 0],
[371, 7, -0.0208351630717516, 0.9270316958427429, -0.2050857692956924, 0, 0],
[371, 8, -0.01924813725054264, 0.9303271174430847, -0.2070384472608566, 0, 0],
[371, 9, -0.01766111142933369, 0.9342340230941772, -0.2080155462026596, 0, 0],
[371, 10, -0.01265575457364321, 0.9388721585273743, -0.2115552425384522, 0, 0]]
that is:
* first the header lines that could be parsed as key-value pairs
* other header lines
* the data
You can hopefully trust each file to have the data in the same order (`TimeStamp; Sample No; X-Axis; Y-Axis; Z-Axis; Sensor Type; Sensor ID`).