This is my first question, so please forgive any mistakes.
I have a large CSV file with many rows (~10,000,000+) of data, as in the example below:
date;box_id;box_length;box_width;box_height;weight;type
--snip--
1999-01-01 00:00:20;nx1124;10;4;5.5;2.1;oversea
1999-01-01 00:00:20;np11r4;8;3.25;2;4.666;local
--snip--
My goal is to read each row, calculate the volume of the box, and, within a 1-hour window (e.g. 00:00:00 to 00:59:59), check whether there are 2 or more boxes with similar volume (within a ±10% difference) and, if so, record their timestamps and types.
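Concretely, the check I have in mind looks roughly like this (just a sketch; I measure the ±10% difference relative to one of the two volumes):

def volume(length, width, height):
    return length * width * height

def similar_volume(vol_a, vol_b, tolerance=0.1):
    # relative difference, measured against the first volume
    return abs(vol_a - vol_b) / vol_a <= tolerance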
Currently I am using a brute-force approach:
For example, if my 1-hour window contains boxes 1, 2, 3, 4, ..., I do this:
1
2 == 1
3 == 1 then == 2
4 == 1 then == 2 then == 3
5 == 2 then == 3 then == 4 # removed 1 from the list (the 1-hour window moved forward)
6 == 2 then == 3 then == 4 then == 5
7 == 2 then == 3 then == 4 then == 5 then == 6
... and so on ...
This is the best I could come up with, since I have to compare each box against every other box within the given time window. But at the moment it is very slow.
I am looking for a better algorithm, but I am not sure which direction to take. I am trying to learn some good tools (Pandas is my favourite so far), but I assume I first need to work out an algorithm that lets these tools process the data the way I need.
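For reference, this is roughly the direction I was imagining with Pandas (only a sketch, using the column names from the sample above and clock-aligned hourly buckets; 'boxes.csv' is a placeholder file name and I have not verified how this scales to the full file):

import pandas as pd

df = pd.read_csv('boxes.csv', sep=';', parse_dates=['date'])
df['volume'] = df.box_length * df.box_width * df.box_height

# group rows into clock-aligned 1-hour buckets
for hour, group in df.groupby(pd.Grouper(key='date', freq='h')):
    # sorting by volume puts similar volumes next to each other,
    # so only neighbouring rows need to be compared
    g = group.sort_values('volume')
    rel_diff = g.volume.diff() / g.volume
    # flag both members of every neighbouring pair within 10%
    mask = (rel_diff <= 0.1) | (rel_diff.shift(-1) <= 0.1)
    hits = g[mask]
    if not hits.empty:
        print(hour)
        print(hits[['date', 'box_id', 'volume', 'type']])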
If it helps, I will post my Python code (source).
Update: Below is my code. I have omitted a few lines (e.g. try/except blocks for invalid file paths/formats, type-conversion error handling, etc.). For now I have tailored the code to a 5-second window.
Here is the Box class:
from datetime import datetime
from time import mktime


class Box(object):
    """ Box model """

    def __init__(self, data_set):
        self.date = data_set[0]
        self.timestamp = self.__get_time()
        self.id = data_set[1]
        self.length = float(data_set[2])
        self.width = float(data_set[3])
        self.height = float(data_set[4])
        self.weight = float(data_set[5])  # sample weights such as 2.1 are not valid ints
        self.volume = self.__get_volume()

    def __get_time(self):
        """ convert from date string to unix-timestamp """
        str_format = '%Y-%m-%d %H:%M:%S'
        t_tuple = datetime.strptime(self.date, str_format).timetuple()
        return mktime(t_tuple)

    def __get_volume(self):
        """ calculate volume of the box """
        return (self.length * self.width * self.height)
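For illustration, constructing a Box from the first sample row gives the expected volume of 10 * 4 * 5.5 = 220:

row = "1999-01-01 00:00:20;nx1124;10;4;5.5;2.1;oversea".split(';')
box = Box(row)
print(box.volume)  # 220.0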
Below is the actual program that performs the comparison. For convenience, I have combined the utility file and the main.py file:
from csv import reader
from io import open as open_file
from os import path
from sys import argv, exit
from time import time

# custom lib
from Box import Box


def main():
    file_name = str.strip(argv[1])
    boxes_5s = []
    diff = 0
    similar_boxes = []
    for row in get_file(file_name):
        if row:
            box = Box(row)
            if len(boxes_5s) > 0:
                diff = box.timestamp - boxes_5s[0].timestamp
                if diff < 6:
                    boxes_5s.append(box)
                else:
                    similar_boxes += get_similar(boxes_5s)
                    del boxes_5s[0]  # remove the oldest box
                    boxes_5s.append(box)
            else:
                boxes_5s.append(box)
    print(similar_boxes)


def get_file(file_name):
    """ open and return csv file pointer line by line """
    with open_file(file_name, 'rb') as f:
        header = f.readline()
        print(header)
        rows = reader(f, delimiter=';')
        for r in rows:
            yield r
        else:
            yield ''


def get_similar(box_list):
    """ compare boxes for similar volume """
    num_boxes = len(box_list)
    similar_boxes = []
    record_str = "Box#{} Volm:{} and #{} Volm:{}"
    for i in xrange(num_boxes):
        box_1 = box_list[i]
        for j in xrange(i + 1, num_boxes):
            box_2 = box_list[j]
            vol_diff = abs((box_1.volume - box_2.volume) / box_1.volume) <= 0.1
            if vol_diff:
                similar_boxes.append(record_str.format(box_1.id, box_1.volume, box_2.id, box_2.volume))
    return similar_boxes


if __name__ == "__main__":
    main()
Thank you.
Answer 0 (score: 1)
Taking the first timestamp as the start of the one-hour window (instead of clock-aligned hourly bins that always start at hh:00:00), I think this should be a quite feasible implementation for a data volume as small as a few tens of millions of rows (it expects the entries in the file to be ordered by time):
#! /usr/bin/env python
from __future__ import print_function

import csv
import datetime as dt
import math
import collections

FILE_PATH_IN = './box_data_time_ordered_100k_sparse.csv'
TS_FORMAT = '%Y-%m-%d %H:%M:%S'
TS_TOKEN = 'date'
SIMILAR_ENOUGH = 0.1

BoxEntry = collections.namedtuple(
    'BoxEntry', ['start_ts', 'a_ts', 't_type', 'b_volume'])


def box_volume(box_length, box_width, box_height):
    """Volume in cubic of length units given."""
    return box_length * box_width * box_height


def filter_similar_box_volumes(box_entries):
    """Ordered binary similarity comparator using complex algorithm
    on a medium large slice of data."""
    def _key(r):
        """sort on volume."""
        return r.b_volume

    entries_volume_ordered = sorted(box_entries, key=_key)
    collector = []
    for n, box_entry in enumerate(entries_volume_ordered[1:], start=1):
        one = box_entry.b_volume
        prev_box_entry = entries_volume_ordered[n - 1]
        previous = prev_box_entry.b_volume
        if one and math.fabs(one - previous) / one < SIMILAR_ENOUGH:
            if box_entry not in collector:
                collector.append(box_entry)
            if prev_box_entry not in collector:
                collector.append(prev_box_entry)
    return collector


def hourly_boxes_gen(file_path):
    """Simplistic generator, yielding hour slices of parsed
    box data lines belonging to 1 hour window per yield."""
    csv.register_dialect('boxes', delimiter=';', quoting=csv.QUOTE_NONE)
    start_ts = None
    cx_map = None
    hour_data = []
    an_hour = dt.timedelta(hours=1)
    with open(file_path, 'rt') as f_i:
        for row in csv.reader(f_i, 'boxes'):
            if cx_map is None and row and row[0] == TS_TOKEN:
                cx_map = dict(zip(row, range(len(row))))
                continue
            if cx_map and row:
                a_ts = dt.datetime.strptime(row[cx_map[TS_TOKEN]], TS_FORMAT)
                t_type = row[cx_map['type']]
                b_length = float(row[cx_map['box_length']])
                b_width = float(row[cx_map['box_width']])
                b_height = float(row[cx_map['box_height']])
                b_volume = box_volume(b_length, b_width, b_height)
                if start_ts is None:
                    start_ts = a_ts
                    hour_data.append(
                        BoxEntry(start_ts, a_ts, t_type, b_volume))
                elif a_ts - an_hour < start_ts:
                    hour_data.append(
                        BoxEntry(start_ts, a_ts, t_type, b_volume))
                else:
                    yield filter_similar_box_volumes(hour_data)
                    hour_data = [BoxEntry(start_ts, a_ts, t_type, b_volume)]
                    start_ts = a_ts
    if hour_data:
        yield filter_similar_box_volumes(hour_data)


def main():
    """Do the thing."""
    for box_entries in hourly_boxes_gen(FILE_PATH_IN):
        for box_entry in box_entries:
            print(box_entry.start_ts, box_entry.a_ts, box_entry.t_type)


if __name__ == '__main__':
    main()
Using the sample input file:
date;box_id;box_length;box_width;box_height;weight;type
1999-01-01 00:00:20;nx1124;10;4;5.5;2.1;oversea
1999-01-01 00:00:20;np11r4;8;3.25;2;4.666;local
1999-01-01 00:10:20;np11r3;8;3.25;2.1;4.665;local
1999-01-01 00:20:20;np11r2;8;3.25;2.05;4.664;local
1999-01-01 00:30:20;np11r1;8;3.23;2;4.663;local
1999-01-01 00:40:20;np11r0;8;3.22;2;4.662;local
1999-01-01 00:50:20;dp11r4;8;3.24;2;4.661;local
1999-01-01 01:00:20;cp11r3;8;3.25;2;4.666;local
1999-01-01 01:01:20;bp11r2;8;3.26;2;4.665;local
1999-01-01 01:02:20;ap11r1;8;3.22;2;4.664;local
1999-01-01 01:03:20;zp11r0;12;3.23;2;4.663;local
1999-01-01 02:00:20;yp11r4;8;3.24;2;4.662;local
1999-01-01 04:00:20;xp11r4;8;3.25;2;4.661;local
1999-01-01 04:00:21;yy11r4;8;3.25;2;4.661;local
1999-01-01 04:00:22;xx11r4;8;3.25;2;4.661;oversea
1999-01-01 04:59:19;zz11r4;8;3.25;2;4.661;local
it yields:
1999-01-01 00:00:20 1999-01-01 00:30:20 local
1999-01-01 00:00:20 1999-01-01 00:50:20 local
1999-01-01 00:00:20 1999-01-01 00:00:20 local
1999-01-01 00:00:20 1999-01-01 00:20:20 local
1999-01-01 00:00:20 1999-01-01 00:10:20 local
1999-01-01 00:00:20 1999-01-01 00:00:20 oversea
1999-01-01 00:00:20 1999-01-01 01:00:20 local
1999-01-01 01:00:20 1999-01-01 01:01:20 local
1999-01-01 01:00:20 1999-01-01 01:03:20 local
1999-01-01 04:00:20 1999-01-01 04:00:21 local
1999-01-01 04:00:20 1999-01-01 04:00:22 oversea
1999-01-01 04:00:20 1999-01-01 04:59:19 local
Some notes:
- the csv module is used for reading, with a specific dialect (semicolon is not the default delimiter)
- datetime is imported under an alias, so the strptime method of the datetime class can be reached without shadowing the module name - YMMV
- the chunked hour-window reader is wrapped in a generator function
- volume and similarity calculations live in separate functions
- a simple filtering algorithm over the volume-ordered entries, which should be roughly O(m), where m is the number of candidate matches
- namedtuples are used for compact storage with meaningful member access
- to implement a clock-aligned 1-hour window (instead of bootstrapping it with the first timestamp) the code needs a small adjustment, but that should be trivial (see the sketch after these notes)
- otherwise, curiously awaiting the code sample from the OP ;-)
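As a sketch of that clock alignment (untested against the full data set), one could truncate every timestamp to its full hour and use that as start_ts:

def clock_hour(a_ts):
    """Truncate a timestamp to the start of its clock hour."""
    return a_ts.replace(minute=0, second=0, microsecond=0)

# inside hourly_boxes_gen the window bookkeeping would then become roughly:
#     if start_ts is None:
#         start_ts = clock_hour(a_ts)
#         hour_data.append(BoxEntry(start_ts, a_ts, t_type, b_volume))
#     elif clock_hour(a_ts) == start_ts:
#         hour_data.append(BoxEntry(start_ts, a_ts, t_type, b_volume))
#     else:
#         yield filter_similar_box_volumes(hour_data)
#         start_ts = clock_hour(a_ts)
#         hour_data = [BoxEntry(start_ts, a_ts, t_type, b_volume)]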
Update: reworked the similar-enough filtering algorithm to cope with event-rich hours, so that an O(n^2) algorithm does not eat up all our time ... (removed the naive nested-loop variant).
Adding a day's worth of per-second entries to the sample, with around 3600 candidates per hour for the similarity check, these roughly 100k lines (86400+) take about 10 seconds.