425 425 -3 15000 15000 900 385 315 3 370 330 2 340 330 2
325 315 2 325 240 2 340 225 2 370 225 2 385 240 2
385 315 2 475 240 3 460 240 2 460 255 2 475 255 2
475 240 2 595 315 3 580 330 2 550 330 2 535 315 2
535 240 2 550 225 2 580 225 2 595 240 2 595 315 2
700 315 3 685 330 2 655 330 2 640 315 2 640 240 2
655 225 2 685 225 2 700 240 2 700 315 2 700 315 3
9076 456 2 9102 449 2 9127 443 2 9152 437 2 9178 433 2
9203 430 2 9229 428 2 9254 427 2 9280 425 2 9305 425 2
0 0 999 6865 259999
第一个数字是下一个文本块中的点数,随后的文本块包含这些点,每行最多5个点。每个点有3个分量(我将它们称为x、y、z)。x和y各占6个字符,z占4个字符,所以每个点需要16个字符。偶尔z为9999,导致y和z之间没有空格,因此无法用split()来解析这些行。此外,所有数字都是整数(没有小数),但有一些负数。
在实际文件中,块通常为1000个点,一些块较小(在“页面”的末尾,其中分页符由z = 9999表示)
import re
def get_points_regex(filename):
    """Parse fixed-width points from *filename* with a regular expression.

    Each point is matched as three adjacent fields of 6, 6 and 4
    characters (x, y, z) made of spaces, digits and minus signs.  The
    fields are matched by width, so values that touch each other (no
    separating space) are still split correctly.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``(x, y, z)`` integer tuples in file order.
    """
    with open(filename, 'r') as f:
        text = f.read()
    # The original loop built each tuple but never stored it, so the
    # function always returned an empty list.
    return [
        tuple(int(field) for field in m.groups())
        for m in re.finditer(r'([ \d-]{6})([ \d-]{6})([ \d\-]{4})', text)
    ]
我的测试文件长55283行(4.4 MB),包含274761个点。
def get_points_regex2(path=None):
    """Parse fixed-width points using a single ``re.findall`` pass.

    Args:
        path: Path of the text file to read.  When omitted, the
            module-level ``filename`` variable is used, which is what the
            original zero-argument version always did.

    Returns:
        A list of ``(x, y, z)`` integer tuples in file order.
    """
    if path is None:
        # Backward-compatible fallback to the global the function used to
        # depend on implicitly.
        path = filename
    with open(path, 'r') as f:
        text = f.read()
    triples = re.findall(r'([ \d-]{6})([ \d-]{6})([ \d\-]{4})', text)
    return [tuple(map(int, triple)) for triple in triples]
def get_points_simple(path=None):
    """Parse fixed-width points by slicing each line into 16-char chunks.

    Every complete 16-character chunk of a line is read as three integer
    fields of 6, 6 and 4 characters.  Lines shorter than 16 characters
    contribute no points.

    Args:
        path: Path of the text file to read.  When omitted, the
            module-level ``filename`` variable is used, which is what the
            original zero-argument version always did.

    Returns:
        A list of ``(x, y, z)`` integer tuples in file order.
    """
    if path is None:
        # Backward-compatible fallback to the global the function used to
        # depend on implicitly.
        path = filename
    points = []
    with open(path, 'r') as f:
        for line in f:
            # Integer division: only whole 16-char chunks are parsed, any
            # remainder (such as the trailing newline) is ignored.
            usable = len(line) // 16 * 16
            for start in range(0, usable, 16):
                x = int(line[start:start + 6])
                y = int(line[start + 6:start + 12])
                z = int(line[start + 12:start + 16])
                points.append((x, y, z))
    return points
def get_points_cython(filename):
    """Parse fixed-width points from *filename* (Cython, typed locals).

    Every complete 16-character chunk of a line is read as three integer
    fields of 6, 6 and 4 characters.  Lines shorter than 16 characters
    contribute no points.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``(x, y, z)`` integer tuples in file order.
    """
    cdef int i, x, y, z, n_chunks
    points = []
    # ``with`` guarantees the handle is closed; the original left it open.
    with open(filename, 'r') as f:
        for line in f:
            n_chunks = len(line) // 16
            for i in range(n_chunks):
                chunk = line[16*i:16*(i+1)]
                x = int(chunk[0:6])
                y = int(chunk[6:12])
                z = int(chunk[12:16])
                points.append((x, y, z))
    return points
cython函数在196毫秒内运行。(比纯python快2倍)
我想尝试用C数组代替元组列表,但python发生了段错误(我猜是这样,IPython内核死掉了)。我声明了cdef int points[1000000][3],
然后在递增j的同时用points[j][1] = x这样的语句赋值。
import numpy as np
cimport numpy as np
# Element dtype of the point array.  ``np.int`` was only an alias for the
# builtin ``int`` and has been removed from NumPy; ``np.int_`` names the
# same default integer dtype explicitly and pairs with ``np.int_t`` below.
DTYPE = np.int_
ctypedef np.int_t DTYPE_t
def get_points_cython_numpy(filename):
    """Parse fixed-width points from *filename* into a NumPy array.

    Every complete 16-character chunk of a line is read as three integer
    fields of 6, 6 and 4 characters and stored in the next free row.

    Args:
        filename: Path of the text file to read.

    Returns:
        The full pre-allocated ``(1000000, 3)`` array of dtype ``DTYPE``.
        Row ``k`` holds the k-th parsed point; rows past the last parsed
        point keep their initial zeros.

    Raises:
        IndexError: If the file holds more than 1000000 points.
            NOTE(review): assumes indexing on the untyped ``np.ndarray``
            goes through NumPy's normal bounds check -- confirm.
    """
    cdef int i, j, x, y, z, n_chunks
    cdef np.ndarray points = np.zeros([1000000, 3], dtype=DTYPE)
    j = 0
    # ``with`` guarantees the handle is closed; the original left it open.
    with open(filename, 'r') as f:
        for line in f:
            n_chunks = len(line) // 16
            for i in range(n_chunks):
                chunk = line[16*i:16*(i+1)]
                x = int(chunk[0:6])
                y = int(chunk[6:12])
                z = int(chunk[12:16])
                points[j, 0] = x
                points[j, 1] = y
                points[j, 2] = z
                j = j + 1
    return points
我是否遗漏了使用cython或python std lib显而易见的东西,这会使解析速度更快,或者这个速度与这个大小的文件一样快?
我考虑过pandas和numpy加载函数,但我认为块大小的行会使它太复杂。有一次,我有一些工作与pandas read_fwf后跟DataFrame.values.reshape(-1,3),然后删除带有NaNs的行,但我知道那时候必须要慢一些。
我希望能够在100毫秒以下获得这一点,以便在生成GUI时可以快速更新GUI。 (移动滑块>运行背景分析>加载数据>实时绘制结果)。
答案 0 :(得分:2)
import numpy as np
cimport numpy as np
import cython
cdef int fast_atoi(char *buff):
    """Convert a NUL-terminated ASCII field to an int.

    Scans bytes until the first NUL.  A ``-`` anywhere in the field marks
    the result negative, decimal digits are accumulated, and every other
    byte (for example padding spaces) is skipped.

    Args:
        buff: Pointer to a NUL-terminated character buffer.

    Returns:
        The parsed integer, or 0 when the field contains no digits.
    """
    cdef int c = 0, sign = 0, x = 0
    cdef char *p = buff
    while True:
        c = p[0]
        if c == 0:
            # NUL terminator reached.  Without this ``break`` the branch is
            # empty and the loop can never end.
            break
        if c == 45:  # ord('-')
            sign = 1
        elif c > 47 and c < 58:  # ord('0') .. ord('9')
            x = x * 10 + c - 48
        p += 1
    return -x if sign else x
def get_points_cython_numpy(filename):
    """Parse fixed-width points from *filename* into an int32 array.

    The file is read as bytes.  Every complete 16-byte chunk of a line is
    split into fields of 6, 6 and 4 bytes, each converted with
    ``fast_atoi``, and stored in the next free row of a pre-allocated
    buffer.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(n, 3)`` view of the buffer holding only the ``n`` parsed
        points.  The buffer has room for 500000 points.
        NOTE(review): writing past that presumably raises ``IndexError``
        while Cython bounds checking is enabled -- confirm the build flags.
    """
    cdef int i, j, x, y, z, n_chunks
    cdef bytes line, chunk
    cdef int[:, ::1] points = np.zeros([500000, 3], np.int32)
    j = 0
    # ``with`` guarantees the handle is closed; the original left it open.
    with open(filename, 'rb') as f:
        for line in f:
            n_chunks = len(line) // 16
            for i in range(n_chunks):
                chunk = line[16*i:16*(i+1)]
                x = fast_atoi(chunk[0:6])
                y = fast_atoi(chunk[6:12])
                z = fast_atoi(chunk[12:16])
                points[j, 0] = x
                points[j, 1] = y
                points[j, 2] = z
                j = j + 1
    return points.base[:j]
cdef inline int fast_atoi(char *buf, int size):
    """Convert at most *size* ASCII bytes starting at *buf* to an int.

    A ``-`` anywhere in the field marks the result negative, decimal
    digits are accumulated, and every other byte (for example padding
    spaces) is skipped.  Scanning stops early at a NUL byte.

    Args:
        buf: Pointer to the first byte of the field.
        size: Maximum number of bytes to examine.

    Returns:
        The parsed integer, or 0 when the field contains no digits.
    """
    cdef int i = 0, c = 0, sign = 0, x = 0
    for i in range(size):
        c = buf[i]
        if c == 0:
            # NUL byte: end of data.  The branch had no body without this
            # ``break``.
            break
        if c == 45:  # ord('-')
            sign = 1
        elif c > 47 and c < 58:  # ord('0') .. ord('9')
            x = x * 10 + c - 48
    return -x if sign else x
def fastest_read_points(fn):
    """Read every point of file *fn* in one pass over its raw bytes.

    The file is loaded whole and line endings are removed, leaving a flat
    byte string of blocks.  Each block is a 10-byte point count followed
    by that many 16-byte points (fields of 6, 6 and 4 bytes), all
    converted with ``fast_atoi``.

    Args:
        fn: Path of the file to read.

    Returns:
        A ``(n, 3)`` int32 array view holding the ``n`` parsed points.
        Parsing stops quietly at the end of the data, so a truncated file
        or an overstated block count yields only the points actually
        present.
    """
    cdef bytes buf
    cdef char * p
    cdef char * buf_end
    cdef int length, count
    cdef int[:, ::1] res
    cdef int i, j, block_count
    with open(fn, "rb") as f:
        # Drop both "\r" and "\n" so Unix and Windows line endings work.
        buf = f.read().replace(b"\r", b"").replace(b"\n", b"")
    p = buf
    length = len(buf)
    buf_end = p + length
    # Each stored point consumes 16 bytes, so this is always enough rows.
    count = length // 16
    res = np.zeros((count, 3), np.int32)
    i = 0
    # Require a complete 10-byte header before reading a block count.
    while buf_end - p >= 10:
        block_count = fast_atoi(p, 10)
        p += 10
        for j in range(block_count):
            if buf_end - p < 16:
                # Fewer bytes left than one point: never read past the
                # end of the buffer.
                break
            res[i, 0] = fast_atoi(p, 6)
            res[i, 1] = fast_atoi(p+6, 6)
            res[i, 2] = fast_atoi(p+12, 4)
            p += 16
            i += 1
    return res.base[:i]
答案 1 :(得分:1)
def read_chunk_numpy(fh, n_points):
    """Read one chunk of *n_points* fixed-width points with NumPy.

    Args:
        fh: Open binary file object positioned at the first point of the
            chunk.  It must be a real file, as ``np.fromfile`` needs one.
        n_points: Number of points in the chunk.

    Returns:
        A structured array with integer fields ``x``, ``y`` and ``z``,
        one record per point.
    """
    # 16 chars per point, plus one newline per line of up to 5 points.
    # The line count is ceil(n_points / 5); the previous ``(n + 1) // 5``
    # came up one short whenever n % 5 was 1, 2 or 3, leaving a stray
    # newline unread in the file.
    n_bytes = n_points * 16 + (n_points + 4) // 5
    txt_arr = np.fromfile(fh, 'S1', n_bytes)
    txt_arr = txt_arr[txt_arr != b'\n']
    # Reinterpret the flat characters as 6/6/4-wide text fields, then
    # convert each field to an integer.
    xyz = txt_arr.view('S6,S6,S4').astype('i,i,i')
    xyz.dtype.names = 'x', 'y', 'z'
    return xyz
import numpy as np
def write_testfile(fname, n_points):
    """Write a random test file in the chunked fixed-width point format.

    ``n_points // 1000`` chunks are written.  Each chunk is a header line
    with the chunk's point count right-justified to 8 characters,
    followed by that many random points, 5 per line, each formatted as
    ``%6i%6i%4i``.

    Args:
        fname: Path of the file to create (overwritten if it exists).
        n_points: Approximate total number of points; each chunk holds a
            random count from 900 to 1099.
    """
    with open(fname, 'wb') as fh:
        for _ in range(n_points // 1000):
            n_chunk = np.random.randint(900, 1100)
            # The file is opened in binary mode, so encode explicitly;
            # writing ``str`` directly fails on Python 3.
            fh.write((str(n_chunk).rjust(8) + '\n').encode('ascii'))
            xyz = np.random.randint(10**4, size=(n_chunk, 3))
            for i in range(0, n_chunk, 5):
                row_text = ''.join(
                    '%6i%6i%4i' % tuple(row) for row in xyz[i:i + 5]
                )
                # End every line of up to 5 points with a newline; the
                # line-based readers in this file depend on it.
                fh.write((row_text + '\n').encode('ascii'))
def read_chunk_plain(fh, n_points):
    """Read one chunk of *n_points* fixed-width points line by line.

    Every complete 16-character piece of a line is read as three integer
    fields of 6, 6 and 4 characters.

    Args:
        fh: Open file object positioned at the first point of the chunk.
        n_points: Number of points to read.

    Returns:
        A list of ``(x, y, z)`` integer tuples.  It holds fewer than
        *n_points* entries only when the file ends first.
    """
    points = []
    if n_points <= 0:
        # Nothing to read; the loop below could never hit its exit test.
        return points
    # Use while-loop because `for line in fh` would mess with file pointer
    while True:
        line = fh.readline()
        if not line:
            # End of file: readline() keeps returning an empty value, so
            # without this check the loop would spin forever.
            return points
        for start in range(0, len(line) // 16 * 16, 16):
            x = int(line[start:start + 6])
            y = int(line[start + 6:start + 12])
            z = int(line[start + 12:start + 16])
            points.append((x, y, z))
            if len(points) == n_points:
                return points
def test(fname, read_chunk):
    """Run *read_chunk* over every chunk of the file *fname*.

    The file is opened in binary mode.  Each chunk begins with a header
    line holding its point count; ``read_chunk(fh, count)`` is then
    called to consume the chunk's points.  Processing ends at the first
    header line that is empty after stripping, which includes end of
    file.

    Args:
        fname: Path of the file to read.
        read_chunk: Callable taking ``(file_object, n_points)``.
    """
    with open(fname, 'rb') as fh:
        while True:
            header = fh.readline().strip()
            if not header:
                break
            read_chunk(fh, int(header))
# Benchmark driver: generate a test file, then time both chunk readers on it.
fname = 'test.txt'
write_testfile(fname, 10**5)
# NOTE: %timeit is an IPython magic, so the two lines below only run inside
# an IPython/Jupyter session, not under the plain Python interpreter.
%timeit test(fname, read_chunk_numpy)
%timeit test(fname, read_chunk_plain)