我正在用 Python 2.6 编写一个脚本(我是 Python 新手)。我想找到最高效的方法来完成以下任务:在当前目录下的所有 .bin 文件中逐行查找两个关键字,并把关键字后面方括号内的内容写入输出文件。
我编写了以下脚本,该脚本有效,但它处理每个文件的速度都很慢。它在过去50分钟左右处理了大约118个文件:
# Question script: for every .bin file in the current directory, search each
# line for the two keyword patterns and write the bracketed value to output.txt.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below has to be inferred -- confirm against the original post.
import re, os, codecs
path = "./" #will search current directory
dir_lib = os.listdir(path)
for book in dir_lib:
if not book.endswith('.bin'): #only looks for files that have .bin extension
continue
# NOTE: the name `file` shadows the Python 2 builtin of the same name.
file = os.path.join(path, book)
text = codecs.open(file, "r", "utf-8", errors="ignore")
#had to use "ignore" because I kept getting error with binary files:
#UnicodeDecodeError: 'utf8' codec can't decode byte 0x9a in position 10:
#unexpected code byte
for lineout in text:
# Both patterns are handed to re.search as plain strings for each line;
# nothing is precompiled here.
w = re.search("(Keyword1\:)\s(\[(.+?)\])", lineout)
d = re.search("Keyword2\s(\[(.+?)\])", lineout)
# NOTE(review): output.txt is opened in 'w' (truncating) mode at this point in
# the listing, i.e. apparently once per line read -- confirm the placement.
outputfile = open('output.txt', 'w')
if w:
lineout = w.group(3) #first keyword that is between the [ ]
outputfile.write(lineout + ",")
elif d:
lineout = d.group(2) #second keyword that is between the [ ]
outputfile.write(lineout + ";")
# NOTE(review): the nesting level of the two close() calls below cannot be
# recovered from this listing.
outputfile.close()
text.close()
我的输出很好,正是我想要的:
keyword1,keyword2;keyword1,keyword2;etc,...;
但是按这个速度,连续运行大约需要一个月才能处理完所有文件。还有什么别的办法可以尝试吗,比如用其他方式替代正则表达式?有没有办法不扫描整个文件,而是在找到关键字之后就直接转到下一个文件?
感谢您的建议。
答案 0 :(得分:2)
一种取巧的办法是模仿 Unix 系统中的 grep,可以试试 http://nedbatchelder.com/code/utilities/pygrep.py :
# Demo: grep-style keyword search over a small text file using the pygrep
# helper script.
import os

# Get the pygrep script.
# NOTE(review): this downloads code over plain HTTP and imports it right away;
# inspect the downloaded file before running this anywhere that matters.
if not os.path.exists('pygrep.py'):
    os.system("wget http://nedbatchelder.com/code/utilities/pygrep.py")

from pygrep import grep, Options

# Writes a test file.
text="""This is a text
somehow there are many foo bar in the world.
sometimes they are black sheep,
sometimes they bar bar black sheep.
most times they foo foo here
and a foo foo there"""
with open('test.txt', 'w') as fout:
    fout.write(text)

# Here comes the query
queries = ['foo', 'bar']
opt = Options()  # set options for grep.
with open('test.txt', 'r') as fin:
    for i in queries:
        # Rewind before every query: the same handle is reused, and the
        # previous grep() call presumably left it at end-of-file, which would
        # make every query after the first see no input.
        fin.seek(0)
        grep(i, fin, opt)
        # Blank separator line between queries; print("") behaves the same on
        # Python 2 and Python 3, unlike a bare `print`.
        print("")
答案 1 :(得分:1)
您可以通过至少三种方式改进代码(按重要性降序排列):
1. 输出文件只打开一次,而不是每读一行就重新打开一次。
2. 在一个文件中找到两个关键字之后立即 break,不再扫描该文件的剩余部分。
3. 在循环之外预编译正则表达式。
注意:第 3 点可能收效不大,因为 re 模块会缓存最近使用过的模式。(但也没有充分的理由不这样做)
以下代码解决了这些问题:
# Scan every .bin file in the current directory for the two keyword patterns
# and append the bracketed values to output.txt ("kw1,kw2;kw1,kw2;...").
import codecs
import os
import re

path = "./"  # search the current directory
dir_lib = os.listdir(path)

# Compile once, outside all loops. Raw strings keep the backslashes for the
# regex engine instead of relying on Python passing unknown escapes through.
w_pattern = re.compile(r"(Keyword1\:)\s(\[(.+?)\])")
d_pattern = re.compile(r"Keyword2\s(\[(.+?)\])")

# The output is opened exactly once. It goes through codecs so that the
# decoded (unicode) matches are written as UTF-8 rather than being implicitly
# encoded as ASCII, which fails on non-ASCII matches under Python 2.
with codecs.open('output.txt', 'w', 'utf-8') as outputfile:
    for book in dir_lib:
        if not book.endswith('.bin'):
            continue
        filename = os.path.join(path, book)
        # errors="ignore" drops bytes that are not valid UTF-8 (binary input).
        with codecs.open(filename, "r", "utf-8", errors="ignore") as text:
            w_found, d_found = False, False
            for lineout in text:
                w = w_pattern.search(lineout)
                if w:
                    # Group 3 is the text between the [ ] after Keyword1.
                    outputfile.write(w.group(3) + ",")
                    w_found = True
                else:
                    # Only try the second pattern when the first one missed;
                    # a line that matches Keyword1 is never used for Keyword2.
                    d = d_pattern.search(lineout)
                    if d:
                        # Group 2 is the text between the [ ] after Keyword2.
                        outputfile.write(d.group(2) + ";")
                        d_found = True
                # Stop reading this file once both keywords have been seen.
                if w_found and d_found:
                    break
答案 2 :(得分:-1)
下面的代码基于一些可能适用、也可能不适用于您的数据的简化假设:关键字出现在行首;每个文件中 Keyword1 先于 Keyword2 出现;每个文件只需要各取第一次出现的位置。
所以:
import codecs
import glob
import re

# Bound ``match`` methods, so both patterns only match at the start of a line.
# Raw strings keep the regex backslashes intact.
# NOTE(review): END requires a colon after "Keyword2", unlike the pattern in
# the question -- confirm which form the data actually uses.
START = re.compile(r"Keyword1\:\s\[(.+?)\]").match
END = re.compile(r"Keyword2\:\s\[(.+?)\]").match


def main():
    """Write one "keyword1,keyword2;" pair to output.txt per matching .bin file.

    Each ``*.bin`` file in the current directory is read line by line. The
    first line matching START is located, then reading continues from that
    point until the first line matching END. A pair is written only when both
    were found; the rest of the file is not read.
    """
    # The output goes through codecs so the decoded (unicode) matches are
    # written as UTF-8 instead of being implicitly encoded as ASCII.
    with codecs.open('output.txt', 'w', 'utf-8') as outf:
        for fname in glob.glob('*.bin'):
            # errors='ignore' drops bytes that are not valid UTF-8.
            with codecs.open(fname, 'rb', 'utf-8', errors='ignore') as inf:
                w = None
                for line in inf:
                    w = START(line)
                    if w:
                        break
                # Continues on the same iterator, i.e. after the START line.
                d = None
                for line in inf:
                    d = END(line)
                    if d:
                        break
            if w and d:
                # Each pattern has exactly one capture group, so the bracketed
                # text is group(1); group(2) does not exist.
                outf.write(w.group(1) + ',' + d.group(1) + ';')


if __name__ == "__main__":
    main()