问题陈述:
我在远程服务器上有大量(1000+)*.gz 文件。我需要读取这些文件并检查其中是否包含某些字符串;如果匹配,就返回对应的文件名。我尝试了下面的代码,程序可以运行,但由于涉及大量 I/O,效率似乎不高。能否推荐一种更高效的做法?
我的代码:
import gzip
import os
import paramiko
import multiprocessing
from bisect import insort
# Manager whose list() is used in miniAnalyze to create the list of matching
# file names that the worker processes append to.
synchObj=multiprocessing.Manager()
# SSH connection settings for the remote log server.
# NOTE(review): the credentials are hard-coded in source — consider loading
# them from the environment or a key file instead.
hostname = '192.168.1.2'
port = 22
username='may'
password='Apa$sW0rd'
# Entry point called from the __main__ guard; creates the shared list that
# collects the names of files containing a match.
# NOTE(review): indentation was lost in this listing. analyze_the_file reads
# ifile_list, so the two defs that follow were presumably nested inside this
# function — confirm. No call to assign_to_proc is visible in this listing.
def miniAnalyze():
ifile_list=synchObj.list([]) # A synchronized list to Store the File names containing the matched String.
# Worker for a single remote file. Per the inline comments, the file is copied
# to the local machine, then gunzipped and scanned for the hard-coded error
# strings; a hit is printed and recorded in ifile_list, and the local copy is
# removed in either case.
# Parameter: file_single - one line of the remote `find` output, passed in by
# assign_to_proc.
# NOTE(review): the body uses `filename` and `ftp`, neither of which is defined
# in the visible code — presumably both are set in the elided section; confirm.
def analyze_the_file(file_single):
strings = ("error 72","error 81",) # Hard Coded the Strings that needs to be searched.
try:
ssh=paramiko.SSHClient()
#Code to FTP the file to local system from the remote machine.
.....
........
path_f='/home/user/may/'+filename
#Read the Gzip file in local system after FTP is done
# The whole decompressed content is read into memory before it is searched.
with gzip.open(path_f, 'rb') as f:
contents = f.read()
if any(s in contents for s in strings):
print "File " + str(path_f) + " is a hit."
insort(ifile_list, filename) # Push the file into the list if there is a match.
os.remove(path_f)
else:
os.remove(path_f)
except Exception, ae:
print "Error while Analyzing file "+ str(ae)
finally:
if ifile_list:
# NOTE(review): ifile_list is the manager list proxy created in miniAnalyze, so
# concatenating it to a str raises TypeError here; str(ifile_list) is probably
# what was intended.
print "The Error is at "+ ifile_list
ftp.close()
ssh.close()
def assign_to_proc(max_procs=8):
    """List the remote ``*.gz`` files and analyze them in bounded batches.

    Runs ``find`` on the remote host over SSH, then hands every returned path
    to ``analyze_the_file`` in its own process, running at most ``max_procs``
    processes at a time.

    Args:
        max_procs: Upper bound on worker processes alive at once. Values
            below 1 are treated as 1.

    Returns:
        None. Failures are reported on stdout instead of being raised.
    """
    apath = '/home/remotemachine/log/'
    apattern = '"*.gz"'
    first_command = 'find {path} -name {pattern}'
    command = first_command.format(path=apath, pattern=apattern)
    batch_size = max(1, int(max_procs))
    try:
        # Bind the client before the inner try so the cleanup below can never
        # hit an unbound name.
        ssh = paramiko.SSHClient()
        try:
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(hostname, port=port, username=username,
                        password=password)
            stdin, stdout, stderr = ssh.exec_command(command)
            # read() blocks until the remote command closes its stdout, so no
            # exit-status polling (and no sleep) is needed, and the output is
            # drained before anything waits on the channel.
            listing = stdout.read()
        finally:
            # The listing connection is not needed while the workers run.
            ssh.close()

        # Skip blank lines so no worker is started for an empty path.
        filelist = [line for line in listing.splitlines() if line.strip()]

        # Each worker creates its own SSH client, so starting one process per
        # file all at once would open every connection simultaneously; run
        # them in fixed-size batches instead.
        for start in range(0, len(filelist), batch_size):
            jobs = []
            for ifle in filelist[start:start + batch_size]:
                p = multiprocessing.Process(target=analyze_the_file,
                                            args=(ifle,))
                jobs.append(p)
                p.start()
            for job in jobs:
                job.join()
    except Exception as fe:
        print("Error while getting file names " + str(fe))
# Run the analysis only when this file is executed as a script.
if __name__ == '__main__':
miniAnalyze()
上面的代码很慢:把 .gz 文件传输到本地系统会产生大量 I/O。请帮我找到更好的方法。
答案 0 :(得分:0)
在远程服务器上执行操作系统命令(如 zgrep),只在本地处理命令的输出。这样就不必把整个文件内容传输到本地计算机。例如,`zgrep -l -e "error 72" -e "error 81" /home/remotemachine/log/*.gz` 只会输出包含匹配内容的文件名。