我用 Python 3 编写了一个简单的脚本,用于枚举某个 POST 请求的所有可能输入。我遇到的问题是:所有线程创建完成后,内存占用不断增长,最终因为内存不足,进程被系统终止。我使用 Pympler 检查了 myThread 类,结果显示 myThread 所有实例的内存占用并没有快速增长。我不知道是什么导致了这种内存泄漏。
import os
import threading
import time

import requests
class myThread(threading.Thread):
    """Thread that runs ``get_range`` over the half-open range ``[st, ed)``.

    Attributes are plain instance fields:
        threadID: Caller-supplied identifier, stored as given.
        name: Thread name, also used in the start/exit messages.
        st: First value passed to ``get_range``.
        ed: End value (exclusive) passed to ``get_range``.
    """

    def __init__(self, threadID, name, st, ed):
        super().__init__()
        self.threadID, self.name = threadID, name
        self.st, self.ed = st, ed

    def run(self):
        """Print a start message, process the range, then print an exit message."""
        print("Starting {}".format(self.name))
        get_range(self.st, self.ed)
        print("Exiting {}".format(self.name))
def get_by_id(n):
    """POST ``n`` as the ``id`` form field and save the page if one is found.

    Args:
        n: Identifier to query, as a string. It is also used as the file
            name under ``./pages``.

    Returns:
        ``-2`` if the request raised or the status code was not 200,
        ``-1`` if the response text contains ``"Cannot find"``,
        ``1`` after the response text has been written to ``./pages/<n>``.
    """
    payload = {"id": n}
    url = "http://www.example.com"  # This is for example
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
    }
    try:
        r = requests.post(url, data=payload, headers=headers)
    except Exception:
        # Best effort: any failure while sending the request is reported
        # to the caller as -2 instead of terminating the worker thread.
        return -2
    # Compare by value. ``is not 200`` tests object identity, which only
    # happens to work through CPython's small-integer cache.
    if r.status_code != 200:
        return -2
    if "Cannot find" in r.text:
        return -1
    with open(os.path.join("./pages", n), 'w') as f:
        f.write(r.text)
    return 1
def get_range(a, b):
    """Call ``get_by_id`` once for every integer in ``range(a, b)``.

    Each integer is converted to a string before being passed on; the
    return values of ``get_by_id`` are discarded.
    """
    for candidate in map(str, range(a, b)):
        get_by_id(candidate)
if __name__ == "__main__":
    # Split the id space into 20 consecutive slices of 4000 ids each and
    # hand every slice to its own worker thread.
    first_id = 800000000000
    ids_per_thread = 4000
    workers = []
    for index in range(20):
        lower = first_id + index * ids_per_thread
        worker = myThread(
            index,
            "Thread-{}".format(index),
            lower,
            lower + ids_per_thread,
        )
        workers.append(worker)
        worker.start()
        # NOTE(review): indentation was lost in the source; the pause is
        # assumed to belong to the start-up loop (staggered starts).
        time.sleep(0.3)
    # Wait for every worker before announcing completion.
    for worker in workers:
        worker.join()
    print("Exiting Main")
以下是删除可能导致内存问题的所有文件操作后的代码。
import requests
import threading
import time
class myThread(threading.Thread):
    """Thread that runs ``get_range`` over the half-open range ``[st, ed)``.

    Attributes are plain instance fields:
        threadID: Caller-supplied identifier, stored as given.
        name: Thread name, also used in the start/exit messages.
        st: First value passed to ``get_range``.
        ed: End value (exclusive) passed to ``get_range``.
    """

    def __init__(self, threadID, name, st, ed):
        super().__init__()
        self.threadID, self.name = threadID, name
        self.st, self.ed = st, ed

    def run(self):
        """Print a start message, process the range, then print an exit message."""
        print("Starting {}".format(self.name))
        get_range(self.st, self.ed)
        print("Exiting {}".format(self.name))
def get_by_id(n):
    """POST ``n`` as the ``id`` form field and classify the response.

    Args:
        n: Identifier to query, as a string.

    Returns:
        ``-2`` if the request raised or the status code was not 200,
        ``-1`` if the response text contains ``"Cannot find"``,
        ``1`` otherwise.
    """
    payload = {"id": n}
    url = "http://www.example.com"  # This is for example
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
    }
    try:
        r = requests.post(url, data=payload, headers=headers)
    except Exception:
        # Best effort: any failure while sending the request is reported
        # to the caller as -2 instead of terminating the worker thread.
        return -2
    # Compare by value. ``is not 200`` tests object identity, which only
    # happens to work through CPython's small-integer cache.
    if r.status_code != 200:
        return -2
    if "Cannot find" in r.text:
        return -1
    return 1
def get_range(a, b):
    """Call ``get_by_id`` once for every integer in ``range(a, b)``.

    Each integer is converted to a string before being passed on; the
    return values of ``get_by_id`` are discarded.
    """
    for candidate in map(str, range(a, b)):
        get_by_id(candidate)
if __name__ == "__main__":
    # Split the id space into 20 consecutive slices of 4000 ids each and
    # hand every slice to its own worker thread.
    first_id = 800000000000
    ids_per_thread = 4000
    workers = []
    for index in range(20):
        lower = first_id + index * ids_per_thread
        worker = myThread(
            index,
            "Thread-{}".format(index),
            lower,
            lower + ids_per_thread,
        )
        workers.append(worker)
        worker.start()
        # NOTE(review): indentation was lost in the source; the pause is
        # assumed to belong to the start-up loop (staggered starts).
        time.sleep(0.3)
    # Wait for every worker before announcing completion.
    for worker in workers:
        worker.join()
    print("Exiting Main")
答案 0 :(得分:0)
我相信问题就在这里
else:
with open(os.path.join("./pages", n), 'w') as f:
f.write(r.text)
return 1
您打开的文件越来越大,而打开文件会把它载入内存(RAM),因此占用的内存也越来越多。
仅供参考:泄漏的内存不会显示为已分配给程序;内存泄漏是指程序没有释放内存,而只是让它超出作用域,从而导致这部分内存无法再被使用。因此,我认为这不是内存泄漏。
更新:我决定亲自测试这些程序。它们都没有出现指数级的内存增长;内存占用在准备写入时确实会上升,但写入完成后又会回落。我能给出的唯一建议是把 requests 模块更新到最新版本;如果仍然无效,请更新 Python 本身。
答案 1 :(得分:0)
问题在于,要写入文件,open 必须在操作系统中创建一个文件句柄。每次调用 open 时,您都会创建一个新的文件句柄。更好的做法是只打开一次文件句柄,然后把它作为参数传给 get_by_id,这样每个线程就只有一个文件句柄。
或者,您也可以调用 file.close() 来释放操作系统资源。文件对象超出作用域后,垃圾回收最终可能也会完成这一步,但在这种情况下依赖 GC 是非常糟糕的做法。无论如何,在循环中创建不必要的对象都不是好习惯。所以可以像下面这样做:
import requests
import threading
import time
class myThread(threading.Thread):
    """Thread that runs ``get_range`` over the half-open range ``[st, ed)``.

    Attributes are plain instance fields:
        threadID: Caller-supplied identifier, stored as given.
        name: Thread name, also used in the start/exit messages.
        st: First value passed to ``get_range``.
        ed: End value (exclusive) passed to ``get_range``.
    """

    def __init__(self, threadID, name, st, ed):
        super().__init__()
        self.threadID, self.name = threadID, name
        self.st, self.ed = st, ed

    def run(self):
        """Print a start message, process the range, then print an exit message."""
        print("Starting {}".format(self.name))
        get_range(self.st, self.ed)
        print("Exiting {}".format(self.name))
def get_by_id(n, f):
    """POST ``n`` as the ``id`` form field and write a found page to ``f``.

    Args:
        n: Identifier to query, as a string.
        f: Already-open, writable text file object; the response text is
            written to it when the page is found.

    Returns:
        ``-2`` if the request raised or the status code was not 200,
        ``-1`` if the response text contains ``"Cannot find"``,
        ``1`` after the response text has been written to ``f``.
    """
    payload = {"id": n}
    url = "http://www.example.com"  # This is for example
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        'Accept-Encoding': "gzip, deflate",
    }
    try:
        r = requests.post(url, data=payload, headers=headers)
    except Exception:
        # Best effort: any failure while sending the request is reported
        # to the caller as -2 instead of terminating the worker thread.
        return -2
    # Compare by value. ``is not 200`` tests object identity, which only
    # happens to work through CPython's small-integer cache.
    if r.status_code != 200:
        return -2
    if "Cannot find" in r.text:
        return -1
    f.write(r.text)
    return 1
def get_range(a, b):
    """Query every id in ``range(a, b)`` and collect found pages in one file.

    A single output file, ``./pages/<a>-<b>``, is opened once and passed
    to ``get_by_id`` for every id, so each call of this function holds
    exactly one file handle.

    Args:
        a: First id (inclusive).
        b: End id (exclusive).
    """
    # The original referenced an undefined name ``n`` for the file name;
    # derive a per-range name from the bounds instead.
    os.makedirs("./pages", exist_ok=True)
    out_path = os.path.join("./pages", "{}-{}".format(a, b))
    # The ``with`` block closes the file on exit, so no explicit close()
    # call is needed.
    with open(out_path, 'w') as f:
        for i in range(a, b):
            get_by_id(str(i), f)
if __name__ == "__main__":
    # Split the id space into 20 consecutive slices of 4000 ids each and
    # hand every slice to its own worker thread.
    first_id = 800000000000
    ids_per_thread = 4000
    workers = []
    for index in range(20):
        lower = first_id + index * ids_per_thread
        worker = myThread(
            index,
            "Thread-{}".format(index),
            lower,
            lower + ids_per_thread,
        )
        workers.append(worker)
        worker.start()
        # NOTE(review): indentation was lost in the source; the pause is
        # assumed to belong to the start-up loop (staggered starts).
        time.sleep(0.3)
    # Wait for every worker before announcing completion.
    for worker in workers:
        worker.join()
    print("Exiting Main")