我有以下代码(Python 3.5):
datarALL = []
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
futh = [(executor.submit(self.getdata2, page, hed, data, apifolder,additional)) for page in pages]
for data in as_completed(futh):
datarALL = datarALL + data.result()
return datarALL
这将创建线程,执行函数并将结果组合到列表中。
当我小规模执行时,效果很好,但是当页面数很大时,脚本会输出:
Killed
当我使用htop
对其进行监视时,我发现Killed
是由于内存问题所致。
我试图将datarALL = datarALL + data.result()
转换为写入文件,以便每个完成的线程将结果存储在磁盘上,而不是存储在内存上。
这就是我所做的:
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
futh = [(executor.submit(self.getdata2, page, hed, data, apifolder, additional)) for page in
pages]
for data in as_completed(futh):
datarALL = [data.result()]
with open("test.txt", "wb") as fp:
pickle.dump(datarALL, fp)
with open("test.txt", "rb") as fp: # Unpickling
b = pickle.load(fp)
return b
但是内存没有清除,脚本仍然被杀死。
我该如何解决这个问题?
我需要此脚本来支持处理未知数量的数据。
编辑:
添加getdata2
代码:
def getdata2(self, page, hed, data, apifolder, additional):
    """Fetch one page of records from the REST API, retrying on
    chunked-transfer failures.

    Args:
        page: zero-based page index; multiplied by the configured page
            limit to compute the record offset.
        hed: HTTP headers dict forwarded to requests.get.
        data: request body forwarded to requests.get.
        apifolder: API path segment interpolated into the URL.
        additional: extra query-string suffix appended verbatim.

    Returns:
        The list under the response's "results" key, or [] when no attempt
        yields a 200 response with non-empty results.  (The original fell
        off the retry loop and implicitly returned None in that case, which
        made the caller's list concatenation raise TypeError.)

    Raises:
        ChunkedEncodingError: re-raised once the final retry fails.
    """
    tries = 10
    value_limit = self.config._page_limit  # records allowed per page
    value_offset = page * value_limit
    # URL is loop-invariant — hoisted out of the retry loop.
    url = ('http://www.mywebsite.com/{2}?WithTotal=true'
           '&cultureid=2&offset={0}&limit={1}{3}').format(
               value_offset, value_limit, apifolder, additional)
    for n in range(tries):
        try:
            print("Generate page: #{0} run #{1} with URL: {2}".format(page, n, url))
            # NOTE(review): no timeout is set, so a stalled server can hang
            # this worker forever; verify=False also disables TLS checks —
            # both worth revisiting.
            responsedata = requests.get(url, data=data, headers=hed, verify=False)
            if responsedata.status_code == 200:  # 200 for successful call
                jsondata = json.loads(responsedata.text)
                results = jsondata.get("results")
                if results:
                    print("page {} finished".format(page))
                    return results
            # non-200 or missing/empty "results": fall through and retry
        except ChunkedEncodingError as e:
            print("page #{0} run #{1} failed. Retry.".format(page, n))
            if n == tries - 1:
                print("page {0} could not be imported. Max retried reached.".format(page))
                print("Unexpected error:", sys.exc_info()[0])
                raise e
    # Every attempt returned non-200/empty without raising: return an empty
    # list so callers concatenating results never see None.
    return []
日志:
num of records to import is 21348
num of pages to import is 86
2018-08-27 09:47:42.210912 Generate page: #0 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=0&limit=249
2018-08-27 09:47:42.218939 Generate page: #1 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=249&limit=249
2018-08-27 09:47:42.227159 Generate page: #2 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=498&limit=249
2018-08-27 09:47:42.228641 Generate page: #3 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=747&limit=249
2018-08-27 09:48:03.721129 page 0 finished
2018-08-27 09:48:03.721510 Generate page: #4 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=996&limit=249
2018-08-27 09:48:19.740866 page 2 finished
2018-08-27 09:48:19.741651 Generate page: #5 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1245&limit=249
2018-08-27 09:48:23.633712 page 4 finished
2018-08-27 09:48:23.634187 Generate page: #6 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1494&limit=249
2018-08-27 09:48:43.598300 page 1 finished
2018-08-27 09:48:43.599237 Generate page: #7 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1743&limit=249
page #6 run #0 failed. Retry.
2018-08-27 09:48:43.671394 Generate page: #6 run #1 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1494&limit=249
page #5 run #0 failed. Retry.
2018-08-27 09:48:44.198029 Generate page: #5 run #1 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1245&limit=249
2018-08-27 09:48:57.072556 page 6 finished
2018-08-27 09:48:57.073005 Generate page: #8 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1992&limit=249
2018-08-27 09:49:11.236083 page 5 finished
2018-08-27 09:49:11.245397 Generate page: #9 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=2241&limit=249
2018-08-27 09:49:13.057340 page 8 finished
2018-08-27 09:49:13.057516 Generate page: #10 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=2490&limit=249
2018-08-27 09:49:33.802848 page 3 finished
2018-08-27 09:49:33.813404 Generate page: #11 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=2739&limit=249
2018-08-27 09:49:41.440440 page 10 finished
2018-08-27 09:49:41.440915 Generate page: #12 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=2988&limit=249
page #7 run #0 failed. Retry.
2018-08-27 09:49:41.500190 Generate page: #7 run #1 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=1743&limit=249
2018-08-27 09:49:50.171217 page 11 finished
2018-08-27 09:49:50.189446 Generate page: #13 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=3237&limit=249
2018-08-27 09:49:54.881509 page 12 finished
2018-08-27 09:49:54.881826 Generate page: #14 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=3486&limit=249
2018-08-27 09:50:06.699138 page 14 finished
2018-08-27 09:50:06.708714 Generate page: #15 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=3735&limit=249
2018-08-27 09:50:17.203238 page 13 finished
2018-08-27 09:50:17.203766 Generate page: #16 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=3984&limit=249
2018-08-27 09:50:18.200983 page 15 finished
2018-08-27 09:50:18.201452 Generate page: #17 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=4233&limit=249
2018-08-27 09:50:29.642942 page 7 finished
.
.
.
2018-08-27 09:55:59.088085 page 42 finished
2018-08-27 09:55:59.088767 Generate page: #44 run #0 with URL: http://www.myweb.com?WithTotal=true&cultureid=2&offset=10956&limit=249
Killed
更新的代码:
datarALL = []
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
futh = [(executor.submit(self.getdata3, page, hed, data, apifolder,additional)) for page in pages]
for data in as_completed(futh):
#datarALL = datarALL + data.result()
datarALL.append(data.result())
return datarALL
和功能:
def getdata3(self, page, hed, data, apifolder, additional):
    """Fetch one page of records from the REST API, retrying on
    chunked-transfer failures.

    Same contract as getdata2: returns the flat "results" list for the
    page, or [] when no attempt yields a 200 response with non-empty
    results.  The original wrapped the records in an extra list
    (``datarALL.append(jsondata["results"])``), so callers received
    ``[[record, ...]]`` instead of ``[record, ...]`` — inconsistent with
    getdata2 and doubly nested once the caller appended again.

    Args:
        page: zero-based page index; multiplied by the configured page
            limit to compute the record offset.
        hed: HTTP headers dict forwarded to requests.get.
        data: request body forwarded to requests.get.
        apifolder: API path segment interpolated into the URL.
        additional: extra query-string suffix appended verbatim.

    Raises:
        ChunkedEncodingError: re-raised once the final retry fails.
    """
    tries = 10
    value_limit = self.config.page_limit  # records allowed per page
    value_offset = page * value_limit
    # Host fixed from "myebsite" to "mywebsite" for consistency with
    # getdata2 — presumably a typo in the placeholder URL.
    url = ('http://www.mywebsite.com/{2}?WithTotal=true'
           '&cultureid=2&offset={0}&limit={1}{3}').format(
               value_offset, value_limit, apifolder, additional)
    for n in range(tries):
        try:
            print("{3} Generate page: #{0} run #{1} with URL: {2}".format(
                page, n, url, str(datetime.now())))
            # NOTE(review): no timeout and verify=False — see getdata2.
            responsedata = requests.get(url, data=data, headers=hed, verify=False)
            if responsedata.status_code == 200:  # 200 for successful call
                jsondata = json.loads(responsedata.text)
                results = jsondata.get("results")
                if results:
                    print("{1} page {0} finished".format(page, str(datetime.now())))
                    return results
            # non-200 or missing/empty "results": fall through and retry
        except ChunkedEncodingError as e:
            print("page #{0} run #{1} failed. Retry.".format(page, n))
            if n == tries - 1:
                print("page {0} could not be imported. Max retried reached.".format(page))
                print("Unexpected error:", sys.exc_info()[0])
                raise e
    # Every attempt returned non-200/empty without raising: return an empty
    # list so callers concatenating results never see None.
    return []