这些天我正在学习python爬虫,我写了一个简单的爬虫来获取Pixiv ID Pixiv上的图片。
它工作得很好,但是出现了一个很大的问题:当它运行时,它在我的计算机上占用了近1.2G的内存。
然而,有时它只占用10M内存,我真的不知道哪个代码会导致如此大的内存使用。
我已将脚本上传到我的VPS(仅768M内存Vulter服务器)并尝试运行。结果,我得到了一个MerroyError。
所以我想知道如何优化内存使用量(即使花费更多时间来运行)。
这是我的代码:
(我已重写所有代码以使其通过pep8
,如果仍不清楚,请告诉我哪些代码让您感到困惑。)
from lxml import etree
import re
import os
import requests
# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p0")
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Picture_Type = [".png", ".jpg", ".gif"]
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
+ Posttime+Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"\
+ str(Pixiv_ID) + "_p0"\
+ Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
os.mkdir("./pic/" + str(Pixiv_ID))
Filter_Server = re.compile("[\d]+")
Filter_Posttime = re.compile("img\/[^_]*_p")
Manga_URL = "http://www.pixiv.net/"+Tag_a_href
Manga_HTML = requests.get(Manga_URL, headers=Headers)
Manga_XML = etree.HTML(Manga_HTML.content)
Manga_Pages = Manga_XML.xpath('/html/body'
'/nav[@class="page-menu"]'
'/div[@class="page"]'
'/span[@class="total"]/text()')[0]
Posttime = Filter_Posttime.findall(Tag_img_src)[0]
Server = Filter_Server.findall(Tag_img_src)[0]
Manga_HTML.close()
Picture_Type = [".png", ".jpg", ".gif"]
for Number in range(int(Manga_Pages)):
for i in range(len(Picture_Type)):
Original_URL = "http://i" + str(Server) + \
".pixiv.net/img-original/"\
+ Posttime + str(Number) + Picture_Type[i]
Picture = requests.get(Original_URL, headers=Headers, stream=True)
if Picture.status_code == 200:
break
if Picture.status_code != 200:
return -1
Filename = "./pic/"+str(Pixiv_ID) + "/"\
+ str(Pixiv_ID) + "_p"\
+ str(Number) + Picture_Type[i]
Picture_File = open(Filename, "wb+")
for chunk in Picture.iter_content(None):
Picture_File.write(chunk)
Picture_File.close()
Picture.close()
return 200
# Main function.
def get_pic(Pixiv_ID):
Index_URL = "http://www.pixiv.net/member_illust.php?"\
"mode=medium&illust_id="+str(Pixiv_ID)
Headers = {'referer': Index_URL}
Index_HTML = requests.get(Index_URL, headers=Headers, stream=True)
if Index_HTML.status_code != 200:
return Index_HTML.status_code
Index_XML = etree.HTML(Index_HTML.content)
Tag_a_href_List = Index_XML.xpath('/html/body'
'/div[@id="wrapper"]'
'/div[@class="newindex"]'
'/div[@class="newindex-inner"]'
'/div[@class="newindex-bg-container"]'
'/div[@class="cool-work"]'
'/div[@class="cool-work-main"]'
'/div[@class="img-container"]'
'/a/@href')
Tag_img_src_List = Index_XML.xpath('/html/body'
'/div[@id="wrapper"]'
'/div[@class="newindex"]'
'/div[@class="newindex-inner"]'
'/div[@class="newindex-bg-container"]'
'/div[@class="cool-work"]'
'/div[@class="cool-work-main"]'
'/div[@class="img-container"]'
'/a/img/@src')
if Tag_a_href_List == [] or Tag_img_src_List == []:
return 404
else:
Tag_a_href = Tag_a_href_List[0]
Tag_img_src = Tag_img_src_List[0]
Index_HTML.close()
if Tag_a_href.find("manga") != -1:
return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
else:
return get_single(Pixiv_ID, Tag_img_src, Headers)
# Check whether the picture already exists.
def check_exist(Pixiv_ID):
if not os.path.isdir("Pic"):
os.mkdir("Pic")
if os.path.isdir("./Pic/"+str(Pixiv_ID)):
return True
Picture_Type = [".png", ".jpg", ".gif"]
Picture_Exist = False
for i in range(len(Picture_Type)):
Path = "./Pic/" + str(Pixiv_ID)\
+ "_p0" + Picture_Type[i]
if os.path.isfile(Path):
return True
return Picture_Exist
# The script starts here.
for i in range(0, 38849402):
Pixiv_ID = 38849402-i
Picture_Exist = check_exist(Pixiv_ID)
if not Picture_Exist:
Return_Code = get_pic(Pixiv_ID)
if Return_Code == 200:
print str(Pixiv_ID), "finish!"
elif Return_Code == -1:
print str(Pixiv_ID), "got an unknown error."
elif Return_Code == 404:
print str(Pixiv_ID), "not found. Maybe deleted."
else:
print str(Pixiv_ID), "picture exists!"
答案 0 :(得分:1)
OMG!
最后,我知道出了什么问题。
我使用mem_top()
查看占用内存的内容。
猜猜是什么?
是for i in range(0, 38849402):
在内存中,有一个列表[0,1,2,3 ... 38849401],它占用了我的记忆。
我将其更改为:
Pixiv_ID = 38849402
while Pixiv_ID > 0:
some code here
Pixiv_ID = Pixiv_ID-1
现在内存使用量不超过20M。
感到兴奋!