如何优化我的python爬虫的内存使用情况

时间:2017-03-08 02:40:22

标签: python python-2.7 web-crawler

这些天我正在学习python爬虫,我写了一个简单的爬虫,通过Pixiv ID来获取Pixiv上对应作品的图片。

它工作得很好,但是出现了一个很大的问题:当它运行时,它在我的计算机上占用了近1.2G的内存。

然而,有时它只占用10M内存,我真的不知道哪个代码会导致如此大的内存使用。

我已将脚本上传到我的VPS(仅768M内存的Vultr服务器)并尝试运行。结果,我得到了一个MemoryError。

所以我想知道如何优化内存使用量(即使花费更多时间来运行)。

这是我的代码:

(我已重写所有代码以使其通过pep8,如果仍不清楚,请告诉我哪些代码让您感到困惑。)

from lxml import etree
import re
import os
import requests


# Get a single Picture.
def get_single(Pixiv_ID, Tag_img_src, Headers):
    """Download the single (non-manga) original image for Pixiv_ID.

    Tries .png/.jpg/.gif in turn and writes the first hit to
    ./pic/<Pixiv_ID>_p0.<ext>.  Returns 200 on success, -1 if no
    original image could be fetched.
    """
    # Raw strings silence the invalid-escape warnings for "\d" / "\/".
    Filter_Server = re.compile(r"[\d]+")
    Filter_Posttime = re.compile(r"img\/[^_]*_p0")
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Picture_Type = [".png", ".jpg", ".gif"]
    for Extension in Picture_Type:
        Original_URL = "http://i" + str(Server) + ".pixiv.net/img-original/"\
                       + Posttime + Extension
        Picture = requests.get(Original_URL, headers=Headers, stream=True)
        if Picture.status_code == 200:
            break
        # With stream=True an unclosed response keeps its connection and
        # buffered data alive - close every failed attempt explicitly.
        Picture.close()
    else:
        # Loop ended without break: no extension produced a 200.
        return -1
    Filename = "./pic/"\
               + str(Pixiv_ID) + "_p0"\
               + Extension
    try:
        # "with" guarantees the file is closed even if a chunk read fails.
        with open(Filename, "wb") as Picture_File:
            # Fixed-size chunks keep peak memory flat; iter_content(None)
            # may buffer the whole image in one piece.
            for chunk in Picture.iter_content(8192):
                Picture_File.write(chunk)
    finally:
        Picture.close()
    return 200


# Get manga which is a bundle of pictures.
def get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers):
    """Download every page of a manga work into ./pic/<Pixiv_ID>/.

    Reads the page count from the manga viewer page, then fetches each
    page, trying .png/.jpg/.gif per page.  Returns 200 on success, -1
    if any page could not be fetched.
    """
    Directory = "./pic/" + str(Pixiv_ID)
    # Tolerate a directory left behind by an interrupted earlier run
    # instead of crashing on os.mkdir.
    if not os.path.isdir(Directory):
        os.mkdir(Directory)
    Filter_Server = re.compile(r"[\d]+")
    Filter_Posttime = re.compile(r"img\/[^_]*_p")
    Manga_URL = "http://www.pixiv.net/" + Tag_a_href
    Manga_HTML = requests.get(Manga_URL, headers=Headers)
    try:
        Manga_XML = etree.HTML(Manga_HTML.content)
        Manga_Pages = Manga_XML.xpath('/html/body'
                                      '/nav[@class="page-menu"]'
                                      '/div[@class="page"]'
                                      '/span[@class="total"]/text()')[0]
    finally:
        # Close even if the XPath lookup raises.
        Manga_HTML.close()
    Posttime = Filter_Posttime.findall(Tag_img_src)[0]
    Server = Filter_Server.findall(Tag_img_src)[0]
    Picture_Type = [".png", ".jpg", ".gif"]
    for Number in range(int(Manga_Pages)):
        for Extension in Picture_Type:
            Original_URL = "http://i" + str(Server) + \
                           ".pixiv.net/img-original/"\
                           + Posttime + str(Number) + Extension
            Picture = requests.get(Original_URL, headers=Headers, stream=True)
            if Picture.status_code == 200:
                break
            # Streamed responses must be closed or the connection leaks.
            Picture.close()
        else:
            # No extension worked for this page.
            return -1
        Filename = Directory + "/"\
                   + str(Pixiv_ID) + "_p"\
                   + str(Number) + Extension
        try:
            with open(Filename, "wb") as Picture_File:
                # Stream in bounded chunks so a page is never fully
                # buffered in memory.
                for chunk in Picture.iter_content(8192):
                    Picture_File.write(chunk)
        finally:
            Picture.close()
    return 200


# Main function.
def get_pic(Pixiv_ID):
    """Fetch the work page for Pixiv_ID and dispatch to the downloader.

    Returns 200 on success, the HTTP status code when the index page is
    not reachable, 404 when the expected tags are missing, or -1 when
    the actual download fails.
    """
    Index_URL = "http://www.pixiv.net/member_illust.php?"\
                "mode=medium&illust_id=" + str(Pixiv_ID)
    Headers = {'referer': Index_URL}
    # No stream=True: .content is read in full anyway, and the response
    # is now closed on every exit path (the original leaked it on the
    # non-200 and 404 returns).
    Index_HTML = requests.get(Index_URL, headers=Headers)
    try:
        if Index_HTML.status_code != 200:
            return Index_HTML.status_code
        Index_XML = etree.HTML(Index_HTML.content)
    finally:
        Index_HTML.close()
    # Both lookups share the same container path; build it once.
    Container = ('/html/body'
                 '/div[@id="wrapper"]'
                 '/div[@class="newindex"]'
                 '/div[@class="newindex-inner"]'
                 '/div[@class="newindex-bg-container"]'
                 '/div[@class="cool-work"]'
                 '/div[@class="cool-work-main"]'
                 '/div[@class="img-container"]')
    Tag_a_href_List = Index_XML.xpath(Container + '/a/@href')
    Tag_img_src_List = Index_XML.xpath(Container + '/a/img/@src')
    if not Tag_a_href_List or not Tag_img_src_List:
        return 404
    Tag_a_href = Tag_a_href_List[0]
    Tag_img_src = Tag_img_src_List[0]
    # A link containing "manga" points at the multi-page viewer.
    if "manga" in Tag_a_href:
        return get_manga(Pixiv_ID, Tag_a_href, Tag_img_src, Headers)
    return get_single(Pixiv_ID, Tag_img_src, Headers)


# Check whether the picture already exists.
def check_exist(Pixiv_ID):
    """Return True if work Pixiv_ID was already downloaded.

    Also creates the base download directory on first use.  BUGFIX: the
    original checked/created "Pic" while the downloaders write "./pic/",
    so on a case-sensitive filesystem (the Linux VPS) the check never
    matched and the download directory was never created.
    """
    if not os.path.isdir("pic"):
        os.mkdir("pic")
    # A manga work is stored as a directory named after its ID.
    if os.path.isdir("./pic/" + str(Pixiv_ID)):
        return True
    # A single work is stored as <ID>_p0.<ext> for one of three types.
    for Extension in (".png", ".jpg", ".gif"):
        if os.path.isfile("./pic/" + str(Pixiv_ID) + "_p0" + Extension):
            return True
    return False


# The script starts here.
for i in range(0, 38849402):
    Pixiv_ID = 38849402-i
    Picture_Exist = check_exist(Pixiv_ID)
    if not Picture_Exist:
        Return_Code = get_pic(Pixiv_ID)
        if Return_Code == 200:
            print str(Pixiv_ID), "finish!"
        elif Return_Code == -1:
            print str(Pixiv_ID), "got an unknown error."
        elif Return_Code == 404:
            print str(Pixiv_ID), "not found. Maybe deleted."
    else:
        print str(Pixiv_ID), "picture exists!"

1 个答案:

答案 0 :(得分:1)

OMG!

最后,我知道出了什么问题。

我使用mem_top()查看占用内存的内容。

猜猜是什么?

for i in range(0, 38849402):

在内存中,有一个列表[0, 1, 2, 3 ... 38849401],它占用了我的内存。在Python 2中,range()会在循环开始前一次性生成整个列表;如果想惰性生成,应该使用xrange(),或者像下面这样手动计数。

我将其更改为:

Pixiv_ID = 38849402
while Pixiv_ID > 0:

    some code here

    Pixiv_ID = Pixiv_ID-1

现在内存使用量不超过20M。

感到兴奋!