Python Multiprocessing Process混合输出

时间:2018-07-31 21:18:26

标签: python python-multiprocessing

input_file.txt包含3000行,其中每行是:

['author1'](tabspace)xxxxxx(tabspace)['url1','url2']

['author2'](tabspace)xxxxxx(tabspace)['url3','url4'] 。 .. ...

我有兴趣提取每位作者的图像,并将一位作者的图像放在他自己的单独文件夹中。我使用了以下代码,它混淆了文件夹中的图像。

import ast
import re
import json
import os
import multiprocessing as mp
from multiprocessing import Pool
from multiprocessing import Process,Lock
import random
import time
import urllib.request
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import string

file_path='/home/'
def link_status(link):
    try:
            r=requests.get(link,timeout=10)
            data=r.text
            soup=BeautifulSoup(data,"html.parser")
            if soup:
                    return soup
    except Exception as e:
            return None
def imp_images(authr,urllist):
    i=0
    final_path=file_path+str(authr)+"/"
    directory = os.path.dirname(final_path)
    if not os.path.exists(directory):
            os.makedirs(directory,exist_ok=True)
            os.chdir(directory)
    for each_link in urllist:
            status=link_status(each_link)
            if status:
                    for link in status.find_all('img'):
                            image_link=link.get('src')
                            if image_link is not None:
                                    if re.findall(r'(\bhttp|\bhttps)',image_link):
                                            try:
                                                    urllib.request.urlretrieve(image_link,authr+str(i))
                                                    i=i+1
                                            except:
                                                    pass
                                    else:
                                            try:
                                                    finallink=each_link+image_link
                                                    urllib.request.urlretrieve(finallink,authr+str(i))
                                                    i=i+1
                                            except:
                                                    pass
    os.chdir(file_path)
if __name__ == '__main__':
    q=mp.Queue()
    processes=[]
    with open('/home/input.txt') as f:
            for each in f:
                    each=each.split('\t')
                    authr=ast.literal_eval(each[0])
                    urls=ast.literal_eval(each[2])
                    p=mp.Process(target=imp_images,args=(authr[0].replace(" ","_"),urls))
                    processes.append(p)
            for proc in processes:
                    proc.start()
            for proc in processes:
                    proc.join()

我尝试使用Lock()并将代码更改为:

def imp_images(l,authr,urllist):
    l.acquire()
    i=0
    final_path=file_path+str(authr)+"/"
    directory = os.path.dirname(final_path)
    if not os.path.exists(directory):
        os.makedirs(directory,exist_ok=True)
        os.chdir(directory)

    for each_link in urllist:
        status=link_status(each_link)
        if status:
                for link in status.find_all('img'):
                        image_link=link.get('src')
                        if image_link is not None:
                                if re.findall(r'(\bhttp|\bhttps)',image_link):
                                        try:
                                                urllib.request.urlretrieve(image_link,authr+str(i))
                                                i=i+1
                                        except:
                                                pass
                                else:
                                        try:
                                                finallink=each_link+image_link
                                                urllib.request.urlretrieve(finallink,authr+str(i))
                                                i=i+1
                                        except:
                                                pass
    os.chdir(file_path)
    l.release()

主要针对:

if __name__ == '__main__':
    lock=Lock()
    q=mp.Queue()
    processes=[]
    with open('/home/input.txt') as f:
        for each in f:
            each=each.split('\t')
            authr=ast.literal_eval(each[0])
            urls=ast.literal_eval(each[2])
            p=mp.Process(target=imp_images,args=(lock,authr[0].replace(" ","_"),urls))
            processes.append(p)
        for proc in processes:
                proc.start()
        for proc in processes:
                proc.join()

在这种情况下,我收到“无法分配内存”错误。如何解决此错误?

我还尝试使用umlimit -n 4096,该代码在没有锁定的情况下工作,并且在其他情况下无济于事(我使用具有8gb RAM的Google云实例)

0 个答案:

没有答案