Question

我是 Python 多进程（multiprocessing）的新手。我正在从一个包含 70,000 个 URL 的列表中提取一些特征。我有两个不同的文件。特征提取完成后，我把结果存入一个列表，然后再写入 CSV 文件。

代码运行但随后因错误而停止。我试图捕获错误，但它产生了另一个错误。

Python版本= 3.5

from feature_extractor import Feature_extraction

import pandas as pd

from pandas.core.frame import DataFrame

import sys

from multiprocessing.dummy import Pool as ThreadPool

import threading as thread

from multiprocessing import Process,Manager,Array

import time

class main():
    """Extract features for two URL lists in parallel and write dataset.csv.

    Constructing the class runs the whole pipeline: ``dostuff`` starts one
    worker process per URL file, each worker fans its URLs out over a pool
    of 8 threads, and ``read_lst`` turns the collected rows into a CSV file.

    Every collected row is stored as a ``(label, features)`` pair. Rows
    arrive in completion order, so attaching labels by position afterwards
    (as the previous version did) would mislabel them.
    """

    # Shared result list; replaced by a Manager().list() proxy in __init__.
    lst = None

    def __init__(self):
        # The manager is deliberately kept in a local variable rather than on
        # self: the instance is handed to Process(target=self.Pool, ...), so
        # only the list proxy is stored as instance state.
        manager = Manager()
        self.lst = manager.list()
        self.dostuff()
        self.read_lst()

    def feature_extraction(self, url, label=None):
        """Extract the features of one URL and append them to the shared list.

        Args:
            url: the URL handed to ``Feature_extraction``.
            label: class label stored alongside the features (``None`` when
                the caller does not supply one).
        """
        if self.lst is None:
            self.lst = []

        features = Feature_extraction(url)
        # Keep the label next to its features so ordering cannot mix them up.
        self.lst.append((label, features.get_features()))
        print(len(self.lst))

    def Pool(self, url, label=None):
        """Run ``feature_extraction`` over every item of ``url`` on 8 threads.

        Args:
            url: iterable of URLs to process.
            label: class label attached to every row produced from ``url``.
        """
        pool = ThreadPool(8)
        try:
            # ThreadPool runs the callable in threads of this process, so a
            # lambda is fine here (nothing is pickled).
            pool.map(lambda one_url: self.feature_extraction(one_url, label), url)
        finally:
            # Release the worker threads even when a URL raised.
            pool.close()
            pool.join()

    def dostuff(self):
        """Read both URL files and process each one in its own process.

        URLs from 'verified_online.csv' get label 0.0 and URLs from 'new.csv'
        get label 1.0.
        """
        df = pd.read_csv('verified_online.csv', encoding='latin-1')
        mal_urls = df['url']

        df2 = pd.read_csv('new.csv')
        ben_urls = df2['urls']

        t = Process(target=self.Pool, args=(mal_urls, 0.0))
        t2 = Process(target=self.Pool, args=(ben_urls, 1.0))
        t.start()
        t2.start()
        t.join()
        # The previous version wrote `t2.join` without parentheses, so the
        # parent never waited for the second worker and carried on while it
        # was still appending to the shared list.
        t2.join()

    def read_lst(self):
        """Write the collected rows, with their labels, to 'dataset.csv'."""
        rows = list(self.lst)

        # Assumes get_features() returns one row of 19 values in the column
        # order below -- TODO confirm against feature_extractor.
        nw_df = DataFrame([features for _, features in rows])

        nw_df.columns = ['Redirect count', 'ssl_classification', 'url_length', 'hostname_length', 'subdomain_count', 'at_sign_in_url', 'exe_extension_in_request_url', 'exe_extension_in_landing_url',
                         'ip_as_domain_name', 'no_of_slashes_in requst_url', 'no_of_slashes_in_landing_url', 'no_of_dots_in_request_url', 'no_of_dots_in_landing_url', 'tld_value', 'age_of_domain',
                         'age_of_last_modified', 'content_length', 'same_landing_and_request_ip', 'same_landing_and_request_url']
        # Labels come from the rows themselves; the previous version read
        # `df` / `df2`, which are locals of dostuff() and not visible here.
        nw_df['label'] = [label for label, _ in rows]
        nw_df.to_csv('dataset.csv', sep=',', encoding='latin-1')

 if __name__ == '__main__':
     # Everything stays inside the guard: on Windows each child process
     # re-imports this module, and unguarded code would run again there.
     # time.clock() was deprecated in Python 3.3 and removed in 3.8;
     # time.perf_counter() is the replacement for measuring elapsed time.
     start_time = time.perf_counter()
     try:
         main()
     except BrokenPipeError:
         # Only catches a BrokenPipeError raised in this (parent) process;
         # the traceback in the question was printed by a child process.
         print("broken pipe....")

     print(time.perf_counter() - start_time, "seconds")

错误追溯

Process Process-3:
Traceback (most recent call last):
  File "F:\Continuum\Anaconda3\lib\multiprocessing\connection.py", line 312, in _recv_bytes
    nread, err = ov.GetOverlappedResult(True)
BrokenPipeError: [WinError 109] The pipe has been ended

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "F:\Continuum\Anaconda3\lib\multiprocessing\process.py", line 249, in _bootstrap
    self.run()
  File "F:\Continuum\Anaconda3\lib\multiprocessing\process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "H:\Projects\newoproject\src\main.py", line 33, in Pool
    results = pool.map(self.feature_extraction, url)
  File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 260, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 608, in get
    raise self._value
  File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "F:\Continuum\Anaconda3\lib\multiprocessing\pool.py", line 44, in mapstar
    return list(map(*args))
  File "H:\Projects\newoproject\src\main.py", line 26, in feature_extraction
    self.lst.append(features.get_features())
  File "<string>", line 2, in append
  File "F:\Continuum\Anaconda3\lib\multiprocessing\managers.py", line 717, in _callmethod
    kind, result = conn.recv()
  File "F:\Continuum\Anaconda3\lib\multiprocessing\connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "F:\Continuum\Anaconda3\lib\multiprocessing\connection.py", line 321, in _recv_bytes
    raise EOFError
EOFError

Answer 1

我的回复很晚，并没有直接解决发布的问题;但希望能为遇到类似错误的其他人提供线索。

我遇到的错误：BrokenPipeError，[WinError 109] 管道已结束，以及 [WinError 232] 管道正在关闭。

在 Windows 7 上使用 Python 3.6 时观察到该错误，出现的条件是：（1）多次提交同一个异步函数，每次都使用多进程数据存储的不同实例——在我的情况下是一个队列（multiprocessing.Manager().Queue()）；并且（2）对队列的引用保存在外层函数中生命周期很短的局部变量里。

尽管与已成功创建并执行的异步函数共享的队列中有数据项，并且在发生异常时该队列仍在被使用（put() 和 get()），错误依然会出现。

当第二次使用另一个 Queue 实例调用同一个 async_func 时，错误总会发生。紧接在该函数的 apply_async() 调用之后，第一次提供给 async_func 的那个队列的连接就会断开。

当把对队列的引用保存在互不覆盖（例如放进一个队列列表）且生命周期更长的变量中（例如返回给调用栈中更高层函数的变量）时，问题就解决了。

BrokenPipeError：[WinError 109]数据提取期间管道已结束

1 个答案: