Boost Asio async_read sometimes hangs while reading, but not always

Time: 2014-03-25 17:38:50

Tags: c++ boost boost-asio

I am implementing a small distributed system of N machines. Each of them receives some data from a remote server and then propagates the data to the other N-1 machines. I am using Boost Asio async_read and async_write to implement it. I set up a test cluster of N = 30 machines. With smaller data sets (75KB to 750KB received per machine) the program always worked. But when I moved to a slightly larger data set (7.5MB), I observed strange behavior: at the beginning, reads and writes happened as expected, but after a while some machines hung while the others finished, and the number of hanging machines varied from run to run.

I tried printing messages in every handler and found that, on the machines that hung, async_read basically stopped completing successfully after a while, so nothing could proceed afterwards. I checked the remote servers, and they had all finished writing. I have tried using a strand to control the order of execution of the async reads and writes, and I have also tried using different io_services for reading and writing. Neither solved the problem. I am pretty desperate. Can anyone help me?
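
(For reference, by "using a strand" I mean wrapping the completion handlers so they can never run concurrently. A minimal sketch of that pattern, using the names from the code below rather than my exact code:)

boost::asio::io_service::strand strand(*receiver_io_service);

boost::asio::async_read(*dbsocket,
    boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
    Asio_Trans_Broadcaster::completion_condition,
    strand.wrap(boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
        boost::asio::placeholders::error,
        boost::asio::placeholders::bytes_transferred)));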

Below is the code of the class that does the reading and the propagating:

const int TRANS_TUPLE_SIZE=15; // each tuple is 15 bytes and starts with a 4-byte id
const int TRANS_BUFFER_SIZE=5120/TRANS_TUPLE_SIZE*TRANS_TUPLE_SIZE; // largest multiple of 15 that fits in 5120
class Asio_Trans_Broadcaster
{
private:
   char buffer[TRANS_BUFFER_SIZE];
   int node_id;
   int mpi_size;
   int mpi_rank;
   boost::asio::ip::tcp::socket* dbsocket;
   boost::asio::ip::tcp::socket** sender_sockets;
   int n_send;
   boost::mutex mutex;
   bool done;
public:
   Asio_Trans_Broadcaster(boost::asio::ip::tcp::socket* dbskt, boost::asio::ip::tcp::socket** senderskts,
        int msize, int mrank, int id)
{
    dbsocket=dbskt;
    count=0;
    node_id=id;
    mpi_size=mpi_rank=-1;
    sender_sockets=senderskts;
    mpi_size=msize;
    mpi_rank=mrank;
    n_send=-1;
    done=false;
}

static std::size_t completion_condition(const boost::system::error_code& error, std::size_t bytes_transferred)
{
    // async_read calls this after every underlying read: returning 0 completes
    // the operation, any other value is the maximum number of bytes still wanted.
    int remain=bytes_transferred%TRANS_TUPLE_SIZE;
    if(remain==0 && bytes_transferred>0)
        return 0;
    else
        return TRANS_BUFFER_SIZE-bytes_transferred;
}


void write_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
{
    int n=-1;
    mutex.lock();
    n_send--;
    n=n_send;
    mutex.unlock();
    fprintf(stdout, "~~~~~~ @%d, write_handler: %d bytes, copies_to_send: %d\n",
                                    node_id, bytes_transferred, n);
    if(n==0 && !done)
        boost::asio::async_read(*dbsocket,
            boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
            Asio_Trans_Broadcaster::completion_condition, boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
            boost::asio::placeholders::error,
            boost::asio::placeholders::bytes_transferred));
}

void broadcast_handler(const boost::system::error_code &ec, std::size_t bytes_transferred)
{
    fprintf(stdout, "@%d, broadcast_handler: %d bytes, mpi_size:%d, mpi_rank: %d\n", node_id, bytes_transferred, mpi_size, mpi_rank);
    if (!ec)
    {
        int pos=0;
        while(pos<bytes_transferred && pos<TRANS_BUFFER_SIZE)
        {
            int id=-1;
            memcpy(&id, &buffer[pos], 4);
            if(id<0)
            {
                done=true;
                fprintf(stdout, "@%d, broadcast_handler: done!\n", mpi_rank);
                break;
            }

            pos+=TRANS_TUPLE_SIZE;
        }

        mutex.lock();
        n_send=mpi_size-1;
        mutex.unlock();
        for(int i=0; i<mpi_size; i++)
            if(i!=mpi_rank)
            {
                boost::asio::async_write(*sender_sockets[i], boost::asio::buffer(buffer, bytes_transferred),
                                boost::bind(&Asio_Trans_Broadcaster::write_handler, this,
                                boost::asio::placeholders::error,
                                boost::asio::placeholders::bytes_transferred));
            }
    }
    else
    {
        cerr<<mpi_rank<<" error: "<<ec.message()<<endl;
        delete this;
    }


}

void broadcast()
{
    boost::asio::async_read(*dbsocket,
            boost::asio::buffer(buffer, TRANS_BUFFER_SIZE),
            Asio_Trans_Broadcaster::completion_condition, boost::bind(&Asio_Trans_Broadcaster::broadcast_handler, this,
            boost::asio::placeholders::error,
            boost::asio::placeholders::bytes_transferred));
}
};

And here is the main code that runs on each machine:

int N=30;
boost::asio::io_service* sender_io_service=new boost::asio::io_service();
boost::asio::io_service::work* p_work=new boost::asio::io_service::work(*sender_io_service);
boost::thread_group send_thread_pool;
for(int i=0; i<NUM_THREADS; i++)
{
    send_thread_pool.create_thread( boost::bind( & boost::asio::io_service::run, sender_io_service ) );
}

boost::asio::io_service* receiver_io_service=new boost::asio::io_service();
shared_ptr<boost::asio::io_service::work> p_work2(new boost::asio::io_service::work(*receiver_io_service));
boost::thread_group thread_pool2;
thread_pool2.create_thread( boost::bind( & boost::asio::io_service::run, receiver_io_service) );

boost::asio::ip::tcp::socket* receiver_socket;
    //establish nonblocking connection with remote server
AsioConnectToRemote(5000, 1, receiver_io_service, receiver_socket, true);

boost::asio::ip::tcp::socket* send_sockets[N];
    //establish blocking connection with other machines
hadoopNodes = SetupAsioConnectionsWIthOthers(sender_io_service, send_sockets, hostFileName, mpi_rank, mpi_size, 3000, false);

Asio_Trans_Broadcaster* db_receiver=new Asio_Trans_Broadcaster(receiver_socket, send_sockets,
        mpi_size, mpi_rank, mpi_rank);

db_receiver->broadcast();
p_work2.reset();
thread_pool2.join_all();
delete p_work;
send_thread_pool.join_all();

1 answer:

Answer 0 (score: 2)

I don't know what your code is trying to achieve. There are too many missing bits.

Of course, if the task is to asynchronously send/receive traffic on network sockets, Asio is just the thing for that. It's hard to see what would be special about your code.

I'd suggest cleaning up the more obvious problems:

  • There is (almost) no error handling (check your error_code-s!)
  • Unless you are on a funny platform, your format strings should use %lu for size_t
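
    For instance, the fprintf in write_handler would become (note that on C99-conforming platforms, %zu is the fully portable conversion for size_t):

        fprintf(stdout, "~~~~~~ @%d, write_handler: %lu bytes, copies_to_send: %d\n",
                node_id, bytes_transferred, n);
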
  • Why mess with raw arrays, of possibly bad size, when you can have a vector?
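
    For example, along the lines of what the full listing at the end uses:

        boost::array<char, TRANS_BUFFER_SIZE> trans_buffer;              // the size travels with the type
        boost::ptr_vector<boost::asio::ip::tcp::socket> sender_sockets; // owns the sockets and knows their count
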
  • Never assume the size of objects when you can use sizeof:

    memcpy(&id, &trans_buffer[pos], sizeof(id));
    
  • Come to think of it, the indexing of the buffer looks unsafe anyway:

        while(pos < bytes_transferred && pos < TRANS_BUFFER_SIZE)
        {
            int id = -1;
            memcpy(&id, &buffer[pos], sizeof(id));
    

    If e.g. pos == TRANS_BUFFER_SIZE-1, the memcpy here invokes Undefined Behaviour...
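
    One way to make the loop safe (a sketch; the guard is mine, not from the original post) is to only read an id when all of its bytes lie within the region that was actually filled:

        while(pos + sizeof(int) <= bytes_transferred)
        {
            int id = -1;
            memcpy(&id, &buffer[pos], sizeof(id)); // a whole id is guaranteed to be in range
            if(id < 0) { done = true; break; }
            pos += TRANS_TUPLE_SIZE;
        }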

  • Why so much new? You're inviting a whole class of bugs into your code. As if memory management weren't the Achilles' heel of low-level coding. Use values, or shared pointers. Never delete this. Ever[1]
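
    The usual Asio alternative (a sketch, not taken from the listing below): derive from boost::enable_shared_from_this and bind shared_from_this() into each handler, so the object is destroyed exactly when its last outstanding operation completes:

        class Broadcaster : public boost::enable_shared_from_this<Broadcaster>
        {
            // ...
            void broadcast()
            {
                boost::asio::async_read(dbsocket, boost::asio::buffer(trans_buffer),
                    &Broadcaster::completion_condition,
                    boost::bind(&Broadcaster::broadcast_handler,
                        shared_from_this(), // the bound shared_ptr keeps *this alive
                        boost::asio::placeholders::error,
                        boost::asio::placeholders::bytes_transferred));
            }
        };

        // creation: boost::make_shared<Broadcaster>(/*...*/)->broadcast();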

  • Why so much repeated code? Why is one thread pool named after sender and the other thread_pool2? Which contains 1 thread. Huh? Why is one work item a raw pointer and the other a shared_ptr?

    You could just have:

    struct service_wrap {
        service_wrap(int threads) : work(boost::asio::io_service::work(io_service)) { // keep run() busy until stop()
            while(threads--)
                pool.create_thread(boost::bind(&boost::asio::io_service::run, boost::ref(io_service)));
        }
    
        ~service_wrap() {
            io_service.post(boost::bind(&service_wrap::stop, this));
            pool.join_all();
        }
    
    private: // mind the initialization order!
        boost::asio::io_service io_service;
        boost::optional<boost::asio::io_service::work> work;
        boost::thread_group pool;
    
        void stop() { 
            work = boost::none;
        }
    };
    
    

    所以你可以写一下:

    service_wrap senders(NUM_THREADS);
    service_wrap receivers(1);
    

    Wow. Did you see that? No more chance for error. If you fix one pool, you automatically fix the other. No more delete-ing the first and .reset()-ing the second work item. In short: no more messy code, and less complexity.

  • Use exception-safe locking guards:

    int local_n_send = -1; // not clear naming
    {
        boost::lock_guard<boost::mutex> lk(mutex);
        n_send--;
        local_n_send = n_send;
    }
    
  • The body of broadcast() is fully duplicated in write_handler(). Why not just call it:

    if(local_n_send == 0 && !done)
        broadcast();
    
  • I think there is still a race condition: not a data race on the access to n_send itself, but the decision to re-broadcast can be wrong if n_send reaches zero after the lock has been released. Now, since broadcast() only starts an async operation, you can simply do it under the lock and rid yourself of the race condition:

        void write_handler(const error_code &ec, size_t bytes_transferred)
        {
            boost::lock_guard<boost::mutex> lk(mutex);

            if(!(done || --n_send))
                broadcast();
        }

Here is the starting point I made while going through the code (Compiling on Coliru):

    #include <boost/asio.hpp>
    #include <boost/thread.hpp>
    #include <boost/array.hpp>
    #include <boost/make_shared.hpp>
    #include <boost/ptr_container/ptr_vector.hpp>
    #include <iostream>
    
    const/*expr*/ int TRANS_TUPLE_SIZE  = 15;
    const/*expr*/ int TRANS_BUFFER_SIZE = 5120 / TRANS_TUPLE_SIZE * TRANS_TUPLE_SIZE;
    
    namespace AsioTrans
    {
        using boost::system::error_code;
        using namespace boost::asio;
    
        typedef ip::tcp::socket             socket_t;
        typedef boost::ptr_vector<socket_t> socket_list;
    
        class Broadcaster
        {
        private:
            boost::array<char, TRANS_BUFFER_SIZE> trans_buffer;
    
            int node_id;
            int mpi_rank;
    
            socket_t&    dbsocket;
            socket_list& sender_sockets;
    
            int n_send;
            boost::mutex mutex;
            bool done;
        public:
            Broadcaster(
                socket_t& dbskt,
                socket_list& senderskts,
                int mrank,
                int id) : 
                    node_id(id),
                    mpi_rank(mrank),
                    dbsocket(dbskt),
                    sender_sockets(senderskts),
                    n_send(-1),
                    done(false)
            {
                // count=0;
            }
    
            static size_t completion_condition(const error_code& error, size_t bytes_transferred)
            {
            // TODO FIXME handle error_code here
                int remain = bytes_transferred % TRANS_TUPLE_SIZE;
    
                if(bytes_transferred && !remain)
                {
                    return 0;
                }
                else
                {
                    return TRANS_BUFFER_SIZE - bytes_transferred;
                }
            }
    
            void write_handler(const error_code &ec, size_t bytes_transferred)
            {
                // TODO handle errors
                // TODO check bytes_transferred
                boost::lock_guard<boost::mutex> lk(mutex);
    
                if(!(done || --n_send))
                    broadcast();
            }
    
            void broadcast_handler(const error_code &ec, size_t bytes_transferred)
            {
                fprintf(stdout, "@%d, broadcast_handler: %lu bytes, mpi_size:%lu, mpi_rank: %d\n", node_id, bytes_transferred, sender_sockets.size(), mpi_rank);
    
                if(!ec)
                {
                    for(size_t pos = 0; (pos < bytes_transferred && pos < TRANS_BUFFER_SIZE); pos += TRANS_TUPLE_SIZE)
                    {
                        int id = -1;
                        memcpy(&id, &trans_buffer[pos], sizeof(id));
    
                        if(id < 0)
                        {
                            done = true;
                            fprintf(stdout, "@%d, broadcast_handler: done!\n", mpi_rank);
                            break;
                        }
                    }
    
                    {
                        boost::lock_guard<boost::mutex> lk(mutex);
                        n_send = sender_sockets.size() - 1;
                    }
    
                    for(int i = 0; size_t(i) < sender_sockets.size(); i++)
                    {
                        if(i != mpi_rank)
                        {
                            async_write(
                                    sender_sockets[i], 
                                    buffer(trans_buffer, bytes_transferred),
                                    boost::bind(&Broadcaster::write_handler, this, placeholders::error, placeholders::bytes_transferred));
                        }
                    }
                }
                else
                {
                    std::cerr << mpi_rank << " error: " << ec.message() << std::endl;
                    delete this;
                }
            }
    
            void broadcast()
            {
                async_read(
                        dbsocket,
                        buffer(trans_buffer),
                        Broadcaster::completion_condition, 
                        boost::bind(&Broadcaster::broadcast_handler, this,
                            placeholders::error,
                            placeholders::bytes_transferred));
            }
        };
    
        struct service_wrap {
            service_wrap(int threads) : _work(io_service::work(_service)) { // keep run() busy until stop()
                while(threads--)
                    _pool.create_thread(boost::bind(&io_service::run, boost::ref(_service)));
            }
    
            ~service_wrap() {
                _service.post(boost::bind(&service_wrap::stop, this));
                _pool.join_all();
            }
    
            io_service& service() { return _service; }
    
        private: // mind the initialization order!
            io_service                        _service;
            boost::optional<io_service::work> _work;
            boost::thread_group               _pool;
    
            void stop() { 
                _work = boost::none;
            }
        };
    
        extern void AsioConnectToRemote(int, int, io_service&, socket_t&, bool);
        extern void SetupAsioConnectionsWIthOthers(io_service&, socket_list&, std::string, int, bool);
    }
    
    int main()
    {
        using namespace AsioTrans;
    
        // there's no use in increasing #threads unless there are blocking operations
        service_wrap senders(boost::thread::hardware_concurrency()); 
        service_wrap receivers(1);
    
        socket_t receiver_socket(receivers.service());
        AsioConnectToRemote(5000, 1, receivers.service(), receiver_socket, true);
    
        socket_list send_sockets(30);
        /*hadoopNodes =*/ SetupAsioConnectionsWIthOthers(senders.service(), send_sockets, "hostFileName", 3000, false);
    
        int mpi_rank = send_sockets.size();
        AsioTrans::Broadcaster db_receiver(receiver_socket, send_sockets, mpi_rank, mpi_rank);
        db_receiver.broadcast();
    }
    

Woop woop. That's three lines of code now. Less code is fewer bugs.

My guess is that if you scrub the code this diligently, you will inevitably find your clues. Think of it like looking for a lost wedding ring: you wouldn't leave a mess lying around. Instead, you'd go from room to room and tidy everything up. Throw everything "out" first if need be.

Iff you can make this thing self-contained /and/ reproducible, I will even debug it further for you!

Cheers


[1] No exceptions. Unless the no-exceptions rule has exceptions. Exception-ception.