为什么我的自定义分配器比默认分配器慢

时间:2018-01-04 20:16:14

标签: c++ memory-management

我是C++分配器的新手,花了一整天时间尝试构建自己的分配器。我以A. Alexandrescu的Loki分配器为起点,并参照了这篇教程。最后,我做出了一个可以工作的分配器,正准备休息一下,却发现这个自定义分配器比默认分配器慢得多。以下是完整代码:

#include <chrono>
#include <cstddef>
#include <iostream>
#include <limits>
#include <list>
#include <string>
#include <vector>
using namespace std::chrono;

using uchar = unsigned char;

/// A contiguous slab of equally sized blocks that are handed out one at a time.
///
/// The chunk stores neither its block size nor its block count; the caller
/// passes them to every member function. Free blocks form a singly linked
/// list threaded through the blocks themselves: the first byte of a free block
/// holds the index of the next free block. Indices are a single unsigned char,
/// so one chunk can address at most UCHAR_MAX blocks.
///
/// All members are private; FixedAllocator is declared a friend and is the
/// only user in this file.
class Chunk 
{
private:    

    friend class FixedAllocator;

    /// Allocates storage for `blocks` blocks of `blockSize` bytes each and
    /// links them into the free list.
    void init(size_t blockSize, uchar blocks);

    /// Frees the storage obtained by init().
    void release();

    /// Takes one block off the free list; returns null if none is available.
    void* allocate(size_t blockSize);

    /// Hands the block at `p` back to this chunk.
    void deallocate(void* p, size_t blockSize);

    /// Reports whether `p` points into this chunk's storage.
    ///
    /// @param p         pointer to test
    /// @param chunkLen  total size of the chunk in bytes (blocks * blockSize)
    ///
    /// The range is half-open, [pData, pData + chunkLen): the one-past-the-end
    /// address is excluded because it may be the first block of another chunk.
    inline bool hasBlock(void* p, size_t chunkLen) const
    {
        const uchar* const pc = static_cast<const uchar*>(p);
        return (pData <= pc) && (pc < (pData + chunkLen));
    }

    /// True when every one of the `numBlocks` blocks is free again, i.e. the
    /// chunk no longer holds any live allocation.
    inline bool releasable(uchar numBlocks) const
    {
        return blocksAvailable == numBlocks;
    }

    /// Start of the block storage; null until init() has run.
    uchar* pData = nullptr;

    /// Index of the first block on the free list.
    uchar firstAvailableBlock = 0;

    /// Number of blocks currently free.
    uchar blocksAvailable = 0;
};


void Chunk::init(size_t blockSize, uchar blocks)
{
     // for n of Ts it will allocate n * sizeof(T) memory
    pData = new uchar[blockSize * blocks];
    firstAvailableBlock = 0;
    blocksAvailable = blocks;
    uchar i = 0;
    uchar* p = pData;
    // used by allocate method to move forward firstAvailableBlock 
    for (; i != blocks; p += blockSize) 
    {
          *p = ++i;
    }
}


/// Frees the block storage obtained in Chunk::init.
///
/// The storage is created with `new uchar[...]`, so it must be returned with
/// the array form `delete[]`; handing it to `::operator delete` is a
/// mismatched deallocation and therefore undefined behaviour.
void Chunk::release()
{
    delete[] pData;
    // Null the pointer so an accidental second release() is a harmless no-op.
    pData = nullptr;
}


void* Chunk::allocate(size_t blockSize)
{
     if (!blocksAvailable) return 0;
     // move firstAvailableBlock one block ahead
    uchar* pResult = pData + firstAvailableBlock * blockSize;
    firstAvailableBlock = *pResult;
    --blocksAvailable;
    return pResult;
}


void Chunk::deallocate(void* p, size_t blockSize)
{
    uchar* toRelease = static_cast<uchar*>(p);
    // find last but one available block
    firstAvailableBlock = static_cast<uchar>((toRelease - pData) / blockSize);
    ++blocksAvailable;
}


/// Hands out blocks of one fixed size, backed by a growing list of Chunks.
///
/// The destructor releases the storage of every chunk, so the object owns that
/// memory. A copy would share the same chunk pointers and release them a
/// second time, which is why copying is disabled.
class FixedAllocator 
{
private:
    /// Size in bytes of every block served by this allocator.
    size_t blockSize;
    /// Number of blocks per chunk.
    uchar blocks;
    using Chunks = std::vector<Chunk>;
    /// All chunks currently owned by this allocator.
    Chunks chunks;
    /// Chunk tried first by allocate(); points into `chunks`, or is null.
    Chunk* allocChunk;
public:
    FixedAllocator();
    ~FixedAllocator();

    // Owning type: copying would double-free the chunk storage.
    FixedAllocator(const FixedAllocator&) = delete;
    FixedAllocator& operator=(const FixedAllocator&) = delete;

    /// Sets the block size and derives the number of blocks per chunk from `pageSize`.
    void init(size_t blockSize, size_t pageSize);
    /// Returns one block of `blockSize` bytes.
    void * allocate();
    /// Gives back a block previously obtained from allocate().
    void deallocate(void* p);
};


FixedAllocator::FixedAllocator():
    blockSize(0),
    blocks(0),
    chunks(0),
    allocChunk(nullptr)
{
}


/// Releases the storage of every chunk that is still owned by this allocator.
FixedAllocator::~FixedAllocator()
{
    for (Chunk& chunk : chunks)
    {
        chunk.release();
    }
}


void FixedAllocator::init(size_t blockSize_, size_t pageSize)
{
     blockSize = blockSize_;
    size_t numBlocks = pageSize / blockSize;
    blocks = static_cast<uchar>(numBlocks);
}


void* FixedAllocator::allocate()
{
     if (!allocChunk || allocChunk->blocksAvailable == 0)
    {
        Chunks::iterator it = chunks.begin();    
        for (;;++it)
        {
            if (it == chunks.end())
            {
                 // allocate memory for one more chunk
                chunks.reserve(chunks.size() + 1);
                Chunk newChunk;  
                newChunk.init(blockSize, blocks);
                // add new chunk to memory pool
                chunks.push_back(newChunk);                
                // points to new just initiated chunk
                allocChunk = &chunks.back();
                break;
            }
            if (it->blocksAvailable > 0)
            {
                 // points to chunk with available blocks
                allocChunk = &*it;
                break;            
            }                   
        }
    }
    return allocChunk->allocate(blockSize);
}


void FixedAllocator::deallocate(void* p)
{
    size_t chunkLen = blocks * blockSize;
    Chunks::iterator it;
    int cPos = 0;
    for (it = chunks.begin(); it != chunks.end(); ++it, ++cPos)
    {
        if (it->hasBlock(p, chunkLen))
        {
            it->deallocate(p, blockSize);  
            if (it->releasable(blocks)) {
                it->release();
                chunks.erase(chunks.begin() + cPos);
                // allocChunk may point to deleted chunk
                // so, reset it
                if (!chunks.empty()) {
                    allocChunk = &chunks.back();
                } else {
                    allocChunk = nullptr;                
                }
            } else {
                // there are free blocks in chunk
                // so, reset allocChunk for fast search
                allocChunk = &*it;    
            }
            break;   
        }    
    } 
}


class SmallObjAllocator
{
public:
    SmallObjAllocator(size_t pageSize, size_t maxObjectSize);
    void* allocate(size_t numBytes);
    void deallocate(void* p, size_t numBytes);
private:
    FixedAllocator* pool;
    size_t maxObjectSize;
};


/// Builds one FixedAllocator for every object size from 1 to `maxObjectSize_`.
///
/// @param pageSize        chunk size in bytes handed to each FixedAllocator
/// @param maxObjectSize_  largest object size served from the pool
SmallObjAllocator::SmallObjAllocator(size_t pageSize, size_t maxObjectSize_)
    : pool(new FixedAllocator[maxObjectSize_])
    , maxObjectSize(maxObjectSize_)
{
    // pool[k] serves blocks of k + 1 bytes.
    size_t objectSize = 1;
    for (FixedAllocator* slot = pool; slot != pool + maxObjectSize; ++slot, ++objectSize)
    {
        slot->init(objectSize, pageSize);
    }
}


/// Returns storage for `numBytes` bytes.
///
/// Requests of 1..maxObjectSize bytes are served by the FixedAllocator for
/// exactly that size. Larger requests, and zero-byte requests, go to the
/// global operator new. The zero case has to be excluded from the pool
/// because `numBytes - 1` would wrap around and index far outside `pool`.
void* SmallObjAllocator::allocate(size_t numBytes) {
    if (numBytes == 0 || numBytes > maxObjectSize)
    {
        return ::operator new(numBytes);
    }
    FixedAllocator& alloc = pool[numBytes - 1];
    return alloc.allocate();
}


/// Returns storage previously obtained for a request of `numBytes` bytes.
///
/// @param p         pointer to the storage
/// @param numBytes  size of the original request; it selects where the
///                  storage is returned to
///
/// Sizes of 1..maxObjectSize go back to the FixedAllocator for that size.
/// Larger sizes, and size zero, go to the global operator delete. The zero
/// case has to be excluded from the pool because `numBytes - 1` would wrap
/// around and index far outside `pool`.
void SmallObjAllocator::deallocate(void* p, size_t numBytes)
{
    if (numBytes == 0 || numBytes > maxObjectSize)
    {
        ::operator delete(p);
        return;
    }
    FixedAllocator& alloc = pool[numBytes - 1];
    alloc.deallocate(p);
}


/// Standard-library style allocator adapter on top of SmallObjAllocator.
///
/// @tparam T          element type
/// @tparam numBlocks  number of T-sized blocks used to size one page of the
///                    shared SmallObjAllocator
///
/// The object itself is stateless: every instance of one specialization uses
/// the same static SmallObjAllocator.
template<typename T, size_t numBlocks = 64>
class Allocator  
{
public:
    using value_type = T;

    Allocator() noexcept {}

    /// Converting constructor used when a container rebinds the allocator to
    /// another element type. There is no per-object state to copy, but the
    /// constructor needs a definition; a declaration alone fails at link time
    /// as soon as it is used.
    template<typename U, size_t N>
    Allocator(Allocator<U, N> const&) noexcept {}

    /// Needed explicitly because of the non-type template parameter.
    template<typename U>
    struct rebind 
    {
        using other = Allocator<U, numBlocks>;
    };

    /// Returns uninitialized storage for `cnt` objects of type T.
    T* allocate(size_t cnt) 
    {
        return static_cast<T*>(allocator.allocate(sizeof(T) * cnt));
    }

    /// Returns storage obtained from allocate(); `cnt` must be the value that
    /// was passed to allocate().
    void deallocate(T* p, size_t cnt) 
    {
        allocator.deallocate(p, sizeof(T) * cnt);
    }

    /// Copy-constructs a T from `val` in the storage at `p`.
    void construct(T* p, T const& val) 
    {
        ::new(static_cast<void*>(p)) T(val);
    } 

    /// Runs the destructor of the object at `p` without freeing its storage.
    void destroy(T* p) 
    {
        p->~T();
    } 

    /// All instances of one specialization share the same static pool, so
    /// storage obtained through one can be returned through any other.
    friend bool operator==(Allocator const&, Allocator const&) noexcept { return true; }
    friend bool operator!=(Allocator const&, Allocator const&) noexcept { return false; }

private:
    /// Shared by every instance of this specialization.
    static SmallObjAllocator allocator;       
};


// Definition of the static pool shared by all instances of one specialization:
// the page size is numBlocks * sizeof(T) bytes and the largest pooled object
// size is sizeof(T) bytes.
template<typename T, size_t numBlocks>
SmallObjAllocator Allocator<T, numBlocks>::allocator{numBlocks * sizeof(T), sizeof(T)};


/// Times 10000 push_back calls on `l` and prints `comment` followed by the
/// elapsed time in whole milliseconds.
///
/// @param comment  label printed in front of the measurement
/// @param l        container to fill; taken by value, so the caller's object is untouched
template<class List>
void test(std::string comment, List l)
{
    std::cout << comment;

    const auto begin = high_resolution_clock::now();
    int value = 0;
    while (value < 10000)
    {
        l.push_back(value);
        ++value;
    }
    const auto finish = high_resolution_clock::now();

    const auto elapsed = duration_cast<milliseconds>(finish - begin);
    std::cout << elapsed.count() << "ms" << std::endl;
}


int main() {     
     test("default list ", std::list<int>());   
     test("list with custom allocator ", std::list<int, Allocator<int, 10000>>());
     return 0;
}

正如您所看到的,我在客户端代码中做了一些简单的性能测量,结果显示填充默认列表耗时0ms,而填充使用自定义分配器的列表耗时3ms。我原以为问题全部出在deallocate方法上,于是把它注释掉了,但结果仍然相同。那么,这种性能下降的原因可能是什么?我遗漏了什么?

1 个答案:

答案 0 :(得分:2)

默认分配器(std::allocator)通常实现为对new/delete的一层相对较薄的封装。

示例中的分配器看起来是一种混合的sub/bump(指针递增)分配器。简而言之,当分配器的内存耗尽时,它会向系统申请一大块内存,然后以bump方式从可用的块中逐个分配。

除其他外,请考虑:

  • 这不是线程安全的。并发访问最终会破坏它。这对于使用单个线程的孤立性能分析无关紧要,但仍然是一个重要的考虑因素。
  • 它到处都在手动管理内存。例如,Chunk管理着内存却没有析构函数,必须调用Chunk::release才能销毁它(即在~FixedAllocator()中)。请使用RAII来避免手动内存管理(即使是在编写分配器时):

    class Chunk
    {
        // private: not required, classes are private by default.
        friend class FixedAllocator;
    
        // Replaced init(...) with constructor.
        Chunk(size_t blockSize, uchar block) :
            pData(new uchar[blockSize * blocks]),
            firstAvailableBlock(0),
            blocksAvailable(blocks) 
        {
            uchar* p = pData; 
            for (uchar i = 0; i != blocks; p += blockSize) 
            {
                *p = ++i;
            }
        }
        Chunk(const Chunk& other) = delete; // Disable copy construction.
        Chunk(Chunk&& other) :
            pData(std::move(other.pData)),
            firstAvailableBlock(other.firstAvailableBlock),
            blocksAvailable(other.blocksAvailable) 
        { 
            other.firstAvailableBlock = 0;
            other.blocksAvailable = 0;
        }
    
        Chunk& operator=(const Chunk&& other) = delete; // Disable copy assignment.
        Chunk& operator=(Chunk&& other)
        {
            pData = std::move(other.pData);
            firstAvailableBlock = other.firstAvailableBlock;
            blocksAvailable = other.blocksAvailable;
            other.firstAvailableBlock = 0;
            other.blocksAvailable = 0;
            return *this;
        }
    
        //...
        void release()
        {
            pData.reset();
        }
        //...
    
        std::unique_ptr<uchar[]> pData; // Automatically deleted in the implicitly generated destructor.
        uchar firstAvailableBlock, blocksAvailable; 
    };
    
    // And of course don't forget to update chunk creation:
    //...
    // Construct the chunk as a temporary; push_back moves it into the vector.
    chunks.push_back(Chunk(blockSize, blocks));
    //...
    
  • Chunk::hasBlock没有考虑空洞。如果先分配10字节/5字节/10字节,然后释放其中5字节的块,那么对于落在这5字节块内的范围,hasBlock将返回false,即使该空间实际上是可用的。要正确修复这一点,需要一套跟踪分配情况的机制。

它的速度较慢,因为它比典型的std::allocator实现做了更多的整体工作。

  • 小对象的尺寸设为sizeof(int),很可能是4。std::list节点的大小至少为12(后向指针(4-8)、前向指针(4-8)、对象(4+))。因此,至少对于列表节点,SmallObjAllocator::allocate()和SmallObjAllocator::deallocate()不会调用new和delete,而是始终调用FixedAllocator::allocate()和FixedAllocator::deallocate()。

  • FixedAllocator::allocate()和FixedAllocator::deallocate()速度很慢。它们都执行线性搜索,在最坏的情况下需要遍历所有块。即使在一般情况下,也有大量时间花在分配器上,而不是花在程序本身上。优化这两个函数将带来最大的收益。

  • 您的分配器的blockSize设置为sizeof(int) * 10000(可能是40k)。因此,向std::list<int>中插入1万个元素至少需要120kb(sizeof(node) * 10000),所以在您的示例中,FixedAllocator可能至少要调整两次大小(假设每次调整都将大小加倍)。您可以把blockSize设置得足够大,从而完全避免调整大小:Allocator<int, 100000>(100k)对您的示例来说应该足够了。

分配器是一个非常复杂的主题,老实说,细节实在太多,要完整解释如何优化你的示例,恐怕得写一篇短篇小说。我建议阅读有关分配器设计的资料,并研究现实世界中使用的分配器,以便更好地理解该主题。

请参阅: