这段代码是否还有进一步优化的空间?

时间:2011-09-01 03:29:29

标签: c optimization malloc

我最近的任务是改进一些嵌入式代码,其中malloc花费了不成比例的时间。

客户希望通过一些非常简单的搜索和替换命令(即,对源代码没有大规模,复杂的更改)实现简单的修复,同时提供实质性的好处。

经过分析,看起来绝大多数(约80%)的分配都是128字节或更少。

为此,我写了一些名为 quickpool 的概念验证代码,其思想是在启动时用一次 malloc 完成整体分配(足以容纳例如 1024 个块,每块 128 字节),之后的小块分配请求都由这个快速池来满足。我希望省去 malloc 中较复杂的计算(之所以复杂,是因为它必须处理不同大小的块)能够带来速度提升。

然后,所有源代码都更改为使用quickpool而不是常规malloc。如果Quickpool用尽或分配所需的大小超过块大小,请求将传递到malloc

结果并不差,让我(大约)减少了50%的时间,但我想知道我是否可以做得更好。

quickpool 的各个函数如下所示。首先是头文件:

#ifndef QUICKPOOL_H_INCLUDED
#define QUICKPOOL_H_INCLUDED

#include <stdlib.h>

// Opaque pool handle; the structure layout is private to the implementation.
typedef struct qp_pool_s qp_pool;

// Create a pool of `quant` blocks of `sz` bytes each. Passing 0 for either
// argument selects the implementation's default for it. Returns NULL when
// the memory cannot be obtained.
qp_pool *qp_create (size_t quant, size_t sz);

// Release a pool and its storage; a NULL pool is accepted. Always returns
// NULL so callers can write `pool = qp_destroy (pool);`.
qp_pool *qp_destroy (qp_pool *pool);

// Allocate `sz` bytes from `pool`. The request is passed to malloc() when
// `pool` is NULL, when `sz` exceeds the pool's block size, or when the pool
// has no free block left.
void *qp_malloc (qp_pool *pool, size_t sz);

// Release `ptr`. Addresses inside the pool's storage are returned to the
// pool; anything else is passed to free(). Always returns NULL.
void *qp_free (qp_pool *pool, void *ptr);

#endif

create 和 destroy 函数允许使用不同的块大小和块数量来创建不同的池:

#include <string.h>
#include "quickpool.h"

// Defaults applied by qp_create() when the caller passes 0 for a parameter.

#define DEFLT_BLKS 1024
#define DEFLT_SZ    128

// Bookkeeping for one fixed-block pool: a contiguous data area split into
// total_blks blocks of blk_sz bytes, plus a one-byte-per-block usage map.
struct qp_pool_s {
    char *data;         // Start of the block storage (total_blks * blk_sz bytes).
    char *enddata;      // First byte beyond data; upper bound for ownership tests.
    char *isused;       // One byte per block; non-zero while the block is handed out.
    size_t total_blks;  // Number of blocks in pool.
    size_t free_blks;   // Number of blocks currently free.
    size_t blk_sz;      // Size of each block in bytes.
    size_t low_free;    // Search hint: no block below this index is free.
};

// Create a pool of `quant` blocks of `sz` bytes each.
//
// A zero `quant` or `sz` selects DEFLT_BLKS / DEFLT_SZ respectively.
// Returns the new pool, or NULL if the requested geometry does not fit in a
// size_t or if any allocation fails (nothing is leaked on failure).
qp_pool *qp_create (size_t quant, size_t sz) {
    // Zero means a default size.

    if (quant == 0) quant = DEFLT_BLKS;
    if (sz == 0)    sz = DEFLT_SZ;

    // Reject geometries whose byte count would wrap around; otherwise the
    // data area would be smaller than the block indices assume.

    if (quant > (size_t)-1 / sz) return NULL;

    // Allocate the pool descriptor, the block storage and the usage map.

    qp_pool *pool = malloc (sizeof (*pool));
    if (pool == NULL) return NULL;

    if ((pool->data = malloc (quant * sz)) == NULL) {
        free (pool);
        return NULL;
    }

    // calloc returns the map already zeroed, i.e. every block starts free.

    if ((pool->isused = calloc (quant, 1)) == NULL) {
        free (pool->data);
        free (pool);
        return NULL;
    }

    // Set information on pool and return it.

    pool->enddata = pool->data + quant * sz;
    pool->total_blks = quant;
    pool->free_blks = quant;
    pool->blk_sz = sz;
    pool->low_free = 0;

    return pool;
}

// Tear down a pool created by qp_create(). A NULL pool is a no-op.
// The return value is always NULL, so `p = qp_destroy (p);` clears the
// caller's handle in one statement.
qp_pool *qp_destroy (qp_pool *pool) {
    if (pool == NULL)
        return NULL;

    // Release the block storage and usage map before the descriptor that
    // holds the pointers to them.
    free (pool->data);
    free (pool->isused);
    free (pool);

    return NULL;
}

然后是与 malloc 和 free 对应的函数:

// Allocate `sz` bytes, preferring a block from `pool`.
//
// The request is passed to malloc() when there is no pool, when `sz` is
// larger than the pool's block size, or when the pool has no free block.
// Returns a pointer to the block (or whatever malloc() returned).
void *qp_malloc (qp_pool *pool, size_t sz) {
    size_t index;   // size_t to match the pool's block counters.

    // If no pool, need more than one block or pool empty, use default.

    if (pool == NULL) return malloc (sz);

    if ((sz > pool->blk_sz) || (pool->free_blks == 0))
        return malloc (sz);

    // Otherwise, get from quickpool. Scan the usage map upward from the
    // lowest index that can be free; the bound keeps the scan inside the
    // map even if the counters have been corrupted (e.g. by a double free).

    for (index = pool->low_free;
         (index < pool->total_blks) && pool->isused[index];
         index++)
        ;

    if (index >= pool->total_blks) {
        // No free block was actually found: resynchronise the bookkeeping
        // and let the system allocator serve the request.
        pool->free_blks = 0;
        pool->low_free = pool->total_blks;
        return malloc (sz);
    }

    // Then we mark it used.

    pool->isused[index] = 1;
    pool->free_blks--;

    // Everything up to and including this block is now in use, so the next
    // search can start just after it.

    pool->low_free = index + 1;

    return pool->data + index * pool->blk_sz;
}

// Release `ptr`, which was obtained from qp_malloc() with the same pool.
//
// Addresses inside the pool's data area are marked free in the usage map;
// every other pointer (including NULL, or any pointer when `pool` is NULL)
// is handed to free(). Always returns NULL.
void *qp_free (qp_pool *pool, void *ptr) {
    size_t index;   // size_t to match the pool's block counters.

    // No pool created, use default.

    if (pool == NULL) {
        free (ptr);
        return NULL;
    }

    // Not in quick pool, use default.

    if (((char*)ptr < pool->data) || ((char*)ptr >= pool->enddata)) {
        free (ptr);
        return NULL;
    }

    // This is a quickpool address: work out which block it belongs to.
    // The offset is non-negative because ptr >= pool->data was checked.

    index = (size_t)((char*)ptr - pool->data) / pool->blk_sz;

    // Ignore a block that is already free; counting it again would make
    // free_blks larger than the number of blocks that are really free.

    if (!pool->isused[index])
        return NULL;

    pool->isused[index] = 0;
    pool->free_blks++;

    // Optimise next search.

    if (index < pool->low_free)
        pool->low_free = index;

    return NULL;
}

为完整起见,主要的测试程序如下:

#include <stdio.h>
#include <string.h>
#include <time.h>

#include "quickpool.h"

// Operation codes picked for each iteration of the benchmark loop in main().
#define FREE  0
#define ALLOC 1

// Table of outstanding allocations made by main(): at most NUMPTRS entries,
// of which the first numPointers are live.
#define NUMPTRS 512
static void *pointer[NUMPTRS];
static size_t numPointers = 0;

// Benchmark driver.
//
// Usage: <program> <seed> <standard|quickpool>
//   seed  - integer passed to srand() so both modes see the same sequence.
//   mode  - "standard" uses malloc/free directly; anything else goes
//           through the quickpool functions.
//
// Performs one million randomly chosen allocate/free operations on a table
// of at most NUMPTRS pointers and prints the elapsed CPU time in
// milliseconds on a single line. Returns 0 on success, 1 on a usage error.
int main (int argCount, char *argVal[]) {
    int count, val, stMode, seed;
    size_t index, memsz;
    clock_t start, elapsed;
    qp_pool *quickPool;

    // Both arguments are required; without this check argVal[1]/argVal[2]
    // would be read past the end of the argument list.
    if (argCount < 3) {
        fprintf (stderr, "usage: %s <seed> <standard|quickpool>\n",
                 (argCount > 0) ? argVal[0] : "qptest");
        return 1;
    }

    seed = atoi (argVal[1]);
    stMode = (strcmp (argVal[2], "standard") == 0);

    srand ((unsigned) seed);
    start = clock();
    quickPool = qp_create (0, 0);

    for (count = 0; count < 1000000; count++) {
        // Choose the operation: forced at the table limits, otherwise
        // biased toward allocating while the table is less than half full.
        if (numPointers == 0)
            val = ALLOC;
        else if (numPointers == NUMPTRS)
            val = FREE;
        else if (numPointers > NUMPTRS/2)
            val = ((rand() % 100) < 50) ? FREE : ALLOC;
        else
            val = ((rand() % 100) < 33) ? FREE : ALLOC;

        if (val == FREE) {
            // Free a random live entry and move the last entry into its
            // slot so the live entries stay contiguous.
            index = (size_t) rand() % numPointers;
            if (stMode)
                free (pointer[index]);
            else
                qp_free (quickPool, pointer[index]);
            pointer[index] = pointer[--numPointers];
        } else {
            // Sizes range over 0..159, so some requests exceed the default
            // 128-byte block and exercise the malloc fallback.
            memsz = (size_t) rand() % 160;
            if (stMode)
                pointer[numPointers++] = malloc (memsz);
            else
                pointer[numPointers++] = qp_malloc (quickPool, memsz);
        }
    }

    quickPool = qp_destroy (quickPool);

    // Convert in long long: with CLOCKS_PER_SEC == 1000000 the product
    // ticks * 1000 no longer fits in an int after roughly two seconds.
    elapsed = clock() - start;
    printf ("%lld\n", (long long) elapsed * 1000 / (long long) CLOCKS_PER_SEC);

    return 0;
}

以及用于分析的shell脚本:

#!/usr/bin/bash
# Benchmark driver: runs qptest.exe 20 times in each mode with the same
# random seed per pair of runs, prints one row per pair, and finishes with
# the totals and the quickpool time as a percentage of the standard time.
normal=0
quick=0
printf "    %10s  %10s\n" Normal Quick
printf "    ==========  ==========\n"
for iter1 in 0 1 ; do
    for iter2 in 0 1 2 3 4 5 6 7 8 9 ; do
        seed=$RANDOM

        # Row label is passed as arguments rather than being spliced into
        # the format string.
        val=$(./qptest.exe "$seed" standard)
        printf "%s%s  %10d  " "$iter1" "$iter2" "$val"
        ((normal += val))

        val=$(./qptest.exe "$seed" quickpool)
        printf "%10d\n" "$val"
        ((quick += val))
    done
done
printf "    ==========  ==========\n"
# Avoid a division-by-zero error when every standard run reported 0 ms.
if ((normal > 0)); then
    ((pct = quick * 100 / normal))
else
    pct=0
fi
printf "sum %10d  %10d (%d%%)\n" "$normal" "$quick" "$pct"

输出:

        Normal       Quick
    ==========  ==========
00         469         219
01         453         219
02         453         235
03         453         219
04         453         235
05         453         219
06         469         219
07         453         234
08         453         219
09         453         219
10         453         219
11         453         235
12         453         235
13         453         219
14         453         219
15         453         235
16         453         235
17         453         235
18         469         219
19         469         234
    ==========  ==========
sum       9124        4522 (49%)

现在我的问题如下:在不违背以下要求的前提下,quickpool 代码是否还有其他可以优化的地方:

  • 易于集成的修复程序,只需要对源代码进行简单的全局搜索和替换。
  • 能够创建不同大小的池(块的数量不同),并使用不同大小的块。
  • 如果快速池无法满足请求,则回退到 malloc。

4 个答案:

答案 0 :(得分:4)

如果你能够花费一点内存(比如每个块的指针),那么你可以将空闲块保留在LIFO堆栈中并消除qp_malloc()和qp_free()中的搜索。它确实使代码稍微复杂一点,但确保所有分配的时间为O(1)。

答案 1 :(得分:4)

我在你的版本基础上获得了不错的加速(在我的硬件上大约 25%),方法是用空闲链表(free list)代替空闲标记表(free map),同时保持现有接口不变。

作为奖励,代码甚至变得更简单:

#include <string.h>
#include "quickpool.h"

// Defaults applied by qp_create() when the caller passes 0 for a parameter.

#define DEFLT_BLKS 1024
#define DEFLT_SZ    128

// Bookkeeping for one fixed-block pool whose free blocks are chained
// together through a pointer stored in the first bytes of each free block.
struct qp_pool_s {
    char *data;         // Start of the block storage.
    char *enddata;      // First byte beyond data; upper bound for ownership tests.
    size_t blk_sz;      // Size of each block in bytes (as rounded up by qp_create).
    void *next_free;    // Head of the free chain, or NULL when no block is free.
};

// Create a pool of `quant` blocks of at least `sz` bytes each.
//
// A zero `quant` or `sz` selects DEFLT_BLKS / DEFLT_SZ respectively. The
// block size is rounded up to a multiple of sizeof(void *) because every
// free block stores the link to the next free block in its first bytes.
// Returns the new pool, or NULL if the requested geometry does not fit in
// a size_t or if an allocation fails (nothing is leaked on failure).
qp_pool *qp_create (size_t quant, size_t sz)
{
    char *blk;
    char *last;

    // Zero means a default size.  sizeof(void *) is minimum block size.

    if (quant == 0) quant = DEFLT_BLKS;
    if (sz == 0)
        sz = DEFLT_SZ;

    /* The round-up below must not wrap around to a tiny (or zero) size. */
    if (sz > (size_t)-1 - (sizeof(void *) - 1))
        return NULL;

    /* Round up size to a multiple of sizeof(void *) */
    sz = (sz + sizeof(void *) - 1) & ~(sizeof(void *) - 1);

    /* Likewise the total byte count must be representable. */
    if (quant > (size_t)-1 / sz)
        return NULL;

    // Allocate memory blocks for pool.

    qp_pool *pool = malloc (sizeof (*pool));
    if (pool == NULL) return NULL;

    if ((pool->data = malloc (quant * sz)) == NULL) {
        free (pool);
        return NULL;
    }

    /* Set up free chain: each block points at its successor and the last
       block terminates the chain with NULL. */
    last = pool->data + (quant - 1) * sz;
    for (blk = pool->data; blk < last; blk += sz)
        *(void **)blk = blk + sz;
    *(void **)blk = NULL;
    pool->next_free = pool->data;

    // Set information on pool and return it.

    pool->enddata = pool->data + quant * sz;
    pool->blk_sz = sz;

    return pool;
}

// Tear down a pool created by qp_create(). A NULL pool is a no-op.
// The return value is always NULL, so `p = qp_destroy (p);` clears the
// caller's handle in one statement.
qp_pool *qp_destroy (qp_pool *pool) {
    if (pool == NULL)
        return NULL;

    // Block storage first, then the descriptor that points at it.
    free (pool->data);
    free (pool);

    return NULL;
}

// Allocate `sz` bytes, preferring a block from `pool`.
//
// The system allocator serves the request when there is no pool, when `sz`
// is larger than one pool block, or when the free chain is empty.
void *qp_malloc (qp_pool *pool, size_t sz) {
    if (pool == NULL || sz > pool->blk_sz || pool->next_free == NULL)
        return malloc (sz);

    // Pop the head of the free chain. The first bytes of a free block hold
    // the address of the next free block, which becomes the new head.
    void **head = pool->next_free;
    pool->next_free = *head;

    return head;
}

// Release `ptr`, which was obtained from qp_malloc() with the same pool.
//
// Addresses inside the pool's data area are pushed back onto the free
// chain; every other pointer (or any pointer when `pool` is NULL) is handed
// to free(). Always returns NULL.
void *qp_free (qp_pool *pool, void *ptr) {
    char *addr = ptr;
    int from_pool = (pool != NULL)
                 && (addr >= pool->data)
                 && (addr < pool->enddata);

    if (!from_pool) {
        free (ptr);
        return NULL;
    }

    // Push the block onto the front of the free chain: store the old head
    // in the block's first bytes, then make the block the new head.
    void **link = ptr;
    *link = pool->next_free;
    pool->next_free = ptr;

    return NULL;
}

结果(我的系统malloc显然非常快):

        Normal       Quick         Caf
    ==========  ==========  ==========
00         210         140         100
01         130         140         100
02         130         130         100
03         130         140         100
04         130         130         100
05         130         130          90
06         130         140         100
07         130         130         100
08         130         140         100
09         130         140         100
10         120         140         100
11         120         140         100
12         130         130         100
13         130         140         100
14         120         130         110
15         130         140         100
16         130         130         100
17         130         140         100
18         130         130         100
19         130         140         100
    ==========  ==========  ==========
sum       2650        2720        2000
                (    102%)  (     75%)

答案 2 :(得分:2)

您可以将已使用/未使用的块标志存储为数组中的单个位——假设是 32 位系统,一个 32 位整数就可以存储 32 个标志。

这样,查找空闲块时只需逐个遍历这些整数,直到找到一个不等于 0xffffffff 的即可。

答案 3 :(得分:1)

此时我认为您应该使用代码分析工具(例如gprof)来确定新代码的哪些部分确实花费了您最多的时间。也许值得在整个程序中运行一个配置文件来确定在哪里花时间进行优化。