我正在帮朋友优化下面这个程序的内存使用:
#define _GNU_SOURCE
#include "myutils.h"
#include <limits.h>
#include <pthread.h>
/* Thread body, defined later in this file. main() passes each invocation a
   pointer to one TCB element and also calls it directly for the TCB with pid 0. */
void * mySPMDMain(void *);
/* Barrier shared by every invocation of mySPMDMain; main() initialises it with
   the thread count and destroys it after the threads have been joined. */
pthread_barrier_t mybarrier;
/* Allocates a zero-filled array of `count` elements of `size` bytes each and
   terminates the process if the allocation fails. At least one element is
   always requested because a zero-sized request may legally return NULL. */
static void *allocOrDie(size_t count, size_t size)
{
    void *block = calloc(count > 0 ? count : 1, size);
    if (block == NULL)
    {
        perror("calloc");
        exit(EXIT_FAILURE);
    }
    return block;
}

/*
 * Driver: ./psrs <Number of Keys> <Number of threads>
 *
 * Fills an array with N pseudo-random keys, splits it into one chunk per
 * thread, runs mySPMDMain on every chunk (thread 0 is the calling thread),
 * gathers the per-thread results back into the array and asserts that the
 * array is sorted.
 *
 * Requires 1 <= threads <= keys <= INT_MAX. Any other input, a failed
 * allocation or a failed pthread call ends the process with EXIT_FAILURE.
 * Returns 0 on success.
 */
int main (int argc, char ** argv) {
    if (argc != 3)
    {
        fprintf(stderr, "usage: ./psrs <Number of Keys> <Number of threads>\n");
        exit(EXIT_FAILURE);
    }

    /* strtol instead of atoi so that non-numeric input can be rejected. */
    char *keysEnd = NULL;
    char *threadsEnd = NULL;
    const long keysArg = strtol(argv[1], &keysEnd, 10);
    const long threadsArg = strtol(argv[2], &threadsEnd, 10);
    if (keysEnd == argv[1] || *keysEnd != '\0' ||
        threadsEnd == argv[2] || *threadsEnd != '\0' ||
        threadsArg < 1 || keysArg < threadsArg || keysArg > INT_MAX)
    {
        /* threads >= 1 avoids the division by zero below; keys >= threads
           guarantees every chunk holds at least the element that is sampled. */
        fprintf(stderr, "error: need 1 <= <Number of threads> <= <Number of Keys> <= %d\n", INT_MAX);
        exit(EXIT_FAILURE);
    }

    const int N = (int) keysArg;
    const int NUM_THREADS = (int) threadsArg;
    const int allChunkSize = N / NUM_THREADS;
    /* Keys that do not divide evenly are given to the last thread instead of
       being dropped. */
    const int leftover = N % NUM_THREADS;

    if (pthread_barrier_init(&mybarrier, NULL, (unsigned) NUM_THREADS) != 0)
    {
        fprintf(stderr, "error: pthread_barrier_init failed\n");
        exit(EXIT_FAILURE);
    }

    /* Heap allocation rather than a VLA: the size comes from the command line. */
    pthread_t *ThreadID = allocOrDie((size_t) NUM_THREADS, sizeof *ThreadID);

    long int *originArray = allocOrDie((size_t) N, sizeof *originArray);
    for (int i = 0; i < N; ++i)
    {
        originArray[i] = random() % 100000;
    }

    /* Initialize the data space of every thread. */
    TCB *myTCB = allocOrDie((size_t) NUM_THREADS, sizeof *myTCB);
    for (int i = 0; i < NUM_THREADS; ++i)
    {
        TCB *t = &myTCB[i];
        const int chunkLen = allChunkSize + ((i == NUM_THREADS - 1) ? leftover : 0);

        t->Chunk = allocOrDie((size_t) chunkLen, sizeof(long int));
        t->passLength = allocOrDie((size_t) NUM_THREADS, sizeof(int));
        t->samples = allocOrDie((size_t) NUM_THREADS, sizeof(long int));
        /* Zero-filled so the clean-up loop may free every slot even if the
           thread function never filled it. */
        t->tmpMergeSpace = allocOrDie((size_t) NUM_THREADS, sizeof(long int *));
        t->pivotArray = allocOrDie((size_t) NUM_THREADS * (size_t) NUM_THREADS, sizeof(long int));
        t->selectedPivot = allocOrDie((size_t) NUM_THREADS - 1, sizeof(long int));
        t->eachStartIndex = allocOrDie((size_t) NUM_THREADS, sizeof(int));
        t->mergeLength = allocOrDie((size_t) NUM_THREADS, sizeof(int));
        t->sampleIndex = allocOrDie((size_t) NUM_THREADS - 1, sizeof(int));
        t->resultArr = NULL;

        memcpy(t->Chunk, &originArray[(size_t) i * (size_t) allChunkSize],
               (size_t) chunkLen * sizeof(long int));

        t->pid = i;
        t->N = N;
        t->num_threads = NUM_THREADS;
        t->ChunkSize = chunkLen;
        /* Computed in long long so that threads * threads cannot overflow int. */
        t->offSet = (int) ((long long) N / ((long long) NUM_THREADS * NUM_THREADS));
    }

    struct timeval start;
    struct timeval end;
    gettimeofday(&start, NULL);

    for (int i = 1; i < NUM_THREADS; ++i)
    {
        /* A missing thread would leave the others blocked on the barrier
           forever, so a creation failure is fatal. */
        if (pthread_create(&ThreadID[i], NULL, mySPMDMain, (void *) &myTCB[i]) != 0)
        {
            fprintf(stderr, "error: pthread_create failed for thread %d\n", i);
            exit(EXIT_FAILURE);
        }
    }
    /* The calling thread does the work of thread 0. */
    mySPMDMain((void *) &myTCB[0]);
    for (int i = 1; i < NUM_THREADS; ++i)
    {
        pthread_join(ThreadID[i], NULL);
    }

    gettimeofday(&end, NULL);
    /* Elapsed wall-clock time in seconds; computed but currently not reported. */
    double time_spent = ((double) (end.tv_sec - start.tv_sec) * 1.0e6
                         + (double) (end.tv_usec - start.tv_usec)) / 1000000;
    (void) time_spent;

    /* Gather the per-thread results back into originArray, in thread order. */
    memset(originArray, 0, (size_t) N * sizeof(long int));
    int cursor = 0;
    for (int i = 0; i < NUM_THREADS; ++i)
    {
        const int produced = sumTotal(myTCB[i].mergeLength, NUM_THREADS);
        if (produced < 0 || produced > N - cursor)
        {
            /* Never copy past the end of originArray. */
            fprintf(stderr, "error: thread %d reported an impossible result length\n", i);
            exit(EXIT_FAILURE);
        }
        if (produced > 0)
        {
            memcpy(&originArray[cursor], myTCB[i].resultArr, (size_t) produced * sizeof(long int));
        }
        cursor += produced;
    }

    /* Clean up and exit. */
    pthread_barrier_destroy(&mybarrier);
    assert(cursor == N);
    assert(isSorted(originArray, N) == 1);

    for (int i = 0; i < NUM_THREADS; ++i)
    {
        free(myTCB[i].mergeLength);
        free(myTCB[i].eachStartIndex);
        free(myTCB[i].sampleIndex);
        free(myTCB[i].selectedPivot);
        free(myTCB[i].pivotArray);
        free(myTCB[i].samples);
        free(myTCB[i].passLength);
        free(myTCB[i].Chunk);
        free(myTCB[i].resultArr);
        for (int j = 0; j < NUM_THREADS; j++)
        {
            free(myTCB[i].tmpMergeSpace[j]);
        }
        free(myTCB[i].tmpMergeSpace);
    }
    free(myTCB);
    free(originArray);
    free(ThreadID);
    return 0;
}
#define MASTER if(localId == 0)
#define BARRIER pthread_barrier_wait(&mybarrier)
void * mySPMDMain(void *arg)
{
TCB * localTCB;
int localId;
/* Actual parameter */
localTCB = (TCB *)arg;
/* Other parameters passed in via global */
localId = localTCB -> pid;
/* Parallel array to TCB */
qsort(localTCB -> Chunk, localTCB -> ChunkSize, sizeof(long int), cmpfunc);
BARRIER;
// Timing
/* Phase 1 */
for (int i = 0; i< localTCB -> num_threads; ++i)
{
long int sample = localTCB->Chunk[i*localTCB -> offSet];
localTCB->samples[i]=sample;
}
BARRIER;
/* Phase 2 */
MASTER {
for (int i = 0; i < localTCB -> num_threads; ++i)
{
memcpy(&(localTCB->pivotArray[i*localTCB -> num_threads]),localTCB[i].samples, localTCB -> num_threads*sizeof(long int));
}
qsort(localTCB -> pivotArray, localTCB -> num_threads*localTCB -> num_threads, sizeof(long int), cmpfunc);
for (int i = 0; i<(localTCB -> num_threads)-1; ++i)
{
localTCB-> selectedPivot[i] = localTCB-> pivotArray[(i+1)*localTCB -> num_threads];
}
for (int i = 1; i<localTCB -> num_threads; ++i)
{
memcpy(localTCB[i].selectedPivot,localTCB[0].selectedPivot,((localTCB -> num_threads)-1)*sizeof(long int));
}
}
BARRIER;
/* Phase 3 */
for (int i = 0; i<(localTCB -> num_threads)-1; ++i)
{
localTCB -> sampleIndex[i] = binarySearch(localTCB->Chunk,0,localTCB->ChunkSize -1, localTCB->selectedPivot[i]);
}
for (int i = 0; i<localTCB -> num_threads; ++i)
{
if(i == 0) localTCB -> eachStartIndex[i] = 0;
else localTCB -> eachStartIndex[i] = localTCB -> sampleIndex[i-1];
}
for (int i = 0; i<localTCB -> num_threads; ++i)
{
if (i == 0) localTCB -> passLength[i] = localTCB -> sampleIndex[i];
localTCB -> passLength[i] = localTCB -> sampleIndex[i] - localTCB -> sampleIndex[i-1];
if (i == (localTCB -> num_threads)-1) localTCB -> passLength[i] = localTCB -> ChunkSize - localTCB -> sampleIndex[i-1];
}
BARRIER;
/* Phase 4 */
MASTER {
for (int i = 0; i<localTCB -> num_threads; ++i)
{
for (int j = 0; j<localTCB -> num_threads; ++j)
{
// each temp merge space = localTCB[i].tmpMergeSpace[j]
// each cpy start = localTCB[j].Chunk[localTCB[j].eachStartIndex[i]]
// each cpy length = localTCB[j].passLength[i] * sizeof(long int)
localTCB[i].tmpMergeSpace[j] = (long int*) malloc(sizeof(long int)*localTCB[j].passLength[i]);
localTCB[i].mergeLength[j] = localTCB[j].passLength[i];
memcpy(localTCB[i].tmpMergeSpace[j], &(localTCB[j].Chunk[localTCB[j].eachStartIndex[i]]), (localTCB[j].passLength[i]) * sizeof(long int));
}
localTCB[i].resultArr = (long int *) malloc(sizeof(long int)*sumTotal(localTCB[i].mergeLength, localTCB -> num_threads));
}
}
BARRIER;
multimerge(localTCB->tmpMergeSpace,localTCB->mergeLength,localTCB -> num_threads,localTCB->resultArr);
BARRIER;
//Timing
return NULL;
} /* mySPMDMain*/
TCB 结构体的定义如下:
/* Per-thread working set of the sort. main() allocates one of these per
   thread and hands its address to mySPMDMain. */
typedef struct ThreadControlBlock {
    long *Chunk;            /* this thread's ChunkSize keys, sorted in place by the thread */
    int *passLength;        /* [num_threads] length of each partition of Chunk */
    int *sampleIndex;       /* [num_threads-1] binarySearch result for each pivot in Chunk */
    int *eachStartIndex;    /* [num_threads] start index in Chunk of each partition */
    long **tmpMergeSpace;   /* [num_threads] copies of the runs received from every thread */
    long *pivotArray;       /* [num_threads*num_threads] samples gathered from all threads */
    long *selectedPivot;    /* [num_threads-1] pivots chosen from pivotArray */
    long *samples;          /* [num_threads] samples taken from Chunk */
    int *mergeLength;       /* [num_threads] length of each run in tmpMergeSpace */
    long *resultArr;        /* output buffer passed to multimerge */
    int pid;                /* thread index; 0 is the master */
    int N;                  /* total number of keys */
    int num_threads;        /* number of threads taking part */
    int ChunkSize;          /* number of keys in Chunk */
    int offSet;             /* distance between two samples in Chunk */
} TCB;
我使用 valgrind 检查是否存在内存泄漏。valgrind 报告程序在结束时已释放全部内存,但在执行线程函数时(包括在主线程上执行的 pid==0 那一份)出现了大小为 4 的无效读取:
==6150== Memcheck, a memory error detector
==6150== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
==6150== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
==6150== Command: ./psrs 8000000 8
==6150==
==6150== Thread 8:
==6150== Invalid read of size 4
==6150== at 0x400DC2: mySPMDMain (psrs.c:169)
==6150== by 0x4E3ADC4: start_thread (in /usr/lib64/libpthread-2.17.so)
==6150== by 0x514673C: clone (in /usr/lib64/libc-2.17.so)
==6150== Address 0x911cd5c is 4 bytes before a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150== Invalid read of size 4
==6150== at 0x400DBE: mySPMDMain (psrs.c:169)
==6150== by 0x4E3ADC4: start_thread (in /usr/lib64/libpthread-2.17.so)
==6150== by 0x514673C: clone (in /usr/lib64/libc-2.17.so)
==6150== Address 0x911cd7c is 0 bytes after a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150== Thread 1:
==6150== Invalid read of size 4
==6150== at 0x400DC2: mySPMDMain (psrs.c:169)
==6150== by 0x401266: main (psrs.c:64)
==6150== Address 0x911a89c is 4 bytes before a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150== Invalid read of size 4
==6150== at 0x400DBE: mySPMDMain (psrs.c:169)
==6150== by 0x401266: main (psrs.c:64)
==6150== Address 0x911a8bc is 0 bytes after a block of size 28 alloc'd
==6150== at 0x4C29975: calloc (vg_replace_malloc.c:711)
==6150== by 0x401188: main (psrs.c:48)
==6150==
==6150==
==6150== HEAP SUMMARY:
==6150== in use at exit: 0 bytes in 0 blocks
==6150== total heap usage: 174 allocs, 174 frees, 320,014,664 bytes allocated
==6150==
==6150== All heap blocks were freed -- no leaks are possible
==6150==
==6150== For counts of detected and suppressed errors, rerun with: -v
==6150== ERROR SUMMARY: 16 errors from 4 contexts (suppressed: 0 from 0)
第169行:localTCB -> passLength[i] = localTCB -> sampleIndex[i] - localTCB -> sampleIndex[i-1];
第63行:myTCB[0].pid = 0;
第48行:myTCB[i].sampleIndex = calloc((NUM_THREADS-1), sizeof(int));
在这次测试中,NUM_THREADS 为 8。如果我理解正确,myTCB[i].sampleIndex = calloc((NUM_THREADS-1), sizeof(int)); 应该分配 4 * (8-1) = 28 字节的内存;我不明白位于这 28 字节块之前的 4 字节无效读取是从哪里来的。
答案 0 :(得分:2)
您的第169行位于 for 循环中,i 从 0 开始向上迭代,但您访问的是元素 i-1。在第一次迭代中,该访问越界。
答案 1 :(得分:1)
@Honza是正确的,请看看这些行:
/* Loop quoted from the question, kept as posted to show the defect. */
for (int i = 0; i<localTCB->num_threads; ++i)
{
if (i == 0) localTCB->passLength[i] = localTCB->sampleIndex[i];
/* No else: this statement runs on every iteration. For i == 0 it reads
   sampleIndex[-1]; for i == num_threads-1 it reads sampleIndex[num_threads-1],
   one element past the num_threads-1 ints allocated for sampleIndex. */
localTCB->passLength[i] = localTCB->sampleIndex[i] - localTCB->sampleIndex[i-1];
/* Overwrites the value stored just above for the last partition. */
if (i == (localTCB->num_threads)-1) localTCB->passLength[i] = localTCB->ChunkSize - localTCB->sampleIndex[i-1];
}
这里的分支写法不安全。当 i == 0 时,if (i == 0) 条件下的语句固然会执行,但循环体中间的那一行在每次迭代中都会无条件执行!于是 localTCB->sampleIndex[i-1] 会读取 localTCB->sampleIndex[-1],也就是 sampleIndex 所在内存块之前的那个 int(4 字节)。由于这块内存的值恰好也是 0(未使用或填充),程序的结果仍然正确,但这显然是以“错误的方式”得到了正确的结果。valgrind 的检查很严格,因此会报错。当 i == (localTCB->num_threads)-1 时,中间那一行同样会越界读取 sampleIndex[i](内存块末尾之后的位置),得到错误的值(0),不过最终结果会被最后一个条件纠正。也就是说,这些分支既不正确,也不高效。怎么改?很简单,把每个条件都明确限定即可:
/* Length of each partition of the local chunk. sampleIndex holds only
   num_threads-1 entries: the first partition starts at 0 and the last one
   ends at ChunkSize, so sampleIndex is never read at -1 or at num_threads-1.
   This also covers num_threads == 1, where sampleIndex has no entries. */
for (int i = 0; i<localTCB->num_threads; ++i)
{
    int begin = (i == 0) ? 0 : localTCB->sampleIndex[i-1];
    int end = (i == (localTCB->num_threads)-1) ? localTCB->ChunkSize : localTCB->sampleIndex[i];
    localTCB->passLength[i] = end - begin;
}