我尝试使用C语言中的clock()函数来测量一段简单代码的近似执行时间。我使用的是16MHz的流水线芯片,因此理想情况下吞吐量为每个时钟周期1条指令。
/*
 * Times a 128 x 128 nested accumulation loop with clock() and prints the
 * elapsed time in seconds together with the raw tick difference.
 *
 * clock() reports processor time in implementation-defined ticks; dividing a
 * tick difference by CLOCKS_PER_SEC converts it to seconds.  The tick rate
 * is not required to equal the CPU core frequency, so the tick count printed
 * here must not be read as a count of CPU cycles.
 *
 * The `void main()` signature is kept exactly as originally written.
 */
void main(){
    short i1, i2;
    /* Unsigned so the running total wraps with defined behaviour: the full
     * sum (2,146,304) does not fit in 16 bits, and overflowing a signed
     * accumulator is undefined where int is 16 bits wide. */
    unsigned short sum = 0;

    clock_t start = clock();
    /* NOTE: sum is never read after the loop, so an optimising build is
     * allowed to remove the loop entirely, which would make the measurement
     * meaningless.  Check the generated code when changing compiler flags. */
    for(i1 = 0; i1 < 128; i1++)
    {
        for(i2 = 0; i2 < 128; i2++)
        {
            sum += (unsigned short)((i1 + i2) + 4);
        }
    }
    clock_t end = clock();

    /* Tick difference as a double so it can be printed with %f. */
    double Num_Of_Clocks = (double)(end - start);
    /* CLOCKS_PER_SEC is the <time.h> macro that pairs with clock(); the
     * earlier CLOCK_PER_SECOND spelling is not defined by <time.h>. */
    double Execution_Time = Num_Of_Clocks / CLOCKS_PER_SEC;
    printf("Execution Time is : %2.10f seconds and Number of Clocks: %2.3f", Execution_Time, Num_Of_Clocks);
}
结果是:
Execution Time is : 0.0000043125 and Number of Clocks: 69.000
是否有可能在69个时钟周期内完成大约82000次(128 * 128 * 5 = 81920)操作?clock()函数工作正常吗?
汇编输出为:
; _main -- compiler output for the C main() in this file: call clock(), run the
; 128 x 128 accumulation loop, call clock() again, convert the difference to
; floating point, divide it, and pass both values to printf.
;
; Register roles visible in this listing:
;   r0      sum (later reused for call results)
;   r1      i1, outer loop index
;   r2      i2, inner loop index
;   r3      scratch for (i1 + i2) + 4
;   r4/r5   first clock() result; afterwards the tick difference as a float
;
; NOTE(review): the target ISA is not identified in this file. Mnemonic
; meanings below are inferred from the matching C source -- confirm against
; the ISA/ABI manual. ';' is assumed to be the assembler's comment character.
_main:
; Prologue: push r4, r5 and lr, then reserve 8 bytes of stack.
push (r4,r5,lr)
sub sp, #8
; start = clock(); the r0/r1 result is copied to r4/r5 so it survives the loop.
jsr _clock
mov r4, r0
mov r5, r1
; sum = 0 (r0), i1 = 0 (r1).
mov r0, #0
mov r1, #0
; Outer loop test: exit to .Llabel.3.3 once i1 >= 128.
.Llabel.3.1:
cmp ge, r1, #128
jpt .Llabel.3.3
; i2 = 0.
mov r2, #0
; Inner loop test: exit to .Llabel.3.6 once i2 >= 128.
; One inner iteration runs 8 instructions (cmp, jpt, mov, 4 x add, jmp), so the
; 128 * 128 iterations execute 131,072 instructions in the inner loop alone.
.Llabel.3.4:
cmp ge, r2, #128
jpt .Llabel.3.6
; sum += (i1 + i2) + 4, built up in r3.
mov r3, r1
add r3, r2
add r3, #4
add r0, r3
; i2++ and back to the inner test.
.Llabel.3.5:
add r2, #1
jmp .Llabel.3.4
; Inner loop done: i1++ and back to the outer test.
.Llabel.3.6:
.Llabel.3.2:
add r1, #1
jmp .Llabel.3.1
; Outer loop done: end = clock(). The call result overwrites r0/r1, so the
; value of sum is discarded here (the C source never reads it).
.Llabel.3.3:
jsr _clock
; end - start across the register pair: plain subtract on r1/r5, then sbc on
; r0/r4 (presumably subtract-with-borrow, i.e. r1/r5 are the low halves --
; TODO confirm).
sub r1, r5
sbc r0, r4
; Convert the integer difference to floating point with the runtime helper and
; keep the result in r4/r5. The helper names here and below look like the
; libgcc single-precision soft-float routines -- TODO confirm that this
; toolchain treats double as single precision.
jsr $___floatsisf
mov r4, r0
mov r5, r1
; Load the two-word divisor stored at ___stemp1 into r10/r11 by way of r2/r3
; (presumably the constant the C source divides by -- verify), and put the
; dividend back in r0/r1.
mov a13, #___stemp1
ldw r2, @[a13 + 0]
ldw r3, @[a13 + 2]
mov r10, r2
mov r11, r3
mov r0, r4
mov r1, r5
; Floating-point divide helper; the quotient is taken from r0/r1 below.
jsr $___divsf3
; printf arguments: quotient (elapsed seconds) at sp+2 / sp+4, tick difference
; at sp+6 / sp+8, format string address in a10.
; NOTE(review): the top slot is at sp+8 although only 8 bytes were reserved;
; presumably sp points one word below the first free slot -- confirm with the ABI.
ldw @[sp + 2], r0
ldw @[sp + 4], r1
ldw @[sp + 6], r4
ldw @[sp + 8], r5
mov a10, #___str_0
jsr _printf
; r0 = 0, then jump to the shared epilogue. The second jmp cannot be reached
; because it directly follows an unconditional jump to the same label.
mov r0, #0
jmp .Llabel.3.7
jmp .Llabel.3.7
; Epilogue: release the 8 reserved bytes, then pop r4, r5, lr and return.
.Llabel.3.7:
add sp, #8
pop_ret (r4,r5,lr)