先简单介绍一下问题:
发帖之前我已经在 Google 和 Stack Overflow 上搜索过,但找到的大多数内容都讲得不够清楚。
我有一块基于 Cortex-A8 的开发板,上面运行着裸机 RTOS。显示(帧缓冲)有点慢,因为我还没有为目标平台实现 DMA;虽然不算太慢,但我注意到还有改进的空间。在我的 CPU 和工具链组合下,32 位运算和数据访问比 16 位访问更快,而显示格式是 16 位 RGB565,因此一些帧缓冲操作比它们应有的速度慢一些(其中一些使用了 memcpy、memmove 和 memset,这些函数会自行处理数据对齐等问题)。
我尝试把两个像素塞进一个 32 位数据类型,并用它来访问内存(据我所记得,地址是对齐的;即使没有对齐,我的 CPU 也在硬件上支持非对齐内存访问,所以问题不应该出在这里)。请注意,我问的不是实现的速度,而是我遇到的一个奇怪现象,我怀疑它与我把两个像素塞进一个 32 位数据类型的方式有关。
下面是我的 fb_putc 的大部分内容:
/*
 * Body of fb_putc for printable ASCII (codes 33..126): renders one 8x16
 * glyph from fontdata into the RGB565 framebuffer at the text cursor, then
 * advances the cursor by one column.
 *
 * Each framebuffer pixel is 16 bits; two pixels are combined into a single
 * 32-bit store so that every glyph row is written with four word accesses.
 */
if (((unsigned char)c > 32) && ((unsigned char)c < 127)) {
    check_for_scroll(49);

    /*
     * Per the original author's note, fontdata starts at ASCII 33 and each
     * glyph is 16 bytes (one byte per pixel row), so the glyph offset is
     * (c - 33) * 16.  The offset is kept in a wide local instead of being
     * shifted into c, because (126 - 33) << 4 does not fit in an 8-bit char;
     * c itself is left unmodified.
     */
    const unsigned glyph = ((unsigned)(unsigned char)c - 33u) << 4;

    /* [0] is used for set font bits, [1] for clear font bits. */
    const uint16_t pallete_16[2] = {fb.fg_color, fb.tg_color};

    /*
     * Start address: pixel offset of the cursor cell, divided by the number
     * of 16-bit pixels per 32-bit word.
     * NOTE(review): assumes the resulting pixel offset is even so the word
     * pointer lands exactly on the cell -- confirm for FONT_WIDTH/width.
     */
    uint32_t *pixel_32 = (uint32_t *) fb.config->base;
    pixel_32 += ( ((fb.cursor.y * (FONT_HEIGHT * fb.config->width)) + ((fb.cursor.x * (FONT_WIDTH))))
                  / ((sizeof(uint32_t))/(sizeof(uint16_t))) );

    for (uint32_t y = 0; y < 16; y++) {
        const unsigned row_bits = fontdata[glyph + y];

        /*
         * Walk bit pairs (7,6), (5,4), (3,2), (1,0).  The counter is signed
         * and the loop stops at x > 0: with an unsigned counter and a
         * "x >= 0" test the condition is always true, so the loop never
         * ended and shifted by out-of-range amounts.
         */
        for (int x = 7; x > 0; x -= 2) {
            /* Widen before shifting: uint16_t promotes to signed int. */
            const uint32_t hi = pallete_16[(row_bits & (1u << x)) ? 0 : 1];
            const uint32_t lo = pallete_16[(row_bits & (1u << (x - 1))) ? 0 : 1];

            /*
             * NOTE(review): the pixel for bit x goes into the upper half of
             * the word.  On a little-endian target the lower half-word is
             * stored at the lower address -- confirm this matches the
             * intended on-screen pixel order within each pair.
             */
            *pixel_32++ = (hi << 16) | lo;
        }

        // Skip to the same column on the next row: width (480) - font_width (8) pixels.
        pixel_32 += (472 / ((sizeof(uint32_t))/(sizeof(uint16_t))));
    }
    fb.cursor.x++;
}
我到底哪里做错了?希望能得到一些帮助。我刚接触编程不久,做这个只是出于爱好。
答案 0 :(得分:1)
先把 2 个像素合并、再写入内存的想法是正确的。这样可以更有效地利用 ARM 的写缓冲硬件,代码会运行得更快。我认为以这种形式混用 C 和汇编不会得到最好的结果。坚持使用纯汇编可以保证用上条件执行指令。此外,用数组存放调色板可能会导致编译器生成效率很低的代码。下面是用纯汇编更高效地实现它的一种方法。展开循环是个好主意。这段代码处理双色(每像素 1 位)字体数据的每一个字节。
@ Expand one byte of bitonal (1 bit per pixel) font data into eight 16-bit
@ pixels, written as four 32-bit stores of two pixels each.
@
@ Register usage
@ R0 = source data pointer (post-incremented by 1)
@ R1 = destination data pointer (post-incremented by 4 per store)
@ R2 = foreground color (loaded outside of loop)
@ R3 = background color (loaded outside of loop)
@ R4,R5 = temp registers
@ Assumes that the most significant short of each 32-bit word is on the left
@ NOTE(review): the ORRs below merge R2/R3 unshifted, so both colors are
@ presumably held zero-extended in the low 16 bits -- confirm at the caller.
@
@ MOV takes a destination and a single shifter operand, so the upper pixel
@ is produced with "mov r5, rN, LSL #16".
ldrb r4,[r0],#1 @ source bitonal image data
@ first pair of pixels
tst r4,#0x80
movne r5,r2,LSL #16 @ bit set: foreground into upper half
moveq r5,r3,LSL #16 @ bit clear: background into upper half
tst r4,#0x40
orrne r5,r5,r2 @ bit set: foreground into lower half
orreq r5,r5,r3 @ bit clear: background into lower half
str r5,[r1],#4
@ second pair of pixels
tst r4,#0x20
movne r5,r2,LSL #16
moveq r5,r3,LSL #16
tst r4,#0x10
orrne r5,r5,r2
orreq r5,r5,r3
str r5,[r1],#4
@ third pair of pixels
tst r4,#0x8
movne r5,r2,LSL #16
moveq r5,r3,LSL #16
tst r4,#0x4
orrne r5,r5,r2
orreq r5,r5,r3
str r5,[r1],#4
@ fourth pair of pixels
tst r4,#0x2
movne r5,r2,LSL #16
moveq r5,r3,LSL #16
tst r4,#0x1
orrne r5,r5,r2
orreq r5,r5,r3
str r5,[r1],#4
更新:改成了稍微简单一些的代码。
答案 1 :(得分:0)
在和编译器折腾了几个小时之后,我用内联汇编解决了一次存储两个像素的问题。但现在似乎还有别的问题:除了少数几个字符之外,其余字符都显示为乱码,我完全不知道是什么原因造成的……
至于像素打包,下面是我最终使用的代码(以备将来有人需要):
/*
 * Body of fb_putc for printable ASCII (codes 33..126): renders one 8x16
 * glyph from fontdata into the 16-bit framebuffer at the text cursor using
 * 32-bit stores of two packed pixels, then advances the cursor one column.
 */
if (((unsigned char)c > 32) && ((unsigned char)c < 127)) {
    check_for_scroll(FB_MAX_Y_UNDER);

    /* [0] is used for set font bits, [1] for clear font bits. */
    const uint16_t pallete[2] = { (fb.fg_color), (fb.tg_color) };

    /*
     * (y << 13) - (y << 9) == y * 7680, added to x * 8 and halved to turn a
     * 16-bit pixel offset into a 32-bit word offset.
     */
    uint32_t *pixel_32 = (uint32_t *)fb.base + ((((fb.cursor.y << 13) - (fb.cursor.y << 9)) + (fb.cursor.x << 3)) >> 1);

    /*
     * Glyph offset = (c - 32) * 16, one byte per glyph row.  It is computed
     * in a wide local rather than by shifting c in place, because
     * (126 - 32) << 4 does not fit in an 8-bit char; c is left unmodified.
     * NOTE(review): confirm that fontdata's first glyph is ASCII 32; if the
     * table starts at '!' the base must be 33, otherwise every character is
     * drawn with its neighbour's glyph.
     */
    const unsigned glyph = ((unsigned)(unsigned char)c - 32u) << 4;

    for (int y = 0; y < 16; y++) {
        const unsigned char font_bits = fontdata[glyph + y];

        /* Bit pairs (0x80,0x40), (0x20,0x10), (0x08,0x04), (0x02,0x01). */
        for (unsigned mask = 0x80; mask != 0; mask >>= 2) {
            const uint32_t hi = pallete[(font_bits & mask) ? 0 : 1];
            const uint32_t lo = pallete[(font_bits & (mask >> 1)) ? 0 : 1];
            uint32_t pixel_32_tmp;

            __asm__ volatile("mov %0, %1, lsl $16" : "=r" (pixel_32_tmp) : "r" (hi));
            /*
             * ORR reads %0 as well as writing it, so the operand must be
             * "+r".  With "=r" the compiler treats the old value as dead and
             * may place %0 in a different register than the MOV result,
             * losing the upper pixel.
             */
            __asm__ volatile("orr %0, %0, %1" : "+r" (pixel_32_tmp) : "r" (lo));

            *pixel_32++ = pixel_32_tmp;
        }

        /* Advance 236 words (472 pixels) to the same column on the next row. */
        pixel_32 += 236;
    }
    fb.cursor.x++;
}