我正在为每秒产生大约1GB数据的设备编写驱动程序。因此我决定将应用程序直接分配的用户缓冲区映射为DMA而不是通过中间内核缓冲区进行复制。
代码基本上可以工作。但是在长期压力测试期间,我看到了内核oops,并伴随“Bad page state”(页面状态错误)报告;这些oops由不相关的应用程序(例如updatedb)触发,可能发生在内核想要换出某些页面的时候:
[21743.515404] BUG: Bad page state in process PmStabilityTest pfn:357518
[21743.521992] page:ffffdf844d5d4600 count:19792158 mapcount:0 mapping: (null) index:0x12b011e012d0132
[21743.531829] flags: 0x119012c01220124(referenced|lru|slab|reclaim|uncached|idle)
[21743.539138] raw: 0119012c01220124 0000000000000000 012b011e012d0132 012e011e011e0111
[21743.546899] raw: 0000000000000000 012101300131011c 0000000000000000 012101240123012b
[21743.554638] page dumped because: page still charged to cgroup
[21743.560383] page->mem_cgroup:012101240123012b
[21743.564745] bad because of flags: 0x120(lru|slab)
[21743.569555] BUG: Bad page state in process PmStabilityTest pfn:357519
[21743.576098] page:ffffdf844d5d4640 count:18219302 mapcount:18940179 mapping: (null) index:0x0
[21743.585318] flags: 0x0()
[21743.587859] raw: 0000000000000000 0000000000000000 0000000000000000 0116012601210112
[21743.595599] raw: 0000000000000000 011301310127012f 0000000000000000 012f011d010d011a
[21743.603336] page dumped because: page still charged to cgroup
[21743.609108] page->mem_cgroup:012f011d010d011a
...
Entering kdb (current=0xffff8948189b2d00, pid 6387) on processor 6 Oops: (null)
due to oops @ 0xffffffff9c87f469
CPU: 6 PID: 6387 Comm: updatedb.mlocat Tainted: G B OE 4.10.0-42-generic #46~16.04.1-Ubuntu
...
详细说明:
用户缓冲区由若干帧组成,缓冲区和帧都不是页面对齐的。缓冲区中的帧以循环方式用于“无限”的实时数据传输。对于每个帧,我先通过get_user_pages_fast获取(锁定)内存页面,然后用sg_alloc_table_from_pages将其转换为scatter-gather表,最后使用dma_map_sg映射到DMA。
我依靠sg_alloc_table_from_pages将连续的页面合并到一个DMA描述符中,以减小发送到设备的S/G表的大小。该设备是定制的,基于FPGA。我参考了许多做类似映射的驱动程序,特别是视频驱动程序i915和radeon,但没有哪个驱动把所有步骤都集中在一处,所以我可能忽略了一些东西。
相关函数(pin_user_buffer
和unpin_user_buffer
在不同的IOCTL上调用):
/*
 * pin_user_frame() - pin the user pages backing one frame and map them for DMA.
 * @cam:   device; supplies the frame size (acq_frame_bytes) and the PCI
 *         device used for the DMA mapping.
 * @frame: frame descriptor; frame->uaddr must already hold the user address
 *         of the frame (it does not have to be page aligned).
 *
 * On success frame->pages, frame->nr_pages and frame->sgt are filled in and
 * 0 is returned.  On failure every page pinned here is released, everything
 * allocated here is freed, @frame is left unmodified and a negative errno is
 * returned.
 */
static int pin_user_frame(struct my_dev *cam, struct udma_frame *frame)
{
	const unsigned long bytes = cam->acq_frame_bytes;
	const unsigned long first =
		(frame->uaddr & PAGE_MASK) >> PAGE_SHIFT;
	const unsigned long last =
		((frame->uaddr + bytes - 1) & PAGE_MASK) >> PAGE_SHIFT;
	/* Offset of the frame start inside its first page. */
	const unsigned long offset = frame->uaddr & ~PAGE_MASK;
	const int nr_pages = last - first + 1;
	int pinned;
	int err;
	int n;
	struct page **pages;
	struct sg_table *sgt;

	if (frame->uaddr + bytes < frame->uaddr) {
		pr_err("%s: attempted user buffer overflow!\n", __func__);
		return -EINVAL;
	}
	if (bytes == 0) {
		pr_err("%s: user buffer has zero bytes\n", __func__);
		return -EINVAL;
	}

	/* kcalloc() already zeroes the array, no __GFP_ZERO needed. */
	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		pr_err("%s: can't allocate udma_frame.pages\n", __func__);
		return -ENOMEM;
	}
	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
	if (!sgt) {
		pr_err("%s: can't allocate udma_frame.sgt\n", __func__);
		err = -ENOMEM;
		goto err_free_pages;
	}

	/*
	 * The device writes into this memory, so the pages are pinned for
	 * writing (third argument non-zero).
	 */
	pinned = get_user_pages_fast(frame->uaddr, nr_pages, 1, pages);
	if (pinned < nr_pages) {
		if (pinned > 0) {
			pr_err("%s: can't pin all %d user pages, got %d\n",
			       __func__, nr_pages, pinned);
			err = -EFAULT;
		} else {
			pr_err("%s: can't pin user pages\n", __func__);
			/*
			 * A return of 0 means "nothing pinned"; it must not
			 * be propagated as success.
			 */
			err = pinned ? pinned : -EFAULT;
		}
		goto err_put_pages;
	}

	/* NOTE(review): original author was unsure whether this is needed. */
	for (n = 0; n < nr_pages; ++n)
		flush_dcache_page(pages[n]);

	err = sg_alloc_table_from_pages(sgt, pages, nr_pages, offset, bytes,
					GFP_KERNEL);
	if (err) {
		pr_err("%s: can't build sg_table for %d pages\n",
		       __func__, nr_pages);
		goto err_put_pages;
	}

	/*
	 * NOTE(review): the number of entries actually mapped (the return
	 * value) is only tested against zero here; callers use sgt->nents as
	 * the descriptor count.  Confirm this holds when an IOMMU merges
	 * entries.
	 */
	if (!dma_map_sg(&cam->pci_dev->dev, sgt->sgl, sgt->nents,
			DMA_FROM_DEVICE)) {
		pr_err("%s: can't map %u sg_table entries for DMA\n",
		       __func__, sgt->nents);
		err = -ENOMEM;
		goto err_free_table;
	}

	frame->pages = pages;
	frame->nr_pages = nr_pages;
	frame->sgt = sgt;
	return 0;

err_free_table:
	sg_free_table(sgt);
err_put_pages:
	/* Drop only the references that were really taken. */
	for (n = 0; n < pinned; ++n)
		put_page(pages[n]);
	kfree(sgt);
err_free_pages:
	kfree(pages);
	return err;
}
/*
 * unpin_user_frame() - undo pin_user_frame() for one frame.
 * @cam:   device whose PCI device was used for the DMA mapping.
 * @frame: frame to release.
 *
 * Unmaps and frees the scatter-gather table, marks every pinned page dirty
 * and accessed, drops the page references and frees the page array.
 * The frame fields are reset afterwards, and a frame whose sgt/pages are
 * NULL (never pinned, or already unpinned) is handled without touching them.
 */
static void unpin_user_frame(struct my_dev *cam, struct udma_frame *frame)
{
	int n;

	if (frame->sgt) {
		/*
		 * Unmap with the entry count that was handed to dma_map_sg();
		 * orig_nents keeps that value.
		 */
		dma_unmap_sg(&cam->pci_dev->dev, frame->sgt->sgl,
			     frame->sgt->orig_nents, DMA_FROM_DEVICE);
		sg_free_table(frame->sgt);
		kfree(frame->sgt);
		frame->sgt = NULL;
	}

	if (frame->pages) {
		for (n = 0; n < frame->nr_pages; ++n) {
			struct page *page = frame->pages[n];

			/* The device wrote into the page behind the CPU's back. */
			set_page_dirty_lock(page);
			/* Per the original author, oopses are more frequent without this. */
			mark_page_accessed(page);
			put_page(page);
		}
		kfree(frame->pages);
		frame->pages = NULL;
	}
	frame->nr_pages = 0;
}
static void unpin_user_buffer(struct my_dev *cam)
{
if (cam->udma_frames) {
int n;
for (n = 0; n < cam->udma_frame_count; ++n)
unpin_user_frame(cam, &cam->udma_frames[n]);
kfree(cam->udma_frames);
cam->udma_frames = NULL;
}
cam->udma_frame_count = 0;
cam->udma_buffer_bytes = 0;
cam->udma_buffer = NULL;
cam->udma_desc_count = 0;
}
static int pin_user_buffer(struct my_dev *cam)
{
int err;
int n;
const u32 acq_frame_count = cam->acq_buffer_bytes / cam->acq_frame_bytes;
struct udma_frame *udma_frames;
u32 udma_desc_count = 0;
if (!cam->acq_buffer) {
pr_err("%s: user buffer is NULL!\n", __func__);
return -EFAULT;
}
if (cam->udma_buffer == cam->acq_buffer
&& cam->udma_buffer_bytes == cam->acq_buffer_bytes
&& cam->udma_frame_count == acq_frame_count)
return 0;
if (cam->udma_buffer)
unpin_user_buffer(cam);
udma_frames = kcalloc(acq_frame_count, sizeof(*udma_frames),
GFP_KERNEL | __GFP_ZERO);
if (!udma_frames) {
pr_err("%s: can't allocate udma_frame array for %u frames\n",
__func__, acq_frame_count);
return -ENOMEM;
}
for (n = 0; n < acq_frame_count; ++n) {
struct udma_frame *frame = &udma_frames[n];
frame->uaddr =
(unsigned long)(cam->acq_buffer + n * cam->acq_frame_bytes);
err = pin_user_frame(cam, frame);
if (err) {
pr_err("%s: can't pin frame %d (out of %u)\n",
__func__, n + 1, acq_frame_count);
for (--n; n >= 0; --n)
unpin_user_frame(cam, frame);
kfree(udma_frames);
return err;
}
udma_desc_count += frame->sgt->nents; /* Cannot overflow */
}
pr_debug("%s: total udma_desc_count=%u\n", __func__, udma_desc_count);
cam->udma_buffer = cam->acq_buffer;
cam->udma_buffer_bytes = cam->acq_buffer_bytes;
cam->udma_frame_count = acq_frame_count;
cam->udma_frames = udma_frames;
cam->udma_desc_count = udma_desc_count;
return 0;
}
相关结构:
/*
 * Bookkeeping for one frame of the user buffer while it is pinned for DMA.
 * uaddr is set by pin_user_buffer(); the remaining fields are filled in by
 * pin_user_frame() and cleared again by unpin_user_frame().
 */
struct udma_frame {
unsigned long uaddr; /* User address of the frame (need not be page aligned) */
int nr_pages; /* Nr. of pinned pages covering the frame */
struct page **pages; /* Array of nr_pages pinned pages covering the frame */
struct sg_table *sgt; /* S/G table built from pages and mapped for DMA */
};
/*
 * Device state (excerpt; the "..." lines stand for members not shown here).
 * The udma_* members describe the currently pinned user buffer and are
 * written by pin_user_buffer() and reset by unpin_user_buffer().
 */
struct my_dev {
...
u8 __user *acq_buffer; /* User-space buffer received via IOCTL */
...
u8 __user *udma_buffer; /* User-space buffer currently pinned (copy of acq_buffer) */
u32 udma_buffer_bytes; /* Size in bytes of the pinned buffer */
u32 udma_frame_count; /* Nr. of items in udma_frames */
struct udma_frame
*udma_frames; /* Per-frame pin/DMA state, one item per frame */
u32 udma_desc_count; /* Total nr. of DMA descriptors (sum of sgt->nents) */
...
};
问题:
1. 应该使用DMA_FROM_DEVICE,还是为了以防万一而使用DMA_BIDIRECTIONAL?
2. 是否需要SetPageReserved / ClearPageReserved 或mark_page_reserved / free_reserved_page 之类的操作?
3. sg_alloc_table_from_pages的用法是否存在问题?
4. set_page_dirty、set_page_dirty_lock和SetPageDirty之间的区别是什么?
感谢任何提示。
PS:我无法在不破坏我们多年来维护的库API的前提下,改变应用程序获取数据的方式。所以请不要建议例如用mmap映射内核缓冲区之类的方案...