我正在编写RDMA(InfiniBand)内核模块。
到目前为止,我已成功创建保护域,发送和接收队列的完成队列。
但是每当我尝试通过调用ib_create_qp来创建队列对时,它都无法创建队列对。我写的代码如下所示:
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"
/* Not referenced by any code visible in this listing. */
struct workqueue_struct *myClient_workqueue;
/* Not referenced by any code visible in this listing. */
struct ib_sa_client myClient_sa_client;
/*
 * Forward declarations, currently disabled (both functions are defined
 * before their first use further down):
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/
/* Protection domain; assigned from ib_alloc_pd() in myClient_add_one(). */
struct ib_pd *mypd;
/* Receive completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myrcvcq;
/* Send completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myClientsendcq;
/* Queue pair; assigned from ib_create_qp() in myClient_add_one(). */
struct ib_qp *myClientqp;
/*
 * Completion callback for the receive CQ; handed to ib_create_cq() as its
 * comp_handler argument in myClient_add_one().
 *
 * @cq:         completion queue the event was raised on
 * @cq_context: the cq_context value that was given to ib_create_cq()
 *
 * The second parameter is needed so that the function has the
 * ib_comp_handler type, void (*)(struct ib_cq *, void *), which is what
 * ib_create_cq() expects and how the IB core invokes the callback.
 * The callback only logs; it neither polls nor re-arms the CQ.
 *
 * NOTE(review): if myClient.h carries a prototype for this function, it has
 * to be updated to the two-argument form as well.
 */
void myClient_ib_recvcompletion(struct ib_cq *cq, void *cq_context)
{
	printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Completion callback for the send CQ; handed to ib_create_cq() as its
 * comp_handler argument in myClient_add_one().
 *
 * @cq:         completion queue the event was raised on
 * @cq_context: the cq_context value that was given to ib_create_cq()
 *
 * The second parameter is needed so that the function has the
 * ib_comp_handler type, void (*)(struct ib_cq *, void *), which is what
 * ib_create_cq() expects and how the IB core invokes the callback.
 * The callback only logs; it neither polls nor re-arms the CQ.
 *
 * NOTE(review): if myClient.h carries a prototype for this function, it has
 * to be updated to the two-argument form as well.
 */
void myClient_ib_sendcompletion(struct ib_cq *cq, void *cq_context)
{
	printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Handler for asynchronous events affiliated with a queue pair.
 *
 * @myqpAsyncEvent: the event being reported (not inspected)
 * @anyPointer:     opaque context pointer (not used)
 *
 * The prototype has the shape of the event_handler member of
 * struct ib_qp_init_attr.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
/* Only records that the handler ran. */
printk(KERN_INFO "Dummy affiliated asynchronous event occured function called \n");
}
static void myClient_add_one(struct ib_device *device)
{
union ib_gid tmp_gid;
int ret;
int hcaport = 1;
int result = -ENOMEM;
u16 port1Pkey;
struct ib_port_attr attr;
ret = ib_query_port(device,hcaport,&attr);
printk("ib query port result %d \n", ret);
// Creating the Protection Domain for RDMA
mypd = ib_alloc_pd(device);
if(IS_ERR(mypd)){
printk(KERN_INFO "Failed to allocate PD\n");
return;
}
else{
printk(KERN_INFO "1Successfully allocated the PD\n");
pdset = true;
}
// Creating the receive completion queue for RDMA
myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
if(IS_ERR(myrcvcq)){
pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
}
// Creating the send completion queue for RDMA
myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
if(IS_ERR(myClientsendcq)){
pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
}
// Creating the queue pair
// Creating the queue pair
struct ib_qp_init_attr init_qpattr;
memset(&init_qpattr,0,sizeof(init_qpattr));
init_qpattr.event_handler = myClient_qp_event_handler;
init_qpattr.cap.max_send_wr = 2;
init_qpattr.cap.max_recv_wr = 2;
init_qpattr.cap.max_recv_sge = 1;
init_qpattr.cap.max_send_sge = 1;
init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_qpattr.qp_type = IB_QPT_UD;
init_qpattr.send_cq = myClientsendcq;
init_qpattr.recv_cq = myrcvcq;
myClientqp = ib_create_qp(mypd,&init_qpattr);
if(IS_ERR(myClientqp)){
pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk(KERN_INFO "1The queue pair is successfully created \n");
qpcreated = true;
}
}
static void myClient_remove_one(struct ib_device *device)
{
}
/* Client descriptor that myRDMAclient_init() passes to ib_register_client(). */
static struct ib_client my_client = {
	.name   = "myRDMAclient",
	.add    = myClient_add_one,
	.remove = myClient_remove_one,
};
/*
 * Module entry point: registers my_client with the IB core.
 *
 * Returns 0 on success, otherwise the error code reported by
 * ib_register_client(). A failure is logged instead of being returned
 * silently, and the success message is logged at KERN_INFO.
 *
 * NOTE(review): no module_exit() handler or MODULE_LICENSE() is visible in
 * this listing - confirm whether they live elsewhere.
 */
static int __init myRDMAclient_init(void)
{
	int ret;

	ret = ib_register_client(&my_client);
	if (ret) {
		printk(KERN_ERR "Failed to register IB client (error %d)\n", ret);
		return ret;
	}

	printk(KERN_INFO "Successfully registered myRDMAclient module \n");
	return 0;
}
module_init(myRDMAclient_init);
上面代码中的其他调用都能成功,但 ib_create_qp(mypd,&init_qpattr);
无法创建队列对。
更新:在创建队列对之前注册内存。但仍然显示ib_create_qp无效参数错误(错误代码-22)
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"
/* Not referenced by any code visible in this listing. */
struct workqueue_struct *myClient_workqueue;
/* Not referenced by any code visible in this listing. */
struct ib_sa_client myClient_sa_client;
/*
 * Forward declarations, currently disabled (both functions are defined
 * before their first use further down):
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/
/* Protection domain; assigned from ib_alloc_pd() in myClient_add_one(). */
struct ib_pd *mypd;
/* Receive completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myrcvcq;
/* Send completion queue; assigned from ib_create_cq() in myClient_add_one(). */
struct ib_cq *myClientsendcq;
/* Queue pair; assigned from ib_create_qp() in myClient_add_one(). */
struct ib_qp *myClientqp;
/* Memory region; assigned from ib_get_dma_mr() in myClient_add_one(). */
struct ib_mr *mymr;
/*
 * Completion callback for the receive CQ; handed to ib_create_cq() as its
 * comp_handler argument in myClient_add_one().
 *
 * @cq:         completion queue the event was raised on
 * @cq_context: the cq_context value that was given to ib_create_cq()
 *
 * The second parameter is needed so that the function has the
 * ib_comp_handler type, void (*)(struct ib_cq *, void *), which is what
 * ib_create_cq() expects and how the IB core invokes the callback.
 * The callback only logs; it neither polls nor re-arms the CQ.
 *
 * NOTE(review): if myClient.h carries a prototype for this function, it has
 * to be updated to the two-argument form as well.
 */
void myClient_ib_recvcompletion(struct ib_cq *cq, void *cq_context)
{
	printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Completion callback for the send CQ; handed to ib_create_cq() as its
 * comp_handler argument in myClient_add_one().
 *
 * @cq:         completion queue the event was raised on
 * @cq_context: the cq_context value that was given to ib_create_cq()
 *
 * The second parameter is needed so that the function has the
 * ib_comp_handler type, void (*)(struct ib_cq *, void *), which is what
 * ib_create_cq() expects and how the IB core invokes the callback.
 * The callback only logs; it neither polls nor re-arms the CQ.
 *
 * NOTE(review): if myClient.h carries a prototype for this function, it has
 * to be updated to the two-argument form as well.
 */
void myClient_ib_sendcompletion(struct ib_cq *cq, void *cq_context)
{
	printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Handler for asynchronous events affiliated with a queue pair.
 *
 * @myqpAsyncEvent: the event being reported (not inspected)
 * @anyPointer:     opaque context pointer (not used)
 *
 * The prototype has the shape of the event_handler member of
 * struct ib_qp_init_attr.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
/* Only records that the handler ran. */
printk(KERN_INFO "Dummy affiliated asynchronous event occured function called \n");
}
static void myClient_add_one(struct ib_device *device)
{
union ib_gid tmp_gid;
int ret;
int hcaport = 1;
int result = -ENOMEM;
u16 port1Pkey;
struct ib_port_attr attr;
ret = ib_query_port(device,hcaport,&attr);
printk("ib query port result %d \n", ret);
// Creating the Protection Domain for RDMA
mypd = ib_alloc_pd(device);
if(IS_ERR(mypd)){
printk(KERN_INFO "Failed to allocate PD\n");
return;
}
else{
printk(KERN_INFO "1Successfully allocated the PD\n");
pdset = true;
}
// Registering Memory
mymr = ib_get_dma_mr(mypd,IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
if(IS_ERR(mymr)){
printk("failed to register memory :( %d \n",PTR_ERR(mymr));
}else{
printk(KERN_INFO "Successfully registered memory region :) \n");
}
// End Registering Memory
// Creating the receive completion queue for RDMA
myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
if(IS_ERR(myrcvcq)){
pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
}
// Creating the send completion queue for RDMA
myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
if(IS_ERR(myClientsendcq)){
pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
}
// Creating the queue pair
// Creating the queue pair
struct ib_qp_init_attr init_qpattr;
memset(&init_qpattr,0,sizeof(init_qpattr));
init_qpattr.event_handler = myClient_qp_event_handler;
init_qpattr.cap.max_send_wr = 2;
init_qpattr.cap.max_recv_wr = 2;
init_qpattr.cap.max_recv_sge = 1;
init_qpattr.cap.max_send_sge = 1;
init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
init_qpattr.qp_type = IB_QPT_UD;
init_qpattr.send_cq = myClientsendcq;
init_qpattr.recv_cq = myrcvcq;
myClientqp = ib_create_qp(mypd,&init_qpattr);
if(IS_ERR(myClientqp)){
pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
//printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
}
else{
printk(KERN_INFO "1The queue pair is successfully created \n");
qpcreated = true;
}
}
static void myClient_remove_one(struct ib_device *device)
{
}
/* Client descriptor that myRDMAclient_init() passes to ib_register_client(). */
static struct ib_client my_client = {
	.name   = "myRDMAclient",
	.add    = myClient_add_one,
	.remove = myClient_remove_one,
};
/*
 * Module entry point: registers my_client with the IB core.
 *
 * Returns 0 on success, otherwise the error code reported by
 * ib_register_client(). A failure is logged instead of being returned
 * silently, and the success message is logged at KERN_INFO.
 *
 * NOTE(review): no module_exit() handler or MODULE_LICENSE() is visible in
 * this listing - confirm whether they live elsewhere.
 */
static int __init myRDMAclient_init(void)
{
	int ret;

	ret = ib_register_client(&my_client);
	if (ret) {
		printk(KERN_ERR "Failed to register IB client (error %d)\n", ret);
		return ret;
	}

	printk(KERN_INFO "Successfully registered myRDMAclient module \n");
	return 0;
}
module_init(myRDMAclient_init);
答案 0 :(得分:3)
**更新**:
根据下面评论中的讨论,我猜您是在发行版自带的内核之上安装了 Mellanox OFED 驱动程序。查看 Mellanox OFED 内核驱动程序 3.1-1.0.3 版的源代码,我发现他们通过添加一些字段改变了 struct ib_qp_init_attr
的布局。我相当确定您的问题在于:您是针对原始的 SLE 3.0.76-0.11 内核头文件构建模块的,因此在传给创建 QP 函数的 init_qpattr
结构中,您设置的那些值并没有落在正确的位置上。
我不知道您是如何安装这些新的树外(out-of-tree)驱动程序的,所以无法准确告诉您该如何正确构建模块,但您可以尝试在设置该结构的地方添加类似下面的语句:
    init_qpattr.qpg_type = 0;
(我知道您已经用 memset
把整个结构清零了,但这样做可以确认您构建时所用的头文件中,该结构确实带有新的 qpg_type
成员。我认为这是 OFED 新增的字段,您原始的内核头文件中没有它;因此只要您的模块能够编译通过,就说明您是针对正确的头文件构建的。)
旧答案:
我怀疑您遇到了 mlx4 驱动程序中与创建如此小的 QP(max_send_wr == max_recv_wr == 2
且 max_send_sge == max_recv_sge == 1
)相关的错误。我设法找到了您所使用的 3.0.76-0.11 内核的源代码,遗憾的是没有看到任何明显的错误。
您可以尝试以下几种方法来帮助调试:
- 加载 mlx4_core 模块时加上模块参数 debug_level=1,并把驱动程序初始化期间的全部输出(关于 “Max CQEs:” 等的一堆行)更新到您的问题中。mlx4 驱动程序中有相当多的逻辑依赖于固件(firmware)在初始化期间返回的参数,这些输出能让我们看到这些参数的取值。
- 尝试调整 QP 参数,例如把 max_send_sge 和 max_recv_sge 增加到 2,并把 max_send_wr 和 max_recv_wr 增加到 32 或 128。(可以单独增加,也可以组合增加。)
- 查明创建 QP 的操作究竟在哪里失败:是在 set_rq_size() 中失败,是执行到了 set_kernel_sq_size(),还是在别的地方失败?

答案 1 :(得分:-1)
我想您忘了注册内存区域(memory region)。在创建 QP 之前需要依次执行的操作是:分配保护域、注册内存区域、创建完成队列,
然后才创建 QP。
我不知道您使用的是什么设备和库,但在 Mellanox 的 IB 库中,写法如下:
// Buffer of REGION_SIZE bytes that the memory region will cover.
// Note: ibv_reg_mr() below is the user-space libibverbs call, not a kernel verb.
char mr_buffer[REGION_SIZE];
// mypd is the protection domain that was allocated earlier.
// The last argument (0) requests no additional access flags.
struct ibv_mr *mr = ibv_reg_mr(mypd , mr_buffer, REGION_SIZE, 0);
// A NULL result means the registration failed.
if (!mr) {
// Report the error here.
}