How to receive Ethernet frames with ibverbs?

Date: 2018-10-30 11:55:09

Tags: c ethernet infiniband mellanox

I want to write a simple test program that receives Ethernet frames using the ibverbs API.

The code below compiles and runs, but never receives any packets. I am using Mellanox ConnectX-3 hardware on Ubuntu 18.

Questions:

  1. If I ping the InfiniBand interface from another machine while this RX program is running, the pings get responses. I did not expect that, because the ping requests should be captured by the RX program and never be seen by the Linux IP stack, so it should not respond to them. What should I do about this?

  2. Is there an obvious error in my code?

  3. Do I need a steering rule at all? If I remove the call to ibv_create_flow(), shouldn't I simply receive every packet the interface sees?

#include <errno.h>
#include <infiniband/verbs.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


#define PORT_NUM 1

#define MAX_MSG_SIZE 1500 // The maximum size of each received packet.
#define RQ_NUM_DESC 512 // Max packets that can be received without processing.

// The MAC of the interface we are listening on.
#define DEST_MAC { 0x00, 0x0d, 0x3a, 0x47, 0x1c, 0x2e }

#define FATAL_ERROR(msg, ...) { fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); exit(-1); }


int main() {
    // Get the list of devices.
    int num_devices = 0;
    struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
    if (!dev_list)
        FATAL_ERROR("Failed to get IB devices list.");

    // Choose the first device.
    struct ibv_device *ib_dev = dev_list[0];
    if (!ib_dev)
        FATAL_ERROR("IB device not found.");
    printf("Found %i Infiniband device(s).\n", num_devices);
    printf("Using device '%s'.\n", ibv_get_device_name(ib_dev));

    // Get the device context.
    struct ibv_context *context = ibv_open_device(ib_dev);
    if (!context)
        FATAL_ERROR("Couldn't get context for device.");

    // Allocate a protection domain (PD) that will group memory
    // regions (MR) and rings.
    struct ibv_pd *pd = ibv_alloc_pd(context);
    if (!pd)
        FATAL_ERROR("Couldn't allocate protection domain.");

    // Create Completion Queue (CQ).
    struct ibv_cq *cq = ibv_create_cq(context, RQ_NUM_DESC, NULL, NULL, 0);
    if (!cq)
        FATAL_ERROR("Couldn't create completion queue. errno = %d.", errno);

    // Create Queue Pair (QP).
    struct ibv_qp_init_attr qp_init_attr = {
        .qp_context = NULL,
        .send_cq = cq, // Report receive completion to CQ.
        .recv_cq = cq,

        .cap = {
            .max_send_wr = 0, // No send ring.
            .max_recv_wr = RQ_NUM_DESC, // Max num packets in ring.
            .max_recv_sge = 1, // Only one pointer per descriptor.
         },
        .qp_type = IBV_QPT_RAW_PACKET, // Use Ethernet packets.
    };
    struct ibv_qp *qp = ibv_create_qp(pd, &qp_init_attr);
    if (!qp)
        FATAL_ERROR("Couldn't create queue pair.");

    // Initialize the QP (receive ring) and assign a port.
    struct ibv_qp_attr qp_attr = { 0 };
    qp_attr.qp_state = IBV_QPS_INIT;
    qp_attr.port_num = PORT_NUM;
    int qp_flags = IBV_QP_STATE | IBV_QP_PORT;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) != 0)
        FATAL_ERROR("Failed to initialize queue pair.");

    // Move ring state to ready-to-receive. This is needed in
    // order to be able to receive packets.
    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_flags = IBV_QP_STATE;
    qp_attr.qp_state = IBV_QPS_RTR;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) != 0)
        FATAL_ERROR("Failed to put queue pair into ready-to-receive state.");

    // Allocate memory for packet buffer.
    int buf_size = MAX_MSG_SIZE * RQ_NUM_DESC; // Maximum size of data to be accessed by hardware.
    void *buf = malloc(buf_size);
    if (!buf)
        FATAL_ERROR("Couldn't allocate memory.");

    // Register the user memory so it can be accessed by the HW directly.
    struct ibv_mr *mr = ibv_reg_mr(pd, buf, buf_size, IBV_ACCESS_LOCAL_WRITE);
    if (!mr)
        FATAL_ERROR("Couldn't register memory region.");

    // Create a scatter/gather entry.
    struct ibv_sge sg_entry;
    sg_entry.length = MAX_MSG_SIZE;
    sg_entry.lkey = mr->lkey;

    // Create a receive work request.
    struct ibv_recv_wr wr;
    wr.num_sge = 1;
    wr.sg_list = &sg_entry;
    wr.next = NULL;

    // Post a load of receive work requests onto the receive queue.
    struct ibv_recv_wr *bad_wr;
    for (int n = 0; n < RQ_NUM_DESC; n++) {
        // Each descriptor points to max MTU size buffer.
        sg_entry.addr = (uint64_t)buf + MAX_MSG_SIZE * n;

        // When a packet is received, a work completion will be created
        // corresponding to this work request. It will contain this field.
        wr.wr_id = n;

        // Post the receive buffer to the ring.
        int rv = ibv_post_recv(qp, &wr, &bad_wr);
        if (rv != 0) {
            FATAL_ERROR("Posting recv failed with error code %i.", rv);
        }
    }

    // Create steering rule.
    struct raw_eth_flow_attr {
        struct ibv_flow_attr attr;
        struct ibv_flow_spec_eth spec_eth;
    } __attribute__((packed)) flow_attr = {
        .attr = {
            .comp_mask = 0,
            .type = IBV_FLOW_ATTR_NORMAL,
            .size = sizeof(flow_attr),
            .priority = 0,
            .num_of_specs = 1,
            .port = PORT_NUM,
            .flags = 0,
        },
        .spec_eth = {
            .type = IBV_FLOW_SPEC_ETH,
            .size = sizeof(struct ibv_flow_spec_eth),
            .val = {
                .dst_mac = DEST_MAC,
                .src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
                .ether_type = 0,
                .vlan_tag = 0,
            },
            .mask = {
                .dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .src_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .ether_type = 0,
                .vlan_tag = 0,
            }
        }
    };

    // Register steering rule to intercept packet to DEST_MAC and place packet in
    // ring pointed by qp.
    struct ibv_flow *eth_flow = ibv_create_flow(qp, &flow_attr.attr);
    if (!eth_flow)
        FATAL_ERROR("Couldn't attach steering flow. Does DEST_MAC match that of the local NIC?");

    printf("Receiving.\n");
    while (1) {
        // Wait for CQ event upon message received, and print a message
        struct ibv_wc wc;
        int msgs_completed = ibv_poll_cq(cq, 1, &wc);
        if (msgs_completed > 0) {
            printf("Message %ld received size %d\n", wc.wr_id, wc.byte_len);
            sg_entry.addr = (uint64_t)buf + wc.wr_id * MAX_MSG_SIZE;
            wr.wr_id = wc.wr_id;

            // After processed need to post back the buffer.
            int rv = ibv_post_recv(qp, &wr, &bad_wr);
            if (rv != 0) {
                FATAL_ERROR("Re-posting recv failed with error code %i.", rv);
            }
        }
        else if (msgs_completed < 0) {
            FATAL_ERROR("Polling error.");
        }
    }
}

2 Answers:

Answer 0 (score: 1)

Take a look at the following example from Mellanox: https://community.mellanox.com/s/article/raw-ethernet-programming--basic-introduction---code-example

To receive everything the interface sees, you can use the experimental API (#include <infiniband/verbs_exp.h>) and then, when creating the steering rule, use ibv_exp_flow_attr with the type set to IBV_EXP_FLOW_ATTR_SNIFFER.
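For reference, here is a minimal sketch of such a sniffer rule, assuming the experimental (MLNX_OFED) verbs headers are installed; attach_sniffer is just an illustrative helper name, and the exact ibv_exp_flow_attr fields may differ slightly between MLNX_OFED releases:

// Hedged sketch: attach a sniffer flow so the QP receives every packet the
// port sees. Requires the experimental (MLNX_OFED) verbs headers; the fields
// used here are assumed to mirror the regular ibv_flow_attr layout.
#include <infiniband/verbs_exp.h>

static struct ibv_exp_flow *attach_sniffer(struct ibv_qp *qp, uint8_t port_num)
{
    struct ibv_exp_flow_attr attr = {
        .type = IBV_EXP_FLOW_ATTR_SNIFFER, // Deliver all traffic, no matching.
        .size = sizeof(attr),
        .num_of_specs = 0,                 // No flow_spec entries follow.
        .port = port_num,
    };
    return ibv_exp_create_flow(qp, &attr);
}

The returned flow would later be released with the experimental counterpart of ibv_destroy_flow().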

Answer 1 (score: 0)

See https://github.com/Mellanox/libvma/wiki/Architecture. VMA is built on the native RDMA verbs API. The native RDMA verbs have been extended to NICs that support Ethernet RDMA, which allows packets to pass directly between the user application and the InfiniBand HCA or Ethernet NIC, bypassing the kernel and its TCP/UDP networking stack.
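In practice VMA is preloaded into an unmodified sockets application (typically with something like LD_PRELOAD=libvma.so), so a plain receiver such as the sketch below would take the kernel-bypass path without any verbs code. This is only an illustrative example; the port number is arbitrary.

// Minimal UDP receiver sketch; run it unchanged, then again with VMA
// preloaded (assumption: libvma.so is installed on the system).
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void) {
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    // Bind to an arbitrary example port on all local addresses.
    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(12345);
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) != 0) {
        perror("bind");
        return 1;
    }

    // Receive one datagram and report its size.
    char buf[1500];
    ssize_t n = recv(fd, buf, sizeof(buf), 0);
    printf("Received %zd bytes.\n", n);
    close(fd);
    return 0;
}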