Question

我试图用 libbpf 将一个 eBPF 对象加载到内核中，但没有成功，得到了标题中所述的错误（“R0 invalid mem access 'inv'”）。不过，先让我展示一下我的 BPF *_kern.c 有多简单。

/*
 * XDP program entry point, emitted into the "entry_point_prog" ELF section.
 *
 * ctx: XDP context for the packet being processed.
 *
 * Returns an XDP action code:
 *   - XDP_ABORTED if the lookup in globals_map fails;
 *   - XDP_PASS if some_inlined_func() returns 0 and the lookup in some_map
 *     finds nothing;
 *   - otherwise the initial value XDP_DROP (the elided "temp found" branch
 *     may set something else -- not visible here).
 */
SEC("entry_point_prog")
int entry_point(struct xdp_md *ctx)
{
    int act = XDP_DROP;            /* default verdict */
    int rc, i = 0;                 /* i is the key used for globals_map */
    struct global_vars *globals;
    struct ip_addr addr = {};
    struct some_key key = {};
    void *temp;

    /* Only checked for NULL in the code shown; not otherwise used here. */
    globals = bpf_map_lookup_elem(&globals_map, &i);
    if (!globals)
        return XDP_ABORTED;

    /* Fills in 'key' from the packet; rc selects the branch below. */
    rc = some_inlined_func(ctx, &key);

    /* Look the destination address from the key up in some_map. */
    addr = key.dst_ip;
    temp = bpf_map_lookup_elem(&some_map, &addr);

    switch(rc)
    {
    case 0:
        if(temp)
        {
            // no rocket science here ...
        } else
            act = XDP_PASS;        /* no map entry for this address */
        break;
    default:
        /* any other return code keeps the current verdict */
        break;
    }

    /*
     * Returning 'act' makes the result depend on rc, and therefore on
     * some_inlined_func(); returning a constant does not.
     */
    return act;  // this gives the error
    //return XDP_<whatever>;  // this works fine
}

更准确地说，libbpf错误日志如下：

105: (bf) r4 = r0
106: (07) r4 += 8
107: (b7) r8 = 1
108: (2d) if r4 > r3 goto pc+4
 R0=inv40 R1=inv0 R2=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3=pkt_end(id=0,off=0,imm=0) R4=inv48 R5=inv512 R6=inv1 R7=inv17 R8=inv1 R10=fp0,call_-1 fp-16=0 fp-32=0 fp-40=0
109: (69) r3 = *(u16 *)(r0 +2)
R0 invalid mem access 'inv'

我在这里真的看不出任何问题。我的意思是，代码如此简单，却还是无法通过。为什么不行呢？我遗漏了什么？要么是验证器疯了，要么是我做了什么非常愚蠢的事情。

Answer 1

好吧，经过 3 天——更准确地说是 3 × 8 小时 = 24 小时——的代码排查，我想我终于找到了这个恼人的问题。

问题一直出在 some_inlined_func() 中，与其说它困难，不如说它隐蔽。我在这里写下一个说明该问题的代码模板，希望其他人看到后不必像我一样头疼 24 小时；我为此吃尽了苦头，所以请集中注意力。

/*
 * Parse the Ethernet / IP / UDP headers of the packet described by ctx.
 *
 * This is a code template: the remaining parameters and some of the body
 * are elided.
 *
 * Returns:
 *   AN_ERROR_CODE - ctx is NULL, or a header would extend past data_end;
 *   A_RET_CODE_1  - the ethertype is neither ETH_P_IP nor ETH_P_IPV6;
 *   A_RET_CODE_2  - all headers were parsed.
 */
static __always_inline
int some_inlined_func(struct xdp_md *ctx, /* other non important args */)
{
    if (!ctx)
        return AN_ERROR_CODE;

    /* Packet bounds, taken from the XDP context. */
    void *data = (void *)(long)ctx->data;
    void *data_end = (void *)(long)ctx->data_end;

    struct ethhdr *eth;
    struct iphdr *ipv4_hdr = NULL;
    struct ipv6hdr *ipv6_hdr = NULL;
    struct udphdr *udph;
    uint16_t ethertype;

    /* Every header is bounds-checked against data_end before it is read. */
    eth = (struct ethhdr *)data;
    if (eth + 1 > data_end)
        return AN_ERROR_CODE;

    ethertype = __constant_ntohs(eth->h_proto);
    if (ethertype == ETH_P_IP)
    {
        ipv4_hdr = (void *)eth + ETH_HLEN;
        if (ipv4_hdr + 1 > data_end)
            return AN_ERROR_CODE;

        // stuff non related to the issue ...
    } else if (ethertype == ETH_P_IPV6)
    {
        ipv6_hdr = (void *)eth + ETH_HLEN;
        if (ipv6_hdr + 1 > data_end)
            return AN_ERROR_CODE;

        // stuff non related to the issue ...
    } else
        return A_RET_CODE_1;

    /*
     * Exactly one of ipv4_hdr / ipv6_hdr is non-NULL at this point, because
     * every other path has already returned.
     */
    /* here's the problem, but ... */
    udph = (ipv4_hdr) ? ((void *)ipv4_hdr + sizeof(*ipv4_hdr)) :
            ((void *)ipv6_hdr + sizeof(*ipv6_hdr));
    if (udph + 1 > data_end)
        return AN_ERROR_CODE;

    /* it actually breaks HERE, when dereferencing 'udph' */
    uint16_t dst_port = __constant_ntohs(udph->dest);

    // blablabla other stuff here unrelated to the problem ...

    return A_RET_CODE_2;
}

那么，为什么它会在这里出错？我认为这是因为验证器假定 ipv6_hdr 可能是 NULL，而这是完全错误的：如果执行能够到达这一点，只可能是因为 ipv4_hdr 或 ipv6_hdr 已经被设置（也就是说，如果既不是 IPv4 也不是 IPv6，执行在此之前就已经返回了）。因此，验证器显然无法推断出这一点。不过有一点值得注意：如果像下面这样把 ipv6_hdr 的有效性也显式检查一遍，验证器就能通过：

/*
 * Point udph just past whichever IP header was parsed, testing each
 * header pointer explicitly instead of using a single ternary.
 */
if (ipv4_hdr)
    udph = (void *)ipv4_hdr + sizeof(*ipv4_hdr);
else if (ipv6_hdr)
    udph = (void *)ipv6_hdr + sizeof(*ipv6_hdr);
else return A_RET_CODE_1;  // this is redundant

如果这样做，它也可以工作：

// "(ethertype == ETH_P_IP)" instead of "(ipv4_hdr)"
/* Branch on the parsed ethertype value; the non-IPv4 arm uses ipv6_hdr. */
udph = (ethertype == ETH_P_IP) ? ((void *)ipv4_hdr + sizeof(*ipv4_hdr)) :
        ((void *)ipv6_hdr + sizeof(*ipv6_hdr));

因此，在我看来，这里的验证器有些奇怪：它不够聪明（也许也不需要那么聪明？），无法意识到执行若能到达这一点，只可能是因为 ctx 指向的是 IPv4 或 IPv6 数据包。

这一切又如何解释验证器对 entry_point() 中 return act; 的抱怨呢？很简单，请耐心听我说。some_inlined_func() 不会修改 ctx，entry_point() 也没有使用它的其余参数。因此，在返回 act 的情况下，由于 act 取决于 some_inlined_func() 的结果，some_inlined_func() 的代码必须被生成并执行，验证器就会在那里报错。但是，在返回 XDP_<whatever> 的情况下，由于 switch-case 主体和 some_inlined_func() 都不会改变 entry_point() 程序/函数的内部状态，编译器（使用 O2）足够聪明，能够意识到为 some_inlined_func() 和整个 switch-case 生成汇编毫无意义（这正是这里的 O2 优化）。因此，总而言之，在返回 XDP_<whatever> 的情况下验证器之所以满意，是因为问题虽然确实出在 some_inlined_func() 中，但实际生成的 BPF 汇编里根本没有这部分代码，验证器也就无从检查 some_inlined_func()——它从一开始就不存在。这样说得通吗？

这样的 BPF“限制”是已知的吗？是否有任何文档说明此类已知限制？因为我没有找到。

加载 eBPF 对象文件时如何解决 “R0 invalid mem access 'inv'” 错误

1 个答案: