解析固件时,是什么导致此内核出现故障?

时间:2020-09-04 14:20:34

标签: c linux linux-kernel linux-device-driver

我正在调整一个内核模块,该模块注册一组struct software_node,用于把摄像头传感器连接到英特尔的ipu3基础架构。注册完成后,模块会触发对cio2设备的重新探测(reprobe)。在重新探测期间,它将解析固件以建立我需要的连接。在固件解析期间,当内核尝试调用fwnode.ops的成员时,我遇到了一个内核oops,提示我试图执行位于NX(不可执行)内存中的地址。
[  639.079642] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[  639.079652] BUG: unable to handle page fault for address: ffffffffc11860e0
[  639.079658] #PF: supervisor instruction fetch in kernel mode
[  639.079665] #PF: error_code(0x0011) - permissions violation
[  639.079671] PGD 140a0f067 P4D 140a0f067 PUD 140a11067 PMD 13e66a067 PTE 8000000027838061
[  639.079686] Oops: 0011 [#1] SMP PTI
[  639.079696] CPU: 1 PID: 5259 Comm: insmod Tainted: G       A C OE     5.8.0-rc7-debug #83
[  639.079703] Hardware name: LENOVO 80U1/INVALID, BIOS 2KCN25WW 10/26/2016
[  639.079719] RIP: 0010:surface_camera_exit+0x790/0x6b0 [surface_camera]
[  639.079728] Code: 66 72 65 71 75 65 6e 63 79 00 62 75 73 2d 74 79 70 65 00 63 6c 6f 63 6b 2d 6c 61 6e 65 73 00 64 61 74 61 2d 6c 61 6e 65 73 00 <70> 6f 72 74 30 00 65 6e 64 70 6f 69 6e 74 30 00 01 36 46 6f 75 6e
[  639.079736] RSP: 0018:ffffa730c1fdb8b8 EFLAGS: 00010282
[  639.079744] RAX: ffff9228f877b1d8 RBX: ffff9228f877b1d8 RCX: 0000000000000027
[  639.079750] RDX: ffffffffc11860e0 RSI: ffff9229eec98cd0 RDI: ffff9228f877b1d8
[  639.079756] RBP: ffffa730c1fdb8c0 R08: 0000000000000752 R09: 0000000000000004
[  639.079763] R10: 000000000000000a R11: 0000000000000001 R12: ffff9228f877b190
[  639.079769] R13: ffff9228f877b208 R14: ffff9228f877ba08 R15: ffff9228f877b708
[  639.079777] FS:  00007f4c54802540(0000) GS:ffff9229eec80000(0000) knlGS:0000000000000000
[  639.079784] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  639.079791] CR2: ffffffffc11860e0 CR3: 000000003f7f2003 CR4: 00000000003606e0
[  639.079797] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  639.079803] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  639.079809] Call Trace:
[  639.079822]  ? fwnode_handle_get+0x29/0x30
[  639.079834]  software_node_get_next_child+0x99/0x1f0
[  639.079844]  software_node_graph_get_next_endpoint+0xb8/0x140
[  639.079854]  fwnode_graph_get_endpoint_by_id+0x80/0x1b0
[  639.079870]  cio2_pci_probe+0x64c/0xb55 [ipu3_cio2]
[  639.079890]  local_pci_probe+0x47/0xa0
[  639.079900]  ? local_pci_probe+0x47/0xa0
[  639.079913]  pci_device_probe+0x10b/0x1b0
[  639.079924]  really_probe+0xf5/0x440
[  639.079934]  driver_probe_device+0xe8/0x150
[  639.079943]  __device_attach_driver+0x7b/0xe0
[  639.079952]  ? driver_allows_async_probing+0x60/0x60
[  639.079963]  bus_for_each_drv+0x6e/0xb0
[  639.079972]  __device_attach+0xd8/0x160
[  639.079984]  device_attach+0x10/0x20
[  639.079991]  bus_rescan_devices_helper+0x3a/0x80
[  639.079998]  device_reprobe+0x23/0x30
[  639.080010]  surface_camera_init+0x1bb/0x250 [surface_camera]
[  639.080022]  ? surface_camera_unregister_nodes+0x110/0x110 [surface_camera]
[  639.080034]  do_one_initcall+0x4a/0x200
[  639.080047]  ? _cond_resched+0x19/0x40
[  639.080059]  ? kmem_cache_alloc_trace+0x165/0x220
[  639.080072]  do_init_module+0x5f/0x21a
[  639.080080]  load_module+0x26a5/0x2d30
[  639.080099]  __do_sys_finit_module+0xfc/0x120
[  639.080107]  ? __do_sys_finit_module+0xfc/0x120
[  639.080121]  __x64_sys_finit_module+0x1a/0x20
[  639.080128]  do_syscall_64+0x48/0xc0
[  639.080139]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  639.080148] RIP: 0033:0x7f4c5433e959
[  639.080158] Code: 00 f3 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ff f4 2c 00 f7 d8 64 89 01 48
[  639.080165] RSP: 002b:00007ffd0b94ad08 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
[  639.080174] RAX: ffffffffffffffda RBX: 000055699a8587c0 RCX: 00007f4c5433e959
[  639.080180] RDX: 0000000000000000 RSI: 000055699a1decee RDI: 0000000000000003
[  639.080187] RBP: 000055699a1decee R08: 0000000000000000 R09: 00007f4c54611000
[  639.080192] R10: 0000000000000003 R11: 0000000000000246 R12: 0000000000000000
[  639.080197] R13: 000055699a85b190 R14: 0000000000000000 R15: 0000000000000000
[  639.080207] Modules linked in: surface_camera(OE+) ov5648(OE) ov2680(OE) rfcomm xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo xt_addrtype iptable_filter iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c br_netfilter bridge stp llc ccm hid_logitech_hidpp overlay cmac bnep btusb btrtl btbcm btintel bluetooth hid_logitech_dj hid_multitouch ecdh_generic ecc binfmt_misc nls_iso8859_1 dm_crypt mousedev joydev wacom clk_tps68470 tps68470_regulator usbhid hid_generic mei_hdcp intel_rapl_msr snd_soc_skl snd_hda_codec_hdmi snd_soc_sst_ipc snd_soc_sst_dsp snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi snd_hda_codec_realtek snd_soc_core snd_hda_codec_generic x86_pkg_temp_thermal ledtrig_audio intel_powerclamp snd_compress coretemp ac97_bus snd_pcm_dmaengine kvm_intel snd_hda_intel snd_intel_dspcfg snd_hda_codec kvm snd_hda_core snd_hwdep snd_pcm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_seq_midi
[  639.080298]  snd_seq_midi_event snd_rawmidi snd_seq aesni_intel crypto_simd cryptd snd_seq_device glue_helper snd_timer rapl iwlmvm intel_cstate mac80211 libarc4 input_leds serio_raw iwlwifi snd wmi_bmof efi_pstore soundcore intel_wmi_thunderbolt cfg80211 mei_me ipu3_imgu(C) ipu3_cio2 processor_thermal_device mei intel_lpss_pci v4l2_fwnode intel_lpss videobuf2_dma_sg idma64 videobuf2_memops virt_dma videobuf2_v4l2 bmc150_accel_i2c videobuf2_common intel_rapl_common bmc150_accel_core intel_xhci_usb_role_switch intel_pch_thermal int340x_thermal_zone roles videodev industrialio_triggered_buffer kfifo_buf mc intel_soc_dts_iosf industrialio intel_vbtn sparse_keymap int3400_thermal acpi_thermal_rel acpi_pad mac_hid sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 i915 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops cec rc_core nvme ahci nvme_core drm libahci xhci_pci xhci_pci_renesas i2c_hid hid wmi video pinctrl_sunrisepoint
[  639.080391]  pinctrl_intel backlight
[  639.080404] CR2: ffffffffc11860e0
[  639.080410] ---[ end trace c651565dd757a731 ]---
[  639.366643] RIP: 0010:surface_camera_exit+0x790/0x6b0 [surface_camera]
[  639.366647] Code: 66 72 65 71 75 65 6e 63 79 00 62 75 73 2d 74 79 70 65 00 63 6c 6f 63 6b 2d 6c 61 6e 65 73 00 64 61 74 61 2d 6c 61 6e 65 73 00 <70> 6f 72 74 30 00 65 6e 64 70 6f 69 6e 74 30 00 01 36 46 6f 75 6e
[  639.366649] RSP: 0018:ffffa730c1fdb8b8 EFLAGS: 00010282
[  639.366651] RAX: ffff9228f877b1d8 RBX: ffff9228f877b1d8 RCX: 0000000000000027
[  639.366653] RDX: ffffffffc11860e0 RSI: ffff9229eec98cd0 RDI: ffff9228f877b1d8
[  639.366654] RBP: ffffa730c1fdb8c0 R08: 0000000000000752 R09: 0000000000000004
[  639.366655] R10: 000000000000000a R11: 0000000000000001 R12: ffff9228f877b190
[  639.366657] R13: ffff9228f877b208 R14: ffff9228f877ba08 R15: ffff9228f877b708
[  639.366659] FS:  00007f4c54802540(0000) GS:ffff9229eec80000(0000) knlGS:0000000000000000
[  639.366660] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  639.366662] CR2: ffffffffc11860e0 CR3: 000000003f7f2003 CR4: 00000000003606e0
[  639.366663] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  639.366664] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

现在,我的笔记本电脑有2个传感器。只注册其中任何一个时,该模块都能正常工作(模块会通过ACPI句柄检测到传感器,但如果在dev->driver_data中找不到任何数据,就会直接忽略该传感器,因此只要对其中一个传感器的驱动模块执行modprobe -r,就可以确认本模块在只有一个传感器时工作正常)。因此,大概只是我以某种方式搞砸了注册,但是经过几天的尝试,我仍看不出自己做错了什么。在调试过程中,我得出的结论是,遇到的第一个传感器已正确注册(这是合理的,因为它们各自单独使用时都能工作)。oops是由software_node_get_next_child中对fwnode_handle_get的调用触发的;在应用了一些补丁之后,该函数如下所示:

/*
 * Return the child of @fwnode that follows @child in the parent's children
 * list, or the first child when @child does not map to a swnode.
 *
 * NULL is returned when to_swnode() yields nothing for @fwnode, when the
 * parent has no children, or when @child is already the last list entry.
 * Otherwise the result is whatever fwnode_handle_get() returns for the
 * selected child's embedded fwnode.
 */
static struct fwnode_handle *
software_node_get_next_child(const struct fwnode_handle *fwnode,
                 struct fwnode_handle *child)
{
    struct swnode *parent = to_swnode(fwnode);
    struct swnode *prev = to_swnode(child);
    struct swnode *next;

    if (!parent)
        return NULL;

    if (list_empty(&parent->children))
        return NULL;

    if (!prev) {
        /* No previous child given: start from the head of the list. */
        next = list_first_entry(&parent->children, struct swnode, entry);
    } else {
        /* Stop before stepping from the last entry back onto the list head. */
        if (list_is_last(&prev->entry, &parent->children))
            return NULL;

        next = list_next_entry(prev, entry);
    }

    return fwnode_handle_get(&next->fwnode);
}

据我所知,由于c = list_next_entry(c, entry)返回了无效的指针,因此发生了错误。我为此添加了一些调试打印:

        /*
         * Temporary debug output around the list_next_entry() step: print the
         * current child's node name and addresses, advance, then print the
         * addresses belonging to the entry that was obtained.
         *
         * NOTE(review): every pointer argument below is formatted with %d,
         * which does not match the argument type; the negative numbers in the
         * resulting log are presumably int-sized truncations rather than real
         * addresses. A pointer specifier (%p / %px) looks like what is wanted
         * here - confirm against the printk format documentation.
         */
        pr_info("Before list_next_entry\n");
            pr_info("    c->node is called %s\n", c->node->name);
            pr_info("    its address is %d\n", c->node);
            pr_info("    addr of c->fwnode is %d\n", &c->fwnode);
        c = list_next_entry(c, entry);
        pr_info("After list_next_entry\n");
        pr_info("    addr of c: %d\n", c);
        pr_info("    addr of c->node: %d\n", c->node);
        /* Left disabled: this line would dereference c->node of the new entry. */
        // pr_info("addr of c->node->name: %d\n", c->node->name);
        pr_info("    addr of c->fwnodw: %d\n", &c->fwnode);
[  639.079614] Before list_next_entry
[  639.079616]     c->node is called endpoint0
[  639.079618]     its address is -1055361936
[  639.079619]     addr of c->fwnode is -126372088
[  639.079621] After list_next_entry
[  639.079624]     addr of c: -126373488
[  639.079625]     addr of c->node: 3
[  639.079627]     addr of c->fwnodw: -126373416

“ 3”不是有效的内存地址,因此无论swnode是由什么构成的,它似乎都不是我最初注册的software_nodes之一。但是在那种情况下,我不知道为什么list_next_entry会返回它。实际上,根据我的估计,应该从未遇到过这段代码:list_is_last(&c->entry, &p->children)应该是true,因为我每个端口只注册一个孩子。 c->node->name节点返回endpoint0

完整的模块相当大,很难最小化;下面是我在仍然可以重现错误的前提下能精简到的最小版本。但是请注意,如果您想实际运行代码,则需要一台带有cio2设备(使用ipu3驱动程序)以及两个传感器的机器才能重现该问题(您需要把supported_devices数组中的ACPI HID替换为您自己传感器的HID)。代码如下。

#include <linux/acpi.h>
#include <acpi/acpi_bus.h>
#include <linux/device.h>
#include <linux/i2c.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <media/v4l2-subdev.h>

#include <linux/fwnode.h>

/* Number of slots in connected_devices.sensors[]. */
#define MAX_CONNECTED_DEVICES                4

/*
 * Indices into a sensor's swnodes[] array. Each sensor gets five nodes
 * (sensor HID, sensor port, sensor endpoint, CIO2 port, CIO2 endpoint)
 * followed by an empty terminator entry.
 */
#define SWNODE_SENSOR_HID                    0
#define SWNODE_SENSOR_PORT                   1
#define SWNODE_SENSOR_ENDPOINT               2
#define SWNODE_CIO2_PORT                     3
#define SWNODE_CIO2_ENDPOINT                 4
#define SWNODE_NULL_TERMINATOR               5

/* Name given to the CIO2 parent software node, and the PCI device ID looked up at init. */
#define CIO2_HID                     "INT343E"
#define CIO2_PCI_ID                     0x9d32

/*
 * Offset within a sensor's pair of remote_endpoints[] entries: the entry
 * copied into the sensor's properties, and the one copied into the CIO2
 * side's properties.
 */
#define ENDPOINT_SENSOR                      0
#define ENDPOINT_CIO2                        1

/*
 * Compound-literal builders for struct software_node. The initialisers are
 * positional, so they rely on the member order of struct software_node
 * (presumably name, parent, properties - confirm against linux/property.h).
 */

/* Root node: only the first member (the HID string) is set. */
#define NODE_HID(_HID)                       \
    (const struct software_node) {           \
        _HID,                                \
    }

/* Port node: first member is the port name, second the HID node it hangs off. */
#define NODE_PORT(_PORT, _HID_NODE)          \
    (const struct software_node) {           \
        _PORT,                               \
        _HID_NODE,                           \
    }

/* Endpoint node: endpoint name, owning port node, and its property array. */
#define NODE_ENDPOINT(_EP, _PORT, _PROPS)    \
    (const struct software_node) {           \
        _EP,                                 \
        _PORT,                               \
        _PROPS,                              \
    }

/* Empty entry used to terminate a property_entry array. */
#define PROPERTY_ENTRY_NULL                  \
    (const struct property_entry) { }

/* Empty entry used to terminate a software_node array. */
#define SOFTWARE_NODE_NULL                   \
    (const struct software_node) { }

/* ACPI HIDs of the sensors this module will try to connect. */
static char* supported_devices[] = {
    "OVTI5648",
    "OVTI2680",
};

/*
 * Names for the CIO2-side port nodes. connect_supported_devices() indexes
 * this with the matching supported_devices[] index, so the two tables are
 * expected to have the same number of entries.
 */
const char *port_names[] = {
    "port0", "port1"
};

/* PCI device obtained with pci_get_device() in surface_camera_init(). */
struct pci_dev *cio2;

/* Parent software node for every CIO2 port node; registered first at init. */
struct software_node cio2_hid_node = { CIO2_HID, };

/*
 * Per-sensor storage for the software-node graph built in
 * connect_supported_devices().
 */
struct sensor {
    /* Indexed by the SWNODE_* constants; slots 0..5 are filled in. */
    struct software_node swnodes[7];
    /* Properties of the sensor endpoint; only [0] and the terminator [1] are written. */
    struct property_entry sensor_props[6];
    /* Properties of the CIO2 endpoint; only [0] and the terminator [1] are written. */
    struct property_entry cio2_props[3];
    /* NOTE(review): read in connect_supported_devices() but never assigned in the visible code. */
    struct fwnode_handle *fwnode;
};

/* Bookkeeping for all sensors connected so far. */
struct connected_devices {
    /* Number of sensors[] slots in use; also the index of the next free slot. */
    int n_devices;
    struct sensor sensors[MAX_CONNECTED_DEVICES];
};

/* Single instance, passed as the callback data to i2c_for_each_dev(). */
struct connected_devices connected_devs = {
    .n_devices = 0,
};

/*
 * "remote-endpoint" reference properties, two per sensor: entry 2*n goes
 * into sensor n's endpoint and points at its CIO2 endpoint node, entry
 * 2*n + 1 goes into the CIO2 endpoint and points back at the sensor
 * endpoint node.
 *
 * NOTE(review): only sensors 0 and 1 have entries here, although
 * MAX_CONNECTED_DEVICES allows four slots.
 */
static const struct property_entry remote_endpoints[] = {
    PROPERTY_ENTRY_REF("remote-endpoint", &connected_devs.sensors[0].swnodes[SWNODE_CIO2_ENDPOINT]),    /* Sensor 0, Sensor Property */
    PROPERTY_ENTRY_REF("remote-endpoint", &connected_devs.sensors[0].swnodes[SWNODE_SENSOR_ENDPOINT]),    /* Sensor 0, CIO2 Property */
    PROPERTY_ENTRY_REF("remote-endpoint", &connected_devs.sensors[1].swnodes[SWNODE_CIO2_ENDPOINT]),    /* Sensor 1, Sensor Property */
    PROPERTY_ENTRY_REF("remote-endpoint", &connected_devs.sensors[1].swnodes[SWNODE_SENSOR_ENDPOINT]),    /* Sensor 1, CIO2 Property */
    { }
};

static int connect_supported_devices(struct device *dev, void *data)
{
    struct acpi_device *adev;
    struct connected_devices *cdevs = data;
    struct property_entry *sensor_props;
    struct property_entry *cio2_props;
    struct fwnode_handle *fwnode;
    struct software_node *nodes;
    const char *hid;
    int i, ret;

    adev = ACPI_COMPANION(dev);
    if (!adev) {
        return 0;
    }

    hid = acpi_device_hid(adev);

    for (i = 0; i < ARRAY_SIZE(supported_devices); i++) {
        if (!strcmp(hid, supported_devices[i])) { 

            if (!dev->driver_data) {
                pr_info("Found supported device %s, but it has no driver; skipping\n", hid);
                return 0;
            }

            nodes = cdevs->sensors[cdevs->n_devices].swnodes;
            sensor_props = cdevs->sensors[cdevs->n_devices].sensor_props;
            cio2_props = cdevs->sensors[cdevs->n_devices].cio2_props;
            fwnode = cdevs->sensors[cdevs->n_devices].fwnode;

            sensor_props[0] = remote_endpoints[(cdevs->n_devices * 2) + ENDPOINT_SENSOR];
            sensor_props[1] = PROPERTY_ENTRY_NULL;

            cio2_props[0] = remote_endpoints[(cdevs->n_devices * 2) + ENDPOINT_CIO2];
            cio2_props[1] = PROPERTY_ENTRY_NULL;

            /* build the software nodes */

            nodes[SWNODE_SENSOR_HID] = NODE_HID(supported_devices[i]);                                                /* Sensor HID Node */
            nodes[SWNODE_SENSOR_PORT] = NODE_PORT("port0", &nodes[SWNODE_SENSOR_HID]);                                /* Sensor Port Node */
            nodes[SWNODE_SENSOR_ENDPOINT] = NODE_ENDPOINT("endpoint0", &nodes[SWNODE_SENSOR_PORT], sensor_props);     /* Sensor Endpoint Node */
            nodes[SWNODE_CIO2_PORT] = NODE_PORT(port_names[i], &cio2_hid_node);                                       /* CIO2 Port Node */
            nodes[SWNODE_CIO2_ENDPOINT] = NODE_ENDPOINT("endpoint0", &nodes[SWNODE_CIO2_PORT], cio2_props);           /* CIO2 Endpoint Node */
            nodes[SWNODE_NULL_TERMINATOR] = SOFTWARE_NODE_NULL;

            ret = software_node_register_nodes(nodes);
            if (ret) {
                dev_err(dev, "Failed to register the software nodes for %s\n", supported_devices[i]);
                return 0;
            }

            fwnode = software_node_fwnode(&nodes[SWNODE_SENSOR_HID]);
            if (!fwnode) {
                dev_err(dev, "Failed to get fwnode from software node for %s\n", supported_devices[i]);
                return 0;
            }

            //fwnode->secondary = ERR_PTR(-ENODEV);
            dev->fwnode = fwnode;
            ((struct v4l2_subdev *)dev->driver_data)->fwnode = fwnode;

            cdevs->n_devices++;

            return 0;
        }
    }

    return 0;
}

/*
 * surface_camera_init() - module entry point.
 *
 * Registers the CIO2 parent software node, connects every supported sensor
 * found on the I2C bus, installs the CIO2 software node as the fwnode of
 * the CIO2 PCI device and finally reprobes that device.
 *
 * Return: 0 on success, a negative errno otherwise. Finding no sensor at
 * all is reported as -ENODEV.
 */
static int surface_camera_init(void)
{
    struct fwnode_handle *fwnode;
    int ret;

    /* Register the CIO2 Parent node */
    ret = software_node_register(&cio2_hid_node);
    if (ret < 0) {
        pr_err("Failed to register the CIO2 HID node\n");
        return ret;
    }

    /* Check for supported devices and connect them */
    ret = i2c_for_each_dev(&connected_devs, connect_supported_devices);
    if ((ret < 0) || (connected_devs.n_devices == 0)) {
        pr_err("Failed to connect any devices\n");
        /* The iterator reports 0 when nothing matched; still a failure. */
        if (ret >= 0) {
            ret = -ENODEV;
        }
        /*
         * With no sensor connected nothing has been parented to the CIO2
         * node, so it can be dropped again before the module goes away.
         */
        if (connected_devs.n_devices == 0) {
            software_node_unregister(&cio2_hid_node);
        }
        return ret;
    }

    /*
     * NOTE(review): on the failure paths below the sensor software nodes
     * stay registered and stay installed as the sensors' fwnode; this
     * function holds no record of the sensor devices to undo that.
     */

    /* Find pci device and add swnode as primary */
    cio2 = pci_get_device(PCI_VENDOR_ID_INTEL, CIO2_PCI_ID, NULL);
    if (!cio2) {
        ret = -EPROBE_DEFER;
        goto out;
    }

    fwnode = software_node_fwnode(&cio2_hid_node);
    if (!fwnode) {
        pr_err("Error getting fwnode from cio2 software_node\n");
        ret = -ENODEV;
        goto out_put_cio2;
    }

    fwnode->secondary = ERR_PTR(-ENODEV);
    cio2->dev.fwnode = fwnode;

    pr_info("Reprobing now\n");
    ret = device_reprobe(&cio2->dev);
    if (ret) {
        dev_warn(&cio2->dev, "Reprobing error: %d\n", ret);
        goto out_put_cio2;
    }

    return 0;

out_put_cio2:
    /* Drop the reference taken by pci_get_device(); init is failing. */
    pci_dev_put(cio2);
    cio2 = NULL;
out:
    return ret;
}

/* Run surface_camera_init() when the module is loaded. */
module_init(surface_camera_init);

MODULE_DESCRIPTION("A bridge driver to connect sensors to CIO2 infrastructure.");
MODULE_LICENSE("GPL v2");
/* Alias pattern containing the same "INT343E" string as CIO2_HID. */
MODULE_ALIAS("acpi*:INT343E:*");

问题是;内核oops的根本原因是什么?修复它的最佳方法是什么?

这是针对5.8.0-rc7内核并应用了多个补丁的;尤其是reprobing the cio2 device中的1-3(补丁5是我尝试扩展的原始模块)。

我认为这是所有需要的信息,但是如果您想检查任何内容,只需询问。

1 个答案:

答案 0 :(得分:0)

事实证明,这个问题与我的代码只有间接关系;错误实际上出在我在问题中链接的某个补丁里,现在已经修复。