Question

执行fcntl(memfd, F_ADD_SEALS, F_SEAL_WRITE);之后，类似mmap(NULL, 4096, PROT_READ, MAP_SHARED, memfd, 0);的调用会失败，并返回错误EPERM。根据man 2 fcntl，我对F_SEAL_WRITE的理解是，它只阻止可写的共享映射。同样，如果我在持有这样一个只读内存映射的同时执行fcntl，它会失败并返回错误EBUSY，而我原本以为只有在映射可写时才会出现这种情况。为什么会这样？

MCVE：

#include <unistd.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <sys/mman.h>

/*
 * Minimal reproducer for the F_SEAL_WRITE vs. read-only shared mapping
 * behaviour. Return values are intentionally not checked: the program is
 * meant to be run under strace so that each call's result can be observed.
 */
int main(void) {
    /* Raw numeric values, named after the constants they stand for. */
    enum {
        REGION_SIZE        = 4096,
        ALLOW_SEALING_FLAG = 2,    /* MFD_ALLOW_SEALING */
        ADD_SEALS_CMD      = 1033, /* F_ADD_SEALS */
        WRITE_SEAL         = 8     /* F_SEAL_WRITE */
    };
    void *region;
    int fd;

    /* Create a sealable memfd and give it one region's worth of backing. */
    fd = syscall(SYS_memfd_create, "foo", ALLOW_SEALING_FLAG);
    ftruncate(fd, REGION_SIZE);

    /* Step 1: seal while a read-only shared mapping exists -- will fail. */
    region = mmap(NULL, REGION_SIZE, PROT_READ, MAP_SHARED, fd, 0);
    fcntl(fd, ADD_SEALS_CMD, WRITE_SEAL);
    munmap(region, REGION_SIZE);

    /* Step 2: seal with no mapping, then map read-only -- mmap will fail. */
    fcntl(fd, ADD_SEALS_CMD, WRITE_SEAL);
    region = mmap(NULL, REGION_SIZE, PROT_READ, MAP_SHARED, fd, 0);

    return 0;
}

在strace下运行时（在Linux 4.4.0-135-generic上，来自Ubuntu 16.04），它会产生以下输出：

memfd_create("foo", MFD_ALLOW_SEALING)  = 3
ftruncate(3, 4096)                      = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = 0x7fd9a9865000
fcntl(3, F_ADD_SEALS, F_SEAL_WRITE)     = -1 EBUSY (Device or resource busy)
munmap(0x7fd9a9865000, 4096)            = 0
fcntl(3, F_ADD_SEALS, F_SEAL_WRITE)     = 0
mmap(NULL, 4096, PROT_READ, MAP_SHARED, 3, 0) = -1 EPERM (Operation not permitted)

Answer 1

来自man 2 fcntl：

如果存在任何可写的共享映射，则使用F_ADD_SEALS操作设置F_SEAL_WRITE密封将失败，并返回EBUSY错误。

您的mmap似乎并没有创建可写的映射，因此这一条不应适用。手册页可能有误。

不过，下面是实际的内核代码[取自最新的主线源码]。以下大多数内容来自mm/memfd.c。

EBUSY可能来自mapping_deny_writable，也可能来自memfd_wait_for_pins。

我最好的猜测是：要么mmap增加了计数，导致mapping_deny_writable失败；要么ftruncate留下了一些仍被固定（pinned）的映射。

从后者（memfd_wait_for_pins）来看，这些固定似乎会[在一段时间后]被解除，因此在遇到EBUSY错误时重试几次可能会有所帮助。

static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. * * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. 
*/ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; }

这里是mapping_deny_writable：

/*
 * mapping_deny_writable() - try to forbid writable mappings of @mapping.
 *
 * Returns 0 when atomic_dec_unless_positive() succeeds on
 * mapping->i_mmap_writable, and -EBUSY otherwise.
 * NOTE(review): presumably a positive counter means writable mappings are
 * currently accounted against the mapping -- confirm against the callers
 * that increment i_mmap_writable.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
	if (atomic_dec_unless_positive(&mapping->i_mmap_writable))
		return 0;

	return -EBUSY;
}

这里是memfd_wait_for_pins：

/* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references * to those pages to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; pgoff_t start; struct page *page; int error, scan; memfd_tag_pins(mapping); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; start = 0; rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, MEMFD_TAG_PINNED) { page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } page = NULL; } if (page && page_count(page) - page_mapcount(page) != 1) { if (scan < LAST_SCAN) goto continue_resched; /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found pages pinned. */ error = -EBUSY; } xa_lock_irq(&mapping->i_pages); radix_tree_tag_clear(&mapping->i_pages, iter.index, MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); } } rcu_read_unlock(); } return error; }

设置F_SEAL_WRITE后，为什么不能创建只读的共享映射？

1 个答案: