如何将Linux内核缓冲区映射到用户空间?

19
假设缓冲区使用基于页面的方案进行分配。实现mmap的一种方法是使用remap_pfn_range,但是LDD3表示这对于传统内存不起作用。看起来我们可以通过使用SetPageReserved标记页面保留来解决此问题,以便它被锁定在内存中。但是,所有内核内存不都已经是不可交换的吗?即已经被保留了吗?为什么需要显式设置保留位?
这是否与从HIGH_MEM分配的页有关?

2
不确定这是否有帮助,但据我所知,内核中的perf子系统提供了一组可以被用户空间应用程序mmap的内核内存页面(实际上是一个环形缓冲区)。它的实现可能会对你的问题提供一些提示,也许值得查看其源代码。 - Eugene
3个回答

24

在您的mmap方法中映射内核一组页面的最简单方法是使用故障处理程序来映射这些页面。基本上,您最终会得到类似于以下内容的东西:

/*
 * mmap file operation: installs my_vm_ops on the new VMA and reports success.
 * No pages are mapped here; nothing else is set up by this function.
 */
static int my_mmap(struct file *filp, struct vm_area_struct *vma)
{
    vma->vm_ops = &my_vm_ops;
    return 0;
}

/* File operations table for the device: routes mmap() calls to my_mmap. */
static const struct file_operations my_fops = {
    .owner  = THIS_MODULE,
    .open   = nonseekable_open,
    .mmap   = my_mmap,
    .llseek = no_llseek,
};

(其中其他文件操作是您的模块所需的任何内容)。在my_mmap中,您可以执行必要的范围检查等操作以验证mmap参数。

然后,vm_ops如下:

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    vmf->page = my_page_at_index(vmf->pgoff);
    get_page(vmf->page);

    return 0;
} 

/* VMA operations: only the fault callback is provided. */
static const struct vm_operations_struct my_vm_ops = {
    .fault      = my_fault,
};

你只需要为传递给故障函数的vma / vmf找出要映射到用户空间的页面。这取决于您的模块的工作方式。例如,如果您执行了...

/* Allocate the MY_BUF_SIZE-byte buffer whose pages will be handed out by the fault handler. */
my_buf = vmalloc_user(MY_BUF_SIZE);

那么您使用的页面应该是这样的

/* Turn the faulting page offset into a byte offset within my_buf and look up its page. */
vmalloc_to_page(my_buf + (vmf->pgoff << PAGE_SHIFT));

但你可以轻松地创建一个数组,并为每个条目分配一页,使用kmalloc或其他方法。

[刚注意到my_fault是一个有点好笑的函数名]


谢谢。这非常有帮助。但是,在故障处理程序中,我们不需要调用vm_insert_page吗?此外,谁会撤消get_page以允许稍后释放页面?我想一旦用户空间执行munmap,vma_close中的代码就会被执行,我们可以在其中对所有发生过缺页的页面调用put_page。这是正确的方法吗? - ravi
3
如果已经设置了vmf->page,就不需要执行vm_insert_page。如果你正在映射没有struct page支持(non-page-backed)的设备内存,则可能需要使用vm_insert_pfn(),但实际上你可能不需要担心这个。当映射被撤销时,核心vm代码会处理put_page()。对于将内核内存映射到用户空间的简单驱动程序而言,我已经向你展示了几乎所有你需要的东西。 - Roland
你好。如果无法使用vmalloc()分配my_buf缓冲区(因为太大),那么my_fault()方法的主体将是什么?我的意思是按需逐页分配。 - user1284631
如果您想按需分配页面,则故障例程应该分配该页面并将vmf->page设置为您分配的页面。 - Roland
你能解释一下什么时候会调用回调函数fault()吗? - Micheal XIV
@Roland,我想使用PCI驱动程序实现mmap。您能否请看一下这个问题https://stackoverflow.com/questions/65749351/in-order-to-write-pci-ethernet-driver-how-to-implement-mmap-in-the-pci-ethernet? - user786

13

最小可运行示例和用户空间测试

内核模块:

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h> /* min */
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/uaccess.h> /* copy_from_user, copy_to_user */
#include <linux/slab.h>

/* Name of the procfs entry created by myinit() and removed by myexit(). */
static const char *filename = "lkmc_mmap";

/* Number of bytes of the data page exposed through read() and write(). */
enum { BUFFER_SIZE = 4 };

/* Per-open state, stored in filp->private_data by open(). */
struct mmap_info {
    /* Page obtained with get_zeroed_page(); this is what vm_fault() maps. */
    char *data;
};

/*
 * VMA close callback (vm_ops.close). Runs after unmap.
 * Only logs "vm_close"; it releases nothing.
 */
static void vm_close(struct vm_area_struct *vma)
{
    pr_info("vm_close\n");
}

/* First page access. */
static vm_fault_t vm_fault(struct vm_fault *vmf)
{
    struct page *page;
    struct mmap_info *info;

    pr_info("vm_fault\n");
    info = (struct mmap_info *)vmf->vma->vm_private_data;
    if (info->data) {
        page = virt_to_page(info->data);
        get_page(page);
        vmf->page = page;
    }
    return 0;
}

/*
 * VMA open callback (vm_ops.open). Only logs "vm_open".
 * The mmap() handler in this file calls it by hand right after setting up
 * the VMA.
 * NOTE(review): the original TODO asks when this can run at a time other
 * than mmap; presumably when the kernel duplicates or splits the VMA
 * (e.g. on fork) -- confirm against the kernel documentation.
 */
static void vm_open(struct vm_area_struct *vma)
{
    pr_info("vm_open\n");
}

static struct vm_operations_struct vm_ops =
{
    .close = vm_close,
    .fault = vm_fault,
    .open = vm_open,
};

/*
 * mmap file operation.
 *
 * Attaches the per-open mmap_info and the vm_ops callbacks to the new VMA
 * and sets VM_DONTEXPAND | VM_DONTDUMP on it. No page is mapped here; pages
 * are supplied later by vm_fault(). Always returns 0.
 */
static int mmap(struct file *filp, struct vm_area_struct *vma)
{
    struct mmap_info *per_open = filp->private_data;

    pr_info("mmap\n");
    vma->vm_private_data = per_open;
    vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
    vma->vm_ops = &vm_ops;
    /* Invoke the open callback by hand for this initial mapping. */
    vm_open(vma);
    return 0;
}

static int open(struct inode *inode, struct file *filp)
{
    struct mmap_info *info;

    pr_info("open\n");
    info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
    pr_info("virt_to_phys = 0x%llx\n", (unsigned long long)virt_to_phys((void *)info));
    info->data = (char *)get_zeroed_page(GFP_KERNEL);
    memcpy(info->data, "asdf", BUFFER_SIZE);
    filp->private_data = info;
    return 0;
}

/*
 * read file operation.
 *
 * Copies up to len bytes of the first BUFFER_SIZE bytes of the data page to
 * user space, starting at *off, and advances *off by the amount copied.
 *
 * Returns the number of bytes copied, 0 once *off has reached BUFFER_SIZE,
 * or -EFAULT if the copy to user space fails.
 */
static ssize_t read(struct file *filp, char __user *buf, size_t len, loff_t *off)
{
    struct mmap_info *info;
    ssize_t nbytes;

    pr_info("read\n");

    /* Nothing left to read past the end of the exposed window. */
    if ((size_t)BUFFER_SIZE <= *off) {
        return 0;
    }

    info = filp->private_data;
    nbytes = min(len, (size_t)BUFFER_SIZE - (size_t)*off);
    if (copy_to_user(buf, info->data + *off, nbytes)) {
        return -EFAULT;
    }

    *off += nbytes;
    return nbytes;
}

/*
 * write file operation.
 *
 * Copies at most BUFFER_SIZE bytes from user space to the start of the data
 * page; *off is not consulted.
 *
 * Returns len (the full requested length, even when fewer bytes were
 * stored), or -EFAULT if the copy from user space fails.
 */
static ssize_t write(struct file *filp, const char __user *buf, size_t len, loff_t *off)
{
    struct mmap_info *info = filp->private_data;
    size_t count = min(len, (size_t)BUFFER_SIZE);

    pr_info("write\n");
    if (copy_from_user(info->data, buf, count)) {
        return -EFAULT;
    }
    return len;
}

/*
 * release file operation.
 *
 * Frees the data page and the mmap_info allocated by open(), then clears
 * filp->private_data. Always returns 0.
 */
static int release(struct inode *inode, struct file *filp)
{
    struct mmap_info *per_open = filp->private_data;

    pr_info("release\n");
    filp->private_data = NULL;
    free_page((unsigned long)per_open->data);
    kfree(per_open);
    return 0;
}

/* File operations of the procfs entry; passed to proc_create() in myinit(). */
static const struct file_operations fops = {
    .mmap = mmap,
    .open = open,
    .release = release,
    .read = read,
    .write = write,
};

static int myinit(void)
{
    proc_create(filename, 0, NULL, &fops);
    return 0;
}

/* Module exit: removes the procfs entry created by myinit(). */
static void myexit(void)
{
    remove_proc_entry(filename, NULL);
}

module_init(myinit)
module_exit(myexit)
MODULE_LICENSE("GPL");

GitHub上游

用户空间测试:

#define _XOPEN_SOURCE 700
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h> /* uintmax_t */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h> /* sysconf */

/* Format documented at:
 * https://github.com/torvalds/linux/blob/v4.9/Documentation/vm/pagemap.txt
 *
 * Each pagemap entry is one 64-bit value. The fields decoded here are:
 *   bits 0-54  page frame number (PFN), if present
 *   bit  55    pte is soft-dirty
 *   bit  61    page is file-page or shared-anon
 *   bit  62    page swapped
 *   bit  63    page present
 */
typedef struct {
    uint64_t pfn : 55;
    unsigned int soft_dirty : 1;
    unsigned int file_page : 1;
    unsigned int swapped : 1;
    unsigned int present : 1;
} PagemapEntry;

/* Parse the pagemap entry for the given virtual address.
 *
 * Reads the 8-byte entry at index (vaddr / page size) from pagemap_fd,
 * retrying after short reads until the whole entry has been read.
 *
 * @param[out] entry      the parsed entry
 * @param[in]  pagemap_fd file descriptor to an open /proc/pid/pagemap file
 * @param[in]  vaddr      virtual address to get entry for
 * @return                0 for success, 1 for failure (read error or EOF)
 */
int pagemap_get_entry(PagemapEntry *entry, int pagemap_fd, uintptr_t vaddr)
{
    size_t nread;
    ssize_t ret;
    uint64_t data;

    nread = 0;
    while (nread < sizeof(data)) {
        /* Ask only for the bytes still missing so a short read followed by a
         * retry can never write past the end of `data`. */
        ret = pread(pagemap_fd, ((uint8_t*)&data) + nread, sizeof(data) - nread,
                (vaddr / sysconf(_SC_PAGE_SIZE)) * sizeof(data) + nread);
        if (ret <= 0) {
            return 1;
        }
        nread += (size_t)ret;
    }
    entry->pfn = data & (((uint64_t)1 << 55) - 1);
    entry->soft_dirty = (data >> 55) & 1;
    entry->file_page = (data >> 61) & 1;
    entry->swapped = (data >> 62) & 1;
    entry->present = (data >> 63) & 1;
    return 0;
}

/* Convert the given virtual address to physical using /proc/PID/pagemap.
 *
 * Opens /proc/<pid>/pagemap, parses the entry for vaddr and combines its PFN
 * with the offset of vaddr inside its page. The pagemap file descriptor is
 * closed on every path, including when parsing the entry fails.
 *
 * @param[out] paddr physical address (left untouched on failure)
 * @param[in]  pid   process to convert for
 * @param[in] vaddr  virtual address to get entry for
 * @return           0 for success, 1 for failure
 */
int virt_to_phys_user(uintptr_t *paddr, pid_t pid, uintptr_t vaddr)
{
    char pagemap_file[BUFSIZ];
    int pagemap_fd;
    int failed;
    long page_size;
    PagemapEntry entry;

    snprintf(pagemap_file, sizeof(pagemap_file), "/proc/%ju/pagemap", (uintmax_t)pid);
    pagemap_fd = open(pagemap_file, O_RDONLY);
    if (pagemap_fd < 0) {
        return 1;
    }
    failed = pagemap_get_entry(&entry, pagemap_fd, vaddr);
    close(pagemap_fd);
    if (failed) {
        return 1;
    }
    page_size = sysconf(_SC_PAGE_SIZE);
    *paddr = ((uintptr_t)entry.pfn * (uintptr_t)page_size) + (vaddr % (uintptr_t)page_size);
    return 0;
}

/* Number of bytes exchanged with the module through read() and write(). */
enum { BUFFER_SIZE = 4 };

/* Userland test for the mmap kernel module.
 *
 * Usage: <program> <mmap_file>
 *
 * Maps the file twice, checks that both mappings show the same data, that
 * writes through one mapping are visible through the other and through
 * read(), and that data written with write() shows up in both mappings.
 *
 * System call failures are reported with perror() and EXIT_FAILURE.
 * Calls with side effects are kept out of assert() so they still run when
 * the program is built with NDEBUG.
 */
int main(int argc, char **argv)
{
    int fd;
    int failed;
    long page_size;
    char *address1, *address2;
    char buf[BUFFER_SIZE];
    ssize_t nbytes;
    uintptr_t paddr;

    if (argc < 2) {
        printf("Usage: %s <mmap_file>\n", argv[0]);
        return EXIT_FAILURE;
    }
    page_size = sysconf(_SC_PAGE_SIZE);
    printf("open pathname = %s\n", argv[1]);
    fd = open(argv[1], O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("open");
        return EXIT_FAILURE;
    }
    printf("fd = %d\n", fd);

    /* mmap twice for double fun. */
    puts("mmap 1");
    address1 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (address1 == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    puts("mmap 2");
    address2 = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (address2 == MAP_FAILED) {
        perror("mmap");
        return EXIT_FAILURE;
    }
    assert(address1 != address2);

    /* Read and modify memory. */
    puts("access 1");
    assert(!strcmp(address1, "asdf"));
    /* vm_fault */
    puts("access 2");
    assert(!strcmp(address2, "asdf"));
    /* vm_fault */
    strcpy(address1, "qwer");
    /* Also modified. So both virtual addresses point to the same physical address. */
    assert(!strcmp(address2, "qwer"));

    /* Check that the physical addresses are the same.
     * They are, but TODO why virt_to_phys on kernel gives a different value? */
    failed = virt_to_phys_user(&paddr, getpid(), (uintptr_t)address1);
    assert(!failed);
    printf("paddr1 = 0x%jx\n", (uintmax_t)paddr);
    failed = virt_to_phys_user(&paddr, getpid(), (uintptr_t)address2);
    assert(!failed);
    printf("paddr2 = 0x%jx\n", (uintmax_t)paddr);

    /* Check that modifications made from userland are also visible from the kernel. */
    nbytes = read(fd, buf, BUFFER_SIZE);
    if (nbytes < 0) {
        perror("read");
        return EXIT_FAILURE;
    }
    assert(nbytes == BUFFER_SIZE);
    assert(!memcmp(buf, "qwer", BUFFER_SIZE));

    /* Modify the data from the kernel, and check that the change is visible from userland. */
    nbytes = write(fd, "zxcv", 4);
    if (nbytes < 0) {
        perror("write");
        return EXIT_FAILURE;
    }
    assert(nbytes == 4);
    assert(!strcmp(address1, "zxcv"));
    assert(!strcmp(address2, "zxcv"));

    /* Cleanup. */
    puts("munmap 1");
    if (munmap(address1, page_size)) {
        perror("munmap");
        return EXIT_FAILURE;
    }
    puts("munmap 2");
    if (munmap(address2, page_size)) {
        perror("munmap");
        return EXIT_FAILURE;
    }
    puts("close");
    close(fd);
    return EXIT_SUCCESS;
}

GitHub上游

在5.4.3内核上测试通过。


1
谢谢提供代码。用户空间测试由于 #include "common.h" 无法编译(我们需要它吗?)。另外,#define _XOPEN_SOURCE 700 是什么意思? - Mixaz
1
@Mixaz 谢谢你让我知道,我忘了那个,如果修复了请告诉我。请注意,我有指向我的上游链接,它们指向:https://github.com/cirosantilli/linux-kernel-module-cheat/blob/8d668d6ed3617cc47425e1413513a2d1f99a25fd/kernel_module/user/common.h 顺便说一句,只需使用该存储库即可永远快乐:https://github.com/cirosantilli/linux-kernel-module-cheat/tree/e11483015813f720d0bc5e62bdc2e9ba00a9fd83#qemu-buildroot-setup :-) - Ciro Santilli OurBigBook.com
1
感谢您的及时更新,现在编译和运行都很好!事实上,我没有注意到链接,让我在您的帖子中使它们更加明显。 - Mixaz
1
从版本4.10开始,“struct vm_operations_struct”中“vm_fault”的原型已更改。现在应该通过“vm_fault”(vmf->vma)来访问“vm_area_struct”。[链接](https://elixir.bootlin.com/linux/v4.10-rc1/source/include/linux/mm.h#L294) - Digvijay Chougale
1
@TheAhmad 是的,cat /proc/lkmc_mmap 会导致无限循环,我主要关注的是非常简单的 read,但我想更好的使用另一个例子中的更正常的 read(如 https://github.com/cirosantilli/linux-kernel-module-cheat/blob/2ea5e17d23553334c23934d83965de8a47df3780/kernel_modules/fops.c),已更新。关于匿名映射,您是否指的是在 mmap 调用上使用 MAP_ANONYMOUS?如果是这样,那么您将如何与设备驱动程序通信,因为它忽略了文件指针? - Ciro Santilli OurBigBook.com
显示剩余3条评论

0

尽管页面是通过内核驱动程序保留的,但它旨在通过用户空间访问。因此,PTE(页表项)不知道pfn属于用户空间还是内核空间(即使它们是通过内核驱动程序分配的)。

这就是为什么它们被标记为SetPageReserved的原因。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接