CVE-2021-22600 通過 Modprobe_path 及 USMA 進行漏洞利用與分析

作者： knaithe@天玄安全實驗室
原文鏈接：https://mp.weixin.qq.com/s/gu6O-ZSIiVpNJP1I9O94wQ

漏洞描述：漏洞位于/net/packet/af_packet.c文件，rx_owner_map引用了pg_vec，切換到TPACKET_V3協議版本中，在packet_set_ring()函數的末尾，對pg_vec釋放了一次，但并未將rx_owner_map指針置為NULL，導致rx_owner_map成為懸空指針，直到從TPACKET_V3協議版本切換到TPACKET_V2協議版本后，再次到達packet_set_ring()函數的末尾，bitmap_free()函數對rx_owner_map指針進行釋放，觸發double free漏洞。

影響版本：Linux Kernel v5.8.0 - v5.15.0

測試版本：Linux #5.13.0

保護機制：SMEP/SMAP/KASLR/KPTI

1.漏洞分析

1.1.AF_PACKET套接字協議族

協議簡介： AF_PACKET是原始套接字協議，是一種特殊的套接字協議，可以是數據鏈路層原始套接字，也可以是網絡層原始套接字。如果是數據鏈路層原始套接字，可以直接發送和接收位于數據鏈路層的以太幀，比如Ethernet II協議，如果是網絡層原始套接字，就只能發送和接收位于網絡層的數據報文，比如IP協議。

快速使用：我們這里可以通過如下函數快速的創建一個 AF_PACKET協議的原始套接字：

/* Create an AF_PACKET raw socket (SOCK_RAW, protocol ETH_P_ALL in network byte order). */
socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

通過setsockopt就可以設置該套接字相關操作，比如設置當前AF_PACKET套接字協議版本為TPACKET_V3：

/* Select TPACKET_V3 as the packet-ring protocol version of socket s. */
int version = TPACKET_V3;
setsockopt(s, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

創建ring buffer：

/*
 * Describe the RX ring in a zeroed tpacket_req3 and install it with
 * PACKET_RX_RING. Per the checks in packet_set_ring(), tp_block_size must be
 * page-aligned and tp_frame_nr must equal
 * (tp_block_size / tp_frame_size) * tp_block_nr.
 */
struct tpacket_req3 req3;
memset(&req3, 0, sizeof(req3));
req3.tp_block_size = block_size;
req3.tp_block_nr = block_nr;
req3.tp_frame_size = frame_size;
req3.tp_frame_nr = frame_nr;
req3.tp_retire_blk_tov = retire_blk_tov;
req3.tp_sizeof_priv = 0;           /* no per-block private area */
req3.tp_feature_req_word = 0;
setsockopt(recv_fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));

1.2.漏洞觸發

觸發過程詳解：

1. 首先調用socket函數創建AF_PACKET套接字。
2. 然后調用setsockopt設置協議版本為TPACKET_V3。
3. 接著調用setsockopt設置RX_RING，正常給tpacket_req3配置參數，在執行packet_set_ring()函數過程中，pg_vec指向alloc_pg_vec()函數分配的內存，并且調用init_prb_bdqc函數，導致pg_vec被sock->rx_ring->prb_bdqc->pkbdq引用，然后調用swap函數將pg_vec和sock->rx_ring->pg_vec交換，函數最后pg_vec指向NULL，沒有調用free。
4. 再次調用setsockopt設置RX_RING，將tpacket_req3參數的tp_block_nr和tp_frame_nr字段設置為0，然后調用swap函數將pg_vec和sock->rx_ring->pg_vec交換，此時sock->rx_ring->pg_vec為NULL，pg_vec指向上一步驟分配的內存，函數結尾調用free_pg_vec()釋放pg_vec，此時packet_ring_buffer->prb_bdqc->pkbdq成為懸空指針。
5. 到此才可以再次調用setsockopt設置協議版本為TPACKET_V2，sock->rx_ring->pg_vec為NULL，所以該套接字切換協議TPACKET_V2成功。
6. 最后調用setsockopt設置RX_RING，此時tpacket_req參數的tp_block_nr字段必須為0，再次進入packet_set_ring()函數，由于已經是TPACKET_V2協議，所以調用了swap函數交換了rx_owner_map和sock->rx_ring->rx_owner_map，由于packet_ring_buffer結構體的rx_owner_map成員和tpacket_kbdq_core成員屬于聯合體，所以sock->rx_ring->rx_owner_map和sock->rx_ring->prb_bdqc->pkbdq的值相同，在第4步驟packet_ring_buffer->prb_bdqc->pkbdq成為懸空指針，所以在函數結尾調用bitmap_free(rx_owner_map)，等同于free掉sock->rx_ring->prb_bdqc->pkbdq這個懸空指針，造成double free。

/net/packet/af_packet.c

/*
 * packet_set_ring() - install or tear down one ring of a packet socket.
 *
 * NOTE: this is the vulnerable version, quoted unchanged for the
 * CVE-2021-22600 analysis.
 *
 * @sk:      socket whose ring is changed
 * @req_u:   ring request; read through the tpacket_req view, and through the
 *           tpacket_req3 view for the TPACKET_V3-specific fields
 * @closing: when non-zero, the "mapped" / "read pending" busy checks are
 *           skipped
 * @tx_ring: non-zero selects po->tx_ring and sk_write_queue, zero selects
 *           po->rx_ring and sk_receive_queue
 *
 * A non-zero tp_block_nr asks for a new ring: the geometry is validated and
 * pg_vec (plus rx_owner_map for a non-V3 RX ring) is allocated. A zero
 * tp_block_nr asks for teardown and requires tp_frame_nr to be zero as well.
 * The new state is then swapped into the ring, and whatever was swapped out
 * is freed at the end of the function.
 *
 * Returns 0 on success, or -EBUSY, -EINVAL or -ENOMEM.
 *
 * The "step N" notes below refer to the six-call trigger sequence:
 *   1. create the AF_PACKET socket
 *   2. set PACKET_VERSION to TPACKET_V3
 *   3. PACKET_RX_RING with a valid tpacket_req3
 *   4. PACKET_RX_RING with tp_block_nr == 0 and tp_frame_nr == 0
 *   5. set PACKET_VERSION to TPACKET_V2
 *   6. PACKET_RX_RING with tp_block_nr == 0
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
  int closing, int tx_ring)
{
 struct pgv *pg_vec = NULL;
 struct packet_sock *po = pkt_sk(sk);
 unsigned long *rx_owner_map = NULL;
 int was_running, order = 0;
 struct packet_ring_buffer *rb;
 struct sk_buff_head *rb_queue;
 __be16 num;
 int err;
 /* Added to avoid minimal code churn */
 struct tpacket_req *req = &req_u->req;

 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

 err = -EBUSY;
 if (!closing) {
  if (atomic_read(&po->mapped))
   goto out;
  if (packet_read_pending(rb))
   goto out;
 }

 if (req->tp_block_nr) {   // steps 4 and 6 pass tp_block_nr == 0; only step 3 enters this branch
  unsigned int min_frame_size;

  /* Sanity tests and some calculations */
  err = -EBUSY;
  if (unlikely(rb->pg_vec))
   goto out;

  switch (po->tp_version) {
  case TPACKET_V1:
   po->tp_hdrlen = TPACKET_HDRLEN;
   break;
  case TPACKET_V2:
   po->tp_hdrlen = TPACKET2_HDRLEN; 
   break;
  case TPACKET_V3:
   po->tp_hdrlen = TPACKET3_HDRLEN; //  TPACKET3_HDRLEN = 0x44
   break;
  }

  err = -EINVAL;
  if (unlikely((int)req->tp_block_size <= 0))
   goto out;
  if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) // note: tp_block_size must be aligned to PAGE_SIZE
   goto out;
  min_frame_size = po->tp_hdrlen + po->tp_reserve;
  if (po->tp_version >= TPACKET_V3 &&
      req->tp_block_size <
      BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
   goto out;
  if (unlikely(req->tp_frame_size < min_frame_size))
   goto out;
  if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
   goto out;

  rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
  if (unlikely(rb->frames_per_block == 0))
   goto out;
  /* Reject geometries whose frames_per_block * tp_block_nr would exceed UINT_MAX. */
  if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
   goto out;
  if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
     req->tp_frame_nr))
   goto out;

  err = -ENOMEM;
  order = get_order(req->tp_block_size);
  pg_vec = alloc_pg_vec(req, order); // step 3: pg_vec is allocated here
  if (unlikely(!pg_vec))
   goto out;
  switch (po->tp_version) {
  case TPACKET_V3:
   /* Block transmit is not supported yet */
   if (!tx_ring) {   //  RX_RING only
    init_prb_bdqc(po, rb, pg_vec, req_u); // step 3: rb->prb_bdqc->pkbdq now references pg_vec
   } else {
    struct tpacket_req3 *req3 = &req_u->req3;

    if (req3->tp_retire_blk_tov ||
        req3->tp_sizeof_priv ||
        req3->tp_feature_req_word) {
     err = -EINVAL;
     goto out_free_pg_vec;
    }
   }
   break;
  default:
   if (!tx_ring) {
    rx_owner_map = bitmap_alloc(req->tp_frame_nr,
     GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
    if (!rx_owner_map)
     goto out_free_pg_vec;
   }
   break;
  }
 }
 /* Done */
 else {
  err = -EINVAL;
  if (unlikely(req->tp_frame_nr))  // steps 4 and 6: tp_frame_nr must be 0 too, so that we do not leave via goto out
   goto out;
 }


 /* Detach socket from network */
 spin_lock(&po->bind_lock);
 was_running = po->running; // 0 when called from release (author's note)
 num = po->num;
 if (was_running) {
  WRITE_ONCE(po->num, 0);
  __unregister_prot_hook(sk, false);
 }
 spin_unlock(&po->bind_lock);

 synchronize_net();

 err = -EBUSY;
 mutex_lock(&po->pg_vec_lock);
 if (closing || atomic_read(&po->mapped) == 0) {  // closing is 0 throughout the trigger sequence, but po->mapped is 0 as well, so this branch is taken
  err = 0;
  spin_lock_bh(&rb_queue->lock);
  swap(rb->pg_vec, pg_vec); // step 3: after the swap pg_vec is NULL; step 4: the ring is swapped back into pg_vec
  if (po->tp_version <= TPACKET_V2) //  the version is TPACKET_V2 only in step 6, so only then is this if taken
   swap(rb->rx_owner_map, rx_owner_map); // step 6: rx_owner_map now holds the same value as rb->prb_bdqc->pkbdq
  rb->frame_max = (req->tp_frame_nr - 1);
  rb->head = 0;
  rb->frame_size = req->tp_frame_size;
  spin_unlock_bh(&rb_queue->lock);

  swap(rb->pg_vec_order, order);
  swap(rb->pg_vec_len, req->tp_block_nr);

  rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
  /* Receive through tpacket_rcv only while an RX ring is installed. */
  po->prot_hook.func = (po->rx_ring.pg_vec) ?
      tpacket_rcv : packet_rcv;
  skb_queue_purge(rb_queue);
  if (atomic_read(&po->mapped))
   pr_err("packet_mmap: vma is busy: %d\n",
          atomic_read(&po->mapped));
 }
 mutex_unlock(&po->pg_vec_lock);

 /* Re-attach the socket if it was running before the ring change. */
 spin_lock(&po->bind_lock);
 if (was_running) {
  WRITE_ONCE(po->num, num);
  register_prot_hook(sk);
 }
 spin_unlock(&po->bind_lock);
 if (pg_vec && (po->tp_version > TPACKET_V2)) {
  /* Because we don't support block-based V3 on tx-ring */
  if (!tx_ring)
   prb_shutdown_retire_blk_timer(po, rb_queue);
 }

out_free_pg_vec:
 bitmap_free(rx_owner_map);  // step 6: freeing rx_owner_map frees what rb->prb_bdqc->pkbdq points to, i.e. the double free
 if (pg_vec)   // step 3: pg_vec is NULL, so nothing is freed; step 4: pg_vec is non-NULL
  free_pg_vec(pg_vec, order, req->tp_block_nr);  // step 4: pg_vec is freed, which leaves rb->prb_bdqc->pkbdq dangling
out:
 return err;
}

上述步驟3中，進入init_prb_bdqc()函數后，sock->rx_ring->prb_bdqc->pkbdq引用了pg_vec。

/net/packet/af_packet.c

/*
 * init_prb_bdqc() - initialise the TPACKET_V3 block-descriptor queue state of
 * a ring.
 *
 * @po:     owning packet socket (supplies tp_hdrlen / tp_version, receives the
 *          stats reset)
 * @rb:     ring whose tpacket_kbdq_core is (re)initialised
 * @pg_vec: freshly allocated block vector; a raw pointer to it is stored in
 *          the core
 * @req_u:  user request, read through its tpacket_req3 view
 *
 * The core is zeroed, filled from the request, its retire-block timer is set
 * up and the first block is opened.
 */
static void init_prb_bdqc(struct packet_sock *po,
   struct packet_ring_buffer *rb,
   struct pgv *pg_vec,
   union tpacket_req_u *req_u)
{
 struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
 struct tpacket_block_desc *pbd;

 memset(p1, 0x0, sizeof(*p1));

 p1->knxt_seq_num = 1;
 p1->pkbdq = pg_vec;   // step 3: sock->rx_ring->prb_bdqc->pkbdq now references pg_vec; this is the key line behind the bug
 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
 p1->pkblk_start = pg_vec[0].buffer;
 p1->kblk_size = req_u->req3.tp_block_size;
 p1->knum_blocks = req_u->req3.tp_block_nr;
 p1->hdrlen = po->tp_hdrlen;
 p1->version = po->tp_version;
 p1->last_kactive_blk_num = 0;
 po->stats.stats3.tp_freeze_q_cnt = 0;
 /* Use the caller's retire timeout if given, otherwise derive one from the block size. */
 if (req_u->req3.tp_retire_blk_tov)
  p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
 else
  p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
      req_u->req3.tp_block_size);
 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
 rwlock_init(&p1->blk_fill_in_prog_lock);

 p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
 prb_init_ft_ops(p1, req_u);
 prb_setup_retire_blk_timer(po);
 prb_open_block(p1, pbd);
}

漏洞觸發，引發panic：

2.漏洞利用

2.1.繞過KASLR

泄露內核地址思路：通過漏洞篡改msg_msg->m_ts成員，增大msg_msg消息大小，然后再讀取該msg_msg，泄露鄰近timerfd_ctx->tmr->function這個函數指針指向的timerfd_tmrproc內核函數地址來計算內核基地址，從而繞過KASLR。

泄露內核地址詳細步驟：

1.先耗盡kmalloc-256的per_cpu上的freelist里的空閑塊，然后布局PAGE大小的dummy ringbuf；

2.第一次堆噴，首先釋放dummy ringbuf偶數下標的ringbuf，讓這些free掉的PAGE都返還給伙伴系統的order-0。然后再用pg_vec去堆噴kmalloc-256的slab，并從伙伴系統的order-0取出PAGE分成16個kmalloc-256給pg_vec；

3.第二次堆噴，釋放dummy ringbuf奇數下標的ringbuf，讓這些free掉的PAGE都返還給伙伴系統的order-0。然后用timerfd_ctx去噴kmalloc-256的slab，并從伙伴系統的order-0取剛剛歸還的PAGE分成16個kmalloc-256給timerfd_ctx；

4.第三次堆噴，通過pg_vec的漏洞釋放掉所有的第一次堆噴中的pg_vec對象，這些kmalloc-256的pg_vec不會歸還給伙伴系統，而是進入到了對應slab的空閑鏈表，接著用msg_msg從空閑鏈表再次申請出剛釋放掉的kmalloc-256的slab；

5.第四次堆噴，這時，觸發部分pg_vec的double free漏洞，然后用msg_msgseg再次將剛釋放的msg_msg從freelist里分配出來并篡改msg_msg->m_ts，這時讀取所有第三次堆噴（第4步）中申請的msg_msg，即可讀取包含被篡改msg_msg->m_ts的msg_msg，從而造成OOB讀，泄露出相鄰PAGE的timerfd_ctx->tmr->function這個函數指針指向的timerfd_tmrproc內核函數地址，從而計算出當前內核基址的相對偏移。

2.2.利用方式一：篡改modprobe_path

提權思路：通過msg_msg + fuse的方式提權，篡改modprobe_path指向的字符串，modprobe_path默認指向"/sbin/modprobe"，修改modprobe_path指向"/tmp/w"，然后再執行一個非法的二進制文件，這樣便會觸發"/tmp/w"這個文件以root權限執行，從而拿到root權限。

提權原理：篡改modprobe_path提權的原理，想必大家也不陌生，這里還是簡單介紹一下，當execve函數執行一個非法的二進制文件時，執行到search_binary_handler()函數時，會遍歷formats鏈表，formats鏈表包含所有注冊的二進制格式，挨個調用每種格式的load_binary()回調（例如ELF格式對應load_elf_binary()函數），判斷當前執行文件是否屬于已注冊的二進制格式，如果不屬于任何已注冊的格式，再調printable宏判斷當前執行文件前4個字節是否都是可打印的字符，如果當前執行文件既不屬于已注冊的二進制格式，前4個字節也不全是可打印的字符，則調用request_module()函數。

/*
 * search_binary_handler() - find a registered binary format that accepts bprm.
 *
 * Walks the "formats" list and calls each format's load_binary() callback.
 * The walk stops with that callback's result as soon as bprm has passed its
 * point of no return or the result is anything other than -ENOEXEC.
 *
 * If no format claims the file and CONFIG_MODULES is enabled, one retry is
 * made after request_module("binfmt-%04x", ...), unless the first four bytes
 * of bprm->buf are all printable.
 *
 * Returns the error from prepare_binprm() / security_bprm_check(), the
 * result of a load_binary() callback, or -ENOENT if the format list yielded
 * nothing.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
 bool need_retry = IS_ENABLED(CONFIG_MODULES);
 struct linux_binfmt *fmt;
 int retval;

 retval = prepare_binprm(bprm);
 if (retval < 0)
  return retval;

 retval = security_bprm_check(bprm);
 if (retval)
  return retval;

 retval = -ENOENT;
 retry:
 read_lock(&binfmt_lock);
 list_for_each_entry(fmt, &formats, lh) { // walk the list of registered binary formats
  if (!try_module_get(fmt->module))
   continue;
  /* The lock is dropped around the callback and re-taken afterwards. */
  read_unlock(&binfmt_lock);

  retval = fmt->load_binary(bprm);  // let this format inspect the binary

  read_lock(&binfmt_lock);
  put_binfmt(fmt);
  if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
   read_unlock(&binfmt_lock);
   return retval;
  }
 }
 read_unlock(&binfmt_lock);

 if (need_retry) {
  if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
      printable(bprm->buf[2]) && printable(bprm->buf[3]))  // all four leading bytes printable: give up without requesting a module
   return retval;
  /* This is the call the modprobe_path technique relies on. */
  if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
   return retval;
  need_retry = false;
  goto retry;
 }

 return retval;
}

request_module()函數是__request_module()的宏定義。

/* request_module(...) forwards its arguments to __request_module() with true as the first argument. */
#define request_module(mod...) __request_module(true, mod)

__request_module()函數是一個嘗試加載內核模塊的函數，主要調用call_modprobe()，定義于kernel/kmod.c。

/*
 * call_modprobe() - run the user-space helper named by modprobe_path for a module.
 *
 * @module_name: module to request; duplicated with kstrdup() before use
 * @wait:        wait flags, OR-ed with UMH_KILLABLE and passed to
 *               call_usermodehelper_exec()
 *
 * Builds argv = { modprobe_path, "-q", "--", module_name, NULL } with a fixed
 * environment and hands it to the usermode-helper machinery.
 *
 * Returns the result of call_usermodehelper_exec(), or -ENOMEM if any
 * allocation or the helper setup fails.
 */
static int call_modprobe(char *module_name, int wait)
{
 struct subprocess_info *info;
 static char *envp[] = {
  "HOME=/",
  "TERM=linux",
  "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
  NULL
 };

 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
 if (!argv)
  goto out;

 module_name = kstrdup(module_name, GFP_KERNEL);
 if (!module_name)
  goto free_argv;

 argv[0] = modprobe_path;  // the global variable the exploit overwrites
 argv[1] = "-q";
 argv[2] = "--";
 argv[3] = module_name; /* check free_modprobe_argv() */
 argv[4] = NULL;

 /* NOTE(review): free_modprobe_argv is presumably the cleanup hook that releases argv and module_name - confirm. */
 info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
      NULL, free_modprobe_argv, NULL);
 if (!info)
  goto free_module_name;

 return call_usermodehelper_exec(info, wait | UMH_KILLABLE); 

free_module_name:
 kfree(module_name);
free_argv:
 kfree(argv);
out:
 return -ENOMEM;
}

call_usermodehelper_exec()函數將modprobe_path作為可執行程序路徑，以root權限執行，modprobe_path是一個全局變量，指向"/sbin/modprobe"。

/kernel/kmod.c

/*
 modprobe_path is set via /proc/sys.
 It is a KMOD_PATH_LEN-byte buffer whose initial content is
 CONFIG_MODPROBE_PATH; call_modprobe() uses it as the helper path.
*/
char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;

/init/Kconfig
config MODPROBE_PATH
 string "Path to modprobe binary"
 default "/sbin/modprobe"
 help
   When kernel code requests a module, it does so by calling
   the "modprobe" userspace utility. This option allows you to
   set the path where that binary is found. This can be changed
   at runtime via the sysctl file
   /proc/sys/kernel/modprobe. Setting this to the empty string
   removes the kernel's ability to request modules (but
   userspace can still load modules explicitly).

任意寫：在繞過KASLR后，就能計算出modprobe_path的地址，再通過修改msg_msg的成員變量next指向modprobe_path-8，再配合fuse用戶文件系統向msg_msg->next指向的msg_msgseg數據部分寫入我們自定義程序的字符串路徑，即完成任意寫。

篡改前：modprobe_path指向"/sbin/modprobe"

1661134889142

篡改后：modprobe_path指向"/tmp/w"

1661135078798

提權流程：

1.堆風水，先耗盡kmalloc-4096的空閑塊，然后布局8 PAGE的內存，也是通過ringbuf申請大量的8 PAGE大小的內存塊；

2.第一次堆噴，釋放掉偶數位下標的8 PAGE的ringbuf，然后用大量的pg_vec去堆噴kmalloc-4096大小的slab；

3.第二次堆噴，觸發first free釋放掉2個kmalloc-4096的pg_vec，然后先創建一個線程A，用2個大于PAGE_SIZE小于2 PAGE_SIZE的msg_msgA去堆噴占位剛釋放的兩個kmalloc-4096空閑塊，此時load_msg()在kmalloc完成后，會因為在copy_from_user的時候，觸發fuse文件系統的讀函數，通過讀pipe數據而使線程A阻塞。

4.第三次堆噴，然后再創建第二線程B，繼續釋放剛才被first free的2個kmalloc-4096的pg_vec內存，觸發double free，再用1個大于PAGE_SIZE小于2 PAGE_SIZE的msg_msgB去堆噴這兩塊剛被回收的2個kmalloc-4096內存塊，用msg_msgsegB去篡改第二次堆噴中msg_msgA->next指針為modprobe_path-8，并通過pipe發送信號給第三步中阻塞的線程A，fuse read接收到信號后完成對msg_msgsegA內容的篡改，并返回，這樣線程A完成對modprobe_path指向字符串內容的篡改為我們自定義的"/tmp/w"。

1660287345065

5.最后執行一個非法的二進制文件，便能觸發我們自定義"/tmp/w"的執行，從而完成提權。

2.3.利用方式二：USMA(用戶態映射攻擊)

USMA簡介：USMA（User-Space-Mapping-Attack）又稱作用戶態映射攻擊，是360漏洞研究院的安全研究員提出的利用手法。

提權思路：利用packet漏洞模塊的packet_mmap函數能將漏洞對象pg_vec映射到用戶空間的這個特性，再利用double free的漏洞原理，將漏洞對象pg_vec篡改為內核代碼 __sys_setresuid內核函數的地址，這樣就能把__sys_setresuid內核函數的代碼映射到用戶空間，通過硬編碼改變代碼邏輯，即可讓普通用戶進程調用setresuid函數繞過權限檢查，修改cred提升權限。

/kernel/sys.c

/*
 * This function implements a generic ability to update ruid, euid,
 * and suid.  This allows you to implement the 4.4 compatible seteuid().
 *
 * Each of @ruid, @euid and @suid is left unchanged when passed as
 * (uid_t) -1. Without CAP_SETUID in the caller's user namespace, every
 * requested id must equal one of the caller's current uid, euid or suid.
 *
 * Returns the result of commit_creds() on success, -EINVAL for an id that
 * does not map into the namespace, -ENOMEM if prepare_creds() fails, -EPERM
 * if the permission check fails, or the error from set_user() /
 * security_task_fix_setuid().
 */
long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{
 struct user_namespace *ns = current_user_ns();
 const struct cred *old;
 struct cred *new;
 int retval;
 kuid_t kruid, keuid, ksuid;

 kruid = make_kuid(ns, ruid);
 keuid = make_kuid(ns, euid);
 ksuid = make_kuid(ns, suid);

 if ((ruid != (uid_t) -1) && !uid_valid(kruid))
  return -EINVAL;

 if ((euid != (uid_t) -1) && !uid_valid(keuid))
  return -EINVAL;

 if ((suid != (uid_t) -1) && !uid_valid(ksuid))
  return -EINVAL;

 new = prepare_creds();
 if (!new)
  return -ENOMEM;

 old = current_cred();

 retval = -EPERM;
 // The exploit patches the machine code here so that an unprivileged caller of setresuid() never enters this if block and can therefore rewrite its cred
 if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
  if (ruid != (uid_t) -1        && !uid_eq(kruid, old->uid) &&
      !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
   goto error;
  if (euid != (uid_t) -1        && !uid_eq(keuid, old->uid) &&
      !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
   goto error;
  if (suid != (uid_t) -1        && !uid_eq(ksuid, old->uid) &&
      !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
   goto error;
 }

 if (ruid != (uid_t) -1) {
  new->uid = kruid;
  if (!uid_eq(kruid, old->uid)) {
   retval = set_user(new);
   if (retval < 0)
    goto error;
  }
 }
 if (euid != (uid_t) -1)
  new->euid = keuid;
 if (suid != (uid_t) -1)
  new->suid = ksuid;
 /* fsuid always follows the (possibly updated) euid. */
 new->fsuid = new->euid;

 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
 if (retval < 0)
  goto error;

 return commit_creds(new);

error:
 /* Discard the prepared credentials on any failure after prepare_creds(). */
 abort_creds(new);
 return retval;
}

/* setresuid system call entry point: passes its three arguments unchanged to __sys_setresuid(). */
SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
{
 return __sys_setresuid(ruid, euid, suid);
}

映射原理：packet_mmap函數通過對當前套接字對應的pg_vec數組里buffer映射到用戶層，可以讓用戶態修改并同步內核態的內存。

/*
 * packet_mmap() - map the socket's ring buffers into a user VMA.
 *
 * @file: unused in this function
 * @sock: packet socket whose rx_ring / tx_ring are mapped
 * @vma:  target VMA; vm_pgoff must be 0 and its length must equal the
 *        combined size of all installed rings
 *
 * Every page of every pg_vec[i].buffer is inserted into the VMA in order with
 * vm_insert_page(). On success po->mapped is incremented and the VMA ops are
 * set to packet_mmap_ops.
 *
 * Returns 0 on success, -EINVAL for a non-zero offset, no installed ring or a
 * size mismatch, or the error from vm_insert_page().
 */
static int packet_mmap(struct file *file, struct socket *sock,
  struct vm_area_struct *vma)
{
 struct sock *sk = sock->sk;
 struct packet_sock *po = pkt_sk(sk);
 unsigned long size, expected_size;
 struct packet_ring_buffer *rb;
 unsigned long start;
 int err = -EINVAL;
 int i;

 if (vma->vm_pgoff)
  return -EINVAL;

 mutex_lock(&po->pg_vec_lock);

 expected_size = 0;
 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
  if (rb->pg_vec) {
 // Sum the sizes of all rings of this socket; per ring this amounts to block_nr * block_size.
   expected_size += rb->pg_vec_len  // equals block_nr
      * rb->pg_vec_pages  // equals block_size/PAGE_SIZE
      * PAGE_SIZE;
  }
 }

 if (expected_size == 0)
  goto out;

 size = vma->vm_end - vma->vm_start;  // size of the user-space mapping
 if (size != expected_size)
  goto out;

 start = vma->vm_start;    // start address of the user-space mapping
 for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { // in the exploit only one ring is set up at this point
  if (rb->pg_vec == NULL)
   continue;

  for (i = 0; i < rb->pg_vec_len; i++) { // loops block_nr times
   struct page *page;
   void *kaddr = rb->pg_vec[i].buffer; // kaddr is normally page-aligned (author's note)
   int pg_num;
   // loops block_size/PAGE_SIZE times
   for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
    page = pgv_to_page(kaddr);
    // the call that does the mapping: it inserts the page behind the pg_vec buffer into the user VMA
    err = vm_insert_page(vma, start, page); 
    if (unlikely(err))
     goto out;
    start += PAGE_SIZE;
    kaddr += PAGE_SIZE;
   }
  }
 }

 atomic_inc(&po->mapped);
 vma->vm_ops = &packet_mmap_ops;
 err = 0;

out:
 mutex_unlock(&po->pg_vec_lock);
 return err;
}

正如360的USMA的描述，在vm_insert_page()函數里調用了validate_page_before_insert()函數做頁檢查，validate_page_before_insert()函數對映射的pg_vec數組里的buffer所屬page的類型進行了判斷，過濾了匿名頁、屬于slab對象的頁、屬于buddy系統的頁、屬于交換內存的頁、屬于分頁管理中頁表的頁、屬于內存屏障的頁，以上頁類型都不能映射，恰好我們要映射的是內核代碼段，是可以映射到用戶態的。

/mm/memory.c

/*
 * validate_page_before_insert() - decide whether a page may be inserted into
 * a user mapping.
 *
 * Returns -EINVAL for anonymous pages, slab pages and pages that carry a page
 * type; otherwise flushes the page's dcache and returns 0.
 */
static int validate_page_before_insert(struct page *page)
{
 if (PageAnon(page) || PageSlab(page) || page_has_type(page))
  return -EINVAL;
 flush_dcache_page(page);
 return 0;
}

硬編碼篡改： __sys_setresuid函數被映射到用戶態后，讀取一個PAGE_SIZE大小的內核內存

考慮到需要篡改call ns_capable_setid調用之后的判斷，對test al,al jnz short loc_FFFFFFFF810BE1C4的匯編作一番篡改，最簡單的方法就是將jnz/jne改為jz/je，由機器碼，0x75改為0x74，由于映射的內存范圍很大，所以我將0x84 0xC0 0x75 0x59作為特征進行搜索定位。

1660702474261

這段機器碼由0x84 0xC0 0x75 0x59變為0x84 0xC0 0x74 0x59，jne變為je。

篡改前：

1660703547438

篡改后：

1660703431605

提權：經過上述對內核函數__sys_setresuid的篡改，再通過調用setresuid(0,0,0);即可將普通用戶進程提權至root用戶權限。

3.總結

上述兩種提權方式，經過實現與調試，篡改modprobe_path提權和USMA(用戶態映射攻擊)兩者都是通過任意寫完成的提權，不用一堆gadget，相比ROP的提權方式而言，適配效率更高，限制更小，讓任意寫提權相對顯得更加"高大上"。篡改modprobe_path提權相比于USMA利用，前者相較而言更加通用。

Paper 本文由 Seebug Paper 發布，如需轉載請注明來源。本文地址：http://www.bjnorthway.com/1952/

Paper - 安全技術精粹