Browse Source

Fix disallowed cr0 write protection and close_fd (#80)

Since commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 [1]
(kernel version v5.3+ [2]), the sensitive CR0 bits on x86 are pinned,
so we need to use inline asm [3][4] to bypass the pinning.

commit 8dbec27a242cd3e2816eeb98d3237b9f57cf6232 :
> With sensitive CR4 bits pinned now, it's possible that the WP bit for
> CR0 might become a target as well.
>
> Following the same reasoning for the CR4 pinning, pin CR0's WP
> bit. Contrary to the cpu feature dependend CR4 pinning this can be done
> with a constant value.

Also, getting "sys_call_table" [8] from the symbol lookup by using the address
of "close_fd" does not work for v5.11+ [5][6]. The reason is that the entry
"sys_call_table[__NR_close]" is not the address of "close_fd"; it is actually
"__x64_sys_close" on x86.

Two solutions were proposed: using "kallsyms_lookup_name" [7] or simply passing
the address to the module as a parameter. The symbol "kallsyms_lookup_name" has
been unexported since v5.7; the address of "sys_call_table" can be found in
"/boot/System.map" or "/proc/kallsyms".

Since v5.7, the manual symbol lookup is not guaranteed to work because
control-flow integrity (or control-flow enforcement [9][10]) was added
[11] for x86, although it has been disabled since v5.11 [12][13]. To make sure
the manual symbol lookup works, it is only used up to v5.4.

Reference:
[1] https://github.com/torvalds/linux/commit/8dbec27a242cd3e2816eeb98d3237b9f57cf6232
[2] https://outflux.net/blog/archives/2019/11/14/security-things-in-linux-v5-3/
[3] https://patchwork.kernel.org/project/linux-kbuild/patch/20200903203053.3411268-3-samitolvanen@google.com/
[4] https://stackoverflow.com/questions/58512430/how-to-write-to-protected-pages-in-the-linux-kernel
[5] https://lore.kernel.org/bpf/20201120231441.29911-21-ebiederm@xmission.com/
[6] https://lore.kernel.org/bpf/87blj83ysq.fsf@x220.int.ebiederm.org/
[7] https://github.com/torvalds/linux/commit/0bd476e6c67190b5eb7b6e105c8db8ff61103281
[8] https://github.com/torvalds/linux/commit/8f27766a883149926e7c1f69d9f1d8f68efcd65f
[9] https://lore.kernel.org/lkml/20200204171425.28073-1-yu-cheng.yu@intel.com/
[10] https://lore.kernel.org/linux-doc/20201110162211.9207-1-yu-cheng.yu@intel.com/T/
[11] https://github.com/torvalds/linux/commit/5790921bc18b1eb5c0c61371e31114fd4c4b0154
[12] https://github.com/torvalds/linux/commit/20bf2b378729c4a0366a53e2018a0b70ace94bcd
[13] https://lore.kernel.org/bpf/20210128123842.c9e33949e62f504b84bfadf5@gmail.com/
linD026 3 năm trước cách đây
mục cha
commit
cccc98ab2c
2 tập tin đã thay đổi với 193 bổ sung12 xóa
  1. 99 12
      examples/syscall.c
  2. 94 0
      lkmpg.tex

+ 99 - 12
examples/syscall.c

@@ -23,17 +23,44 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 
+
+/* The way we access "sys_call_table" varies as kernel internals change.
+ * - ver <= 5.4 : manual symbol lookup
+ * - 5.4 < ver < 5.7 : kallsyms_lookup_name
+ * - 5.7 <= ver : Kprobes or specific kernel module parameter
+ */
+
 /* The in-kernel calls to the ksys_close() syscall were removed in Linux v5.11+.
  */
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0))
-#include <linux/syscalls.h> /* ksys_close() wrapper for backward compatibility */
-#define close_fd ksys_close
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0))
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(5, 4, 0)
+#define HAVE_KSYS_CLOSE 1
+#include <linux/syscalls.h> /* For ksys_close() */
 #else
-#include <linux/fdtable.h> /* For close_fd */
+#include <linux/kallsyms.h> /* For kallsyms_lookup_name */
+#endif
+
+#else
+
+#if defined(CONFIG_KPROBES)
+#define HAVE_KPROBES 1
+#include <linux/kprobes.h>
+#else
+#define HAVE_PARAM 1
+#include <linux/kallsyms.h> /* For sprint_symbol */
+/* The address of the sys_call_table, which can be obtained by looking it up
+ * in "/boot/System.map" or "/proc/kallsyms". When the kernel version is v5.7+
+ * and CONFIG_KPROBES is not enabled, this parameter must be supplied;
+ * otherwise the table lookup fails and returns NULL.
+ */
+static unsigned long sym = 0;
+module_param(sym, ulong, 0644);
+#endif
+
 #endif
 
 unsigned long **sys_call_table;
-unsigned long original_cr0;
 
 /* UID we want to spy on - will be filled from the command line. */
 static int uid;
@@ -83,19 +110,81 @@ asmlinkage int our_sys_open(const char *filename, int flags, int mode)
 
 static unsigned long **aquire_sys_call_table(void)
 {
+#ifdef HAVE_KSYS_CLOSE
     unsigned long int offset = PAGE_OFFSET;
     unsigned long **sct;
 
     while (offset < ULLONG_MAX) {
         sct = (unsigned long **) offset;
 
-        if (sct[__NR_close] == (unsigned long *) close_fd)
+        if (sct[__NR_close] == (unsigned long *) ksys_close)
             return sct;
 
         offset += sizeof(void *);
     }
 
     return NULL;
+#endif
+
+#ifdef HAVE_PARAM
+    const char sct_name[15] = "sys_call_table";
+    char symbol[40] = {0};
+
+    if (sym == 0) {
+        pr_alert(
+            "For Linux v5.7+, Kprobes is the preferable way to get "
+            "symbol.\n");
+        pr_info(
+            "If Kprobes is absent, you have to specify the address of "
+            "sys_call_table symbol\n");
+        pr_info(
+            "by /boot/System.map or /proc/kallsyms, which contains all the "
+            "symbol addresses, into sym parameter.\n");
+        return NULL;
+    }
+    sprint_symbol(symbol, sym);
+    if (!strncmp(sct_name, symbol, sizeof(sct_name) - 1))
+        return (unsigned long **) sym;
+
+    return NULL;
+#endif
+
+#ifdef HAVE_KPROBES
+    unsigned long (*kallsyms_lookup_name)(const char *name);
+    struct kprobe kp = {
+        .symbol_name = "kallsyms_lookup_name",
+    };
+
+    if (register_kprobe(&kp) < 0)
+        return NULL;
+    kallsyms_lookup_name = (unsigned long (*)(const char *name)) kp.addr;
+    unregister_kprobe(&kp);
+#endif
+
+    return (unsigned long **) kallsyms_lookup_name("sys_call_table");
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0)
+static inline void __write_cr0(unsigned long cr0)
+{
+    asm volatile("mov %0,%%cr0" : "+r"(cr0) : : "memory");
+}
+#else
+#define __write_cr0 write_cr0
+#endif
+
+static void enable_write_protection(void)
+{
+    unsigned long cr0 = read_cr0();
+    set_bit(16, &cr0);
+    __write_cr0(cr0);
+}
+
+static void disable_write_protection(void)
+{
+    unsigned long cr0 = read_cr0();
+    clear_bit(16, &cr0);
+    __write_cr0(cr0);
 }
 
 static int __init syscall_start(void)
@@ -103,9 +192,7 @@ static int __init syscall_start(void)
     if (!(sys_call_table = aquire_sys_call_table()))
         return -1;
 
-    original_cr0 = read_cr0();
-
-    write_cr0(original_cr0 & ~0x00010000);
+    disable_write_protection();
 
     /* keep track of the original open function */
     original_call = (void *) sys_call_table[__NR_open];
@@ -113,7 +200,7 @@ static int __init syscall_start(void)
     /* use our open function instead */
     sys_call_table[__NR_open] = (unsigned long *) our_sys_open;
 
-    write_cr0(original_cr0);
+    enable_write_protection();
 
     pr_info("Spying on UID:%d\n", uid);
 
@@ -133,9 +220,9 @@ static void __exit syscall_end(void)
         pr_alert("an unstable state.\n");
     }
 
-    write_cr0(original_cr0 & ~0x00010000);
+    disable_write_protection();
     sys_call_table[__NR_open] = (unsigned long *) original_call;
-    write_cr0(original_cr0);
+    enable_write_protection();
 
     msleep(2000);
 }

+ 94 - 0
lkmpg.tex

@@ -1204,6 +1204,100 @@ If you want to read this code, it is at the source file \verb|arch/$(architectur
 So, if we want to change the way a certain system call works, what we need to do is to write our own function to implement it (usually by adding a bit of our own code, and then calling the original function) and then change the pointer at \cpp|sys_call_table| to point to our function.
 Because we might be removed later and we don't want to leave the system in an unstable state, it's important for \cpp|cleanup_module| to restore the table to its original state.
 
+To modify the content of \cpp|sys_call_table|, we need to consider the control register.
+A control register is a processor register that changes or controls the general behavior of the CPU.
+For x86 architecture, the \verb|cr0| register has various control flags that modify the basic operation of the processor.
+The \verb|WP| flag in \verb|cr0| stands for write protection.
+Once the \verb|WP| flag is set, the processor disallows further write attempts to the read-only sections.
+Therefore, we must disable the \verb|WP| flag before modifying \cpp|sys_call_table|.
+Since Linux v5.3, the \cpp|write_cr0| function can no longer be used for this, because the sensitive \verb|cr0| bits are pinned as a security measure: otherwise, an attacker could write to the CPU control registers to disable CPU protections such as write protection.
+As a result, we have to provide a custom assembly routine to bypass it.
+
+However, the \cpp|sys_call_table| symbol is unexported to prevent misuse.
+There are still a few ways to get the symbol: manual symbol lookup and \cpp|kallsyms_lookup_name|.
+Here we use both, depending on the kernel version.
+
+Manual symbol lookup is not guaranteed to work under \textit{control-flow integrity}, a technique that prevents an attacker from redirecting execution by making sure that indirect calls go to the expected addresses and that return addresses are not changed.
+Since Linux v5.7, the kernel has carried the \textit{control-flow enforcement} (CET) patch series for x86, and some configurations of GCC, like GCC versions 9 and 10 in Ubuntu, enable CET (the \verb|-fcf-protection| option) by default.
+Using that GCC to compile the kernel with retpoline off may result in CET being enabled in the kernel.
+You can use the following command to check whether the \verb|-fcf-protection| option is enabled:
+\begin{verbatim}
+$ gcc -v -Q -O2 --help=target | grep protection
+Using built-in specs.
+COLLECT_GCC=gcc
+COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/9/lto-wrapper
+...
+gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04)
+COLLECT_GCC_OPTIONS='-v' '-Q' '-O2' '--help=target' '-mtune=generic' '-march=x86-64'
+ /usr/lib/gcc/x86_64-linux-gnu/9/cc1 -v ... -fcf-protection ...
+ GNU C17 (Ubuntu 9.3.0-17ubuntu1~20.04) version 9.3.0 (x86_64-linux-gnu)
+...
+\end{verbatim}
+But CET should not be enabled in the kernel, as it may break Kprobes and bpf.
+Consequently, CET has been disabled since v5.11.
+To guarantee that the manual symbol lookup works, we only use it up to v5.4.
+
+Unfortunately, since Linux v5.7 \cpp|kallsyms_lookup_name| is also unexported, so a certain trick is needed to get the address of \cpp|kallsyms_lookup_name|.
+If \cpp|CONFIG_KPROBES| is enabled, we can retrieve function addresses by means of Kprobes, which dynamically break into a specific kernel routine.
+Kprobes inserts a breakpoint at the entry of a function by replacing the first bytes of the probed instruction.
+When a CPU hits the breakpoint, registers are stored, and control passes to Kprobes.
+It passes the addresses of the saved registers and the Kprobe struct to the handler you defined, then executes it.
+Kprobes can be registered by symbol name or address.
+When a symbol name is given, the address is resolved by the kernel.
+
+Otherwise, pass the address of \cpp|sys_call_table|, taken from \verb|/proc/kallsyms| or \verb|/boot/System.map|, in the \cpp|sym| parameter.
+The following is a sample usage with \verb|/proc/kallsyms|:
+\begin{verbatim}
+$ sudo grep sys_call_table /proc/kallsyms
+ffffffff82000280 R x32_sys_call_table
+ffffffff820013a0 R sys_call_table
+ffffffff820023e0 R ia32_sys_call_table
+$ sudo insmod syscall.ko sym=0xffffffff820013a0
+\end{verbatim}
+
+When using the address from \verb|/boot/System.map|, be careful about \verb|KASLR| (Kernel Address Space Layout Randomization).
+\verb|KASLR| may randomize the addresses of kernel code and data at every boot, so that the static addresses listed in \verb|/boot/System.map| are offset by some entropy.
+The purpose of \verb|KASLR| is to protect the kernel space from attackers.
+Without \verb|KASLR|, an attacker can easily find the target at a fixed address.
+The attacker can then use return-oriented programming to execute malicious code or obtain the target data through a tampered pointer.
+\verb|KASLR| mitigates these kinds of attacks because the attacker cannot immediately know the target address, but a brute-force attack can still work.
+If the address of a symbol in \verb|/proc/kallsyms| is different from the address in \verb|/boot/System.map|, \verb|KASLR| is enabled in the kernel your system is running.
+\begin{verbatim}
+$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub
+GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
+$ sudo grep sys_call_table /boot/System.map-$(uname -r)
+ffffffff82000300 R sys_call_table
+$ sudo grep sys_call_table /proc/kallsyms
+ffffffff820013a0 R sys_call_table
+# Reboot
+$ sudo grep sys_call_table /boot/System.map-$(uname -r)
+ffffffff82000300 R sys_call_table
+$ sudo grep sys_call_table /proc/kallsyms 
+ffffffff86400300 R sys_call_table
+\end{verbatim}
+If \verb|KASLR| is enabled, we have to look up the address in \verb|/proc/kallsyms| again each time we reboot the machine.
+In order to use the address from \verb|/boot/System.map|, make sure that \verb|KASLR| is disabled.
+You can add the \verb|nokaslr| boot parameter to disable \verb|KASLR| from the next boot:
+\begin{verbatim}
+$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub
+GRUB_CMDLINE_LINUX_DEFAULT="quiet splash"
+$ sudo perl -i -pe 'm/quiet/ and s//quiet nokaslr/' /etc/default/grub
+$ grep quiet /etc/default/grub
+GRUB_CMDLINE_LINUX_DEFAULT="quiet nokaslr splash"
+$ sudo update-grub
+\end{verbatim}
+
+For more information, check out the following:
+
+\begin{itemize}
+ \item \href{https://lwn.net/Articles/804849/}{Cook: Security things in Linux v5.3}
+ \item \href{https://lwn.net/Articles/12211/}{Unexporting the system call table}
+ \item \href{https://lwn.net/Articles/810077/}{Control-flow integrity for the kernel}
+ \item \href{https://lwn.net/Articles/813350/}{Unexporting kallsyms\_lookup\_name()}
+ \item \href{https://www.kernel.org/doc/Documentation/kprobes.txt}{Kernel Probes (Kprobes)}
+ \item \href{https://lwn.net/Articles/569635/}{Kernel address space layout randomization}
+\end{itemize}
+
 The source code here is an example of such a kernel module.
 We want to ``spy'' on a certain user, and to \cpp|pr_info()| a message whenever that user opens a file.
 Towards this end, we replace the system call to open a file with our own function, called \cpp|our_sys_open|.