From 02101c45ec5b19d607af7372680f5259050b4e9c Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Wed, 8 Aug 2018 17:22:16 -0400
Subject: [PATCH 01/12] x86/asm: Optimize memcpy_flushcache()

I use memcpy_flushcache() in my persistent memory driver for metadata
updates. There are many 8-byte and 16-byte updates, and it turns out that
the overhead of memcpy_flushcache() causes a 2% performance degradation
compared to the "movnti" instruction explicitly coded using inline
assembler. The tests were done on a Skylake processor with persistent
memory emulated using the "memmap" kernel parameter. dd was used to copy
data to the dm-writecache target.

This patch recognizes memcpy_flushcache() calls with a constant short
length and turns them into inline assembler - so that I don't have to use
inline assembler in the driver.

Signed-off-by: Mikulas Patocka
Cc: Dan Williams
Cc: Linus Torvalds
Cc: Mike Snitzer
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: device-mapper development
Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1808081720460.24747@file01.intranet.prod.int.rdu2.redhat.com
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/string_64.h | 20 +++++++++++++++++++-
 arch/x86/lib/usercopy_64.c       |  4 ++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index d33f92b9fa22..7ad41bfcc16c 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -149,7 +149,25 @@ memcpy_mcsafe(void *dst, const void *src, size_t cnt)
 
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 #define __HAVE_ARCH_MEMCPY_FLUSHCACHE 1
-void memcpy_flushcache(void *dst, const void *src, size_t cnt);
+void __memcpy_flushcache(void *dst, const void *src, size_t cnt);
+static __always_inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
+{
+	if (__builtin_constant_p(cnt)) {
+		switch (cnt) {
+		case 4:
+			asm ("movntil %1, %0" : "=m"(*(u32 *)dst) : "r"(*(u32 *)src));
+			return;
+		case 8:
+			asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+			return;
+		case 16:
+			asm ("movntiq %1, %0" : "=m"(*(u64 *)dst) : "r"(*(u64 *)src));
+			asm ("movntiq %1, %0" : "=m"(*(u64 *)(dst + 8)) : "r"(*(u64 *)(src + 8)));
+			return;
+		}
+	}
+	__memcpy_flushcache(dst, src, cnt);
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 9c5606d88f61..c50a1d815a37 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -153,7 +153,7 @@ long __copy_user_flushcache(void *dst, const void __user *src, unsigned size)
 	return rc;
 }
 
-void memcpy_flushcache(void *_dst, const void *_src, size_t size)
+void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
 {
 	unsigned long dest = (unsigned long) _dst;
 	unsigned long source = (unsigned long) _src;
@@ -216,7 +216,7 @@ void memcpy_flushcache(void *_dst, const void *_src, size_t size)
 		clean_cache_range((void *) dest, size);
 	}
 }
-EXPORT_SYMBOL_GPL(memcpy_flushcache);
+EXPORT_SYMBOL_GPL(__memcpy_flushcache);
 
 void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
 		size_t len)

From c808c09b527cd60d9a0d53799935f75e2452174d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 28 Sep 2018 10:33:05 +0200 Subject: [PATCH 02/12] x86/asm: Use CC_SET()/CC_OUT() in __cmpxchg_double() Replace open-coded use of the SETcc instruction with CC_SET()/CC_OUT() in __cmpxchg_double(). Signed-off-by: Uros Bizjak Signed-off-by: Borislav Petkov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Link: https://lkml.kernel.org/r/CAFULd4YdvwwhXWHqqPsGk5+TLG71ozgSscTZNsqmrm+Jzg941w@mail.gmail.com --- arch/x86/include/asm/cmpxchg.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a55d79b233d3..bfb85e5844ab 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -242,10 +242,12 @@ extern void __add_wrong_size(void) BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ - asm volatile(pfx "cmpxchg%c4b %2; sete %0" \ - : "=a" (__ret), "+d" (__old2), \ - "+m" (*(p1)), "+m" (*(p2)) \ - : "i" (2 * sizeof(long)), "a" (__old1), \ + asm volatile(pfx "cmpxchg%c5b %1" \ + CC_SET(e) \ + : CC_OUT(e) (__ret), \ + "+m" (*(p1)), "+m" (*(p2)), \ + "+a" (__old1), "+d" (__old2) \ + : "i" (2 * sizeof(long)), \ "b" (__new1), "c" (__new2)); \ __ret; \ }) From 07e1d88adaaeab247b300926f78cc3f950dbeda3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 18 Sep 2018 16:08:52 -0700 Subject: [PATCH 03/12] x86/fsgsbase/64: Fix ptrace() to read the FS/GS base accurately On 64-bit kernels ptrace can read the FS/GS base using the register access APIs (PTRACE_PEEKUSER, etc.) or PTRACE_ARCH_PRCTL. Make both of these mechanisms return the actual FS/GS base. This will improve debuggability by providing the correct information to ptracer such as GDB. [ chang: Rebased and revised patch description. ] [ mingo: Revised the changelog some more. ] Signed-off-by: Andy Lutomirski Signed-off-by: Chang S. Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537312139-5580-2-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 62 +++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e2ee403865eb..3acbf45cb7fb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "tls.h" @@ -342,6 +343,49 @@ static int set_segment_reg(struct task_struct *task, return 0; } +static unsigned long task_seg_base(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. 
+ */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -435,18 +479,16 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.fsbase; + if (task->thread.fsindex == 0) + return task->thread.fsbase; + else + return task_seg_base(task, task->thread.fsindex); } case offsetof(struct user_regs_struct, gs_base): { - /* - * XXX: This will not behave as expected if called on - * current or if fsindex != 0. - */ - return task->thread.gsbase; + if (task->thread.gsindex == 0) + return task->thread.gsbase; + else + return task_seg_base(task, task->thread.gsindex); } #endif } From b1378a561fd16afdd96ef0bc912b1bcd2b85a68e Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:53 -0700 Subject: [PATCH 04/12] x86/fsgsbase/64: Introduce FS/GS base helper functions Introduce FS/GS base access functionality via , not yet used by anything directly. Factor out task_seg_base() from x86/ptrace.c and rename it to x86_fsgsbase_read_task() to make it part of the new helpers. This will allow us to enhance FSGSBASE support and eventually enable the FSBASE/GSBASE instructions. An "inactive" GS base refers to a base saved at kernel entry and being part of an inactive, non-running/stopped user-task. (The typical ptrace model.) Here are the new functions: x86_fsbase_read_task() x86_gsbase_read_task() x86_fsbase_write_task() x86_gsbase_write_task() x86_fsbase_read_cpu() x86_fsbase_write_cpu() x86_gsbase_read_cpu_inactive() x86_gsbase_write_cpu_inactive() As an advantage of the unified namespace we can now see all FS/GSBASE API use in the kernel via the following 'git grep' pattern: $ git grep x86_.*sbase [ mingo: Wrote new changelog. ] Based-on-code-from: Andy Lutomirski Suggested-by: Ingo Molnar Signed-off-by: Chang S. Bae Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537312139-5580-3-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fsgsbase.h | 50 +++++++++++++ arch/x86/kernel/process_64.c | 124 ++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 51 ++----------- 3 files changed, 179 insertions(+), 46 deletions(-) create mode 100644 arch/x86/include/asm/fsgsbase.h diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h new file mode 100644 index 000000000000..1ab465ee23fe --- /dev/null +++ b/arch/x86/include/asm/fsgsbase.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_FSGSBASE_H +#define _ASM_FSGSBASE_H 1 + +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_X86_64 + +#include + +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector); + +/* + * Read/write a task's fsbase or gsbase. This returns the value that + * the FS/GS base would have (if the task were to be resumed). These + * work on current or on a different non-running task. 
+ */ +unsigned long x86_fsbase_read_task(struct task_struct *task); +unsigned long x86_gsbase_read_task(struct task_struct *task); +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); + +/* Helper functions for reading/writing FS/GS base */ + +static inline unsigned long x86_fsbase_read_cpu(void) +{ + unsigned long fsbase; + + rdmsrl(MSR_FS_BASE, fsbase); + return fsbase; +} + +void x86_fsbase_write_cpu(unsigned long fsbase); + +static inline unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase); + +#endif /* CONFIG_X86_64 */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_FSGSBASE_H */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ea5ea850348d..2a53ff8d1baf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -54,6 +54,7 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include @@ -286,6 +287,129 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) +{ + unsigned short idx = selector >> 3; + unsigned long base; + + if (likely((selector & SEGMENT_TI_MASK) == 0)) { + if (unlikely(idx >= GDT_ENTRIES)) + return 0; + + /* + * There are no user segments in the GDT with nonzero bases + * other than the TLS segments. + */ + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return 0; + + idx -= GDT_ENTRY_TLS_MIN; + base = get_desc_base(&task->thread.tls_array[idx]); + } else { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + /* + * If performance here mattered, we could protect the LDT + * with RCU. This is a slow path, though, so we can just + * take the mutex. + */ + mutex_lock(&task->mm->context.lock); + ldt = task->mm->context.ldt; + if (unlikely(idx >= ldt->nr_entries)) + base = 0; + else + base = get_desc_base(ldt->entries + idx); + mutex_unlock(&task->mm->context.lock); +#else + base = 0; +#endif + } + + return base; +} + +void x86_fsbase_write_cpu(unsigned long fsbase) +{ + /* + * Set the selector to 0 as a notion, that the segment base is + * overwritten, which will be checked for skipping the segment load + * during context switch. + */ + loadseg(FS, 0); + wrmsrl(MSR_FS_BASE, fsbase); +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + /* Set the selector to 0 for the same reason as %fs above. 
*/ + loadseg(GS, 0); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); +} + +unsigned long x86_fsbase_read_task(struct task_struct *task) +{ + unsigned long fsbase; + + if (task == current) + fsbase = x86_fsbase_read_cpu(); + else if (task->thread.fsindex == 0) + fsbase = task->thread.fsbase; + else + fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); + + return fsbase; +} + +unsigned long x86_gsbase_read_task(struct task_struct *task) +{ + unsigned long gsbase; + + if (task == current) + gsbase = x86_gsbase_read_cpu_inactive(); + else if (task->thread.gsindex == 0) + gsbase = task->thread.gsbase; + else + gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); + + return gsbase; +} + +int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) +{ + /* + * Not strictly needed for %fs, but do it for symmetry + * with %gs + */ + if (unlikely(fsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.fsbase = fsbase; + if (task == current) + x86_fsbase_write_cpu(fsbase); + task->thread.fsindex = 0; + preempt_enable(); + + return 0; +} + +int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) +{ + if (unlikely(gsbase >= TASK_SIZE_MAX)) + return -EPERM; + + preempt_disable(); + task->thread.gsbase = gsbase; + if (task == current) + x86_gsbase_write_cpu_inactive(gsbase); + task->thread.gsindex = 0; + preempt_enable(); + + return 0; +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 3acbf45cb7fb..fbde2a7ce377 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include "tls.h" @@ -343,49 +343,6 @@ static int set_segment_reg(struct task_struct *task, return 0; } -static unsigned long task_seg_base(struct task_struct *task, - unsigned short selector) -{ - unsigned short idx = selector >> 3; - unsigned long base; - - if (likely((selector & SEGMENT_TI_MASK) == 0)) { - if (unlikely(idx >= GDT_ENTRIES)) - return 0; - - /* - * There are no user segments in the GDT with nonzero bases - * other than the TLS segments. - */ - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return 0; - - idx -= GDT_ENTRY_TLS_MIN; - base = get_desc_base(&task->thread.tls_array[idx]); - } else { -#ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; - - /* - * If performance here mattered, we could protect the LDT - * with RCU. This is a slow path, though, so we can just - * take the mutex. 
- */ - mutex_lock(&task->mm->context.lock); - ldt = task->mm->context.ldt; - if (unlikely(idx >= ldt->nr_entries)) - base = 0; - else - base = get_desc_base(ldt->entries + idx); - mutex_unlock(&task->mm->context.lock); -#else - base = 0; -#endif - } - - return base; -} - #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) @@ -482,13 +439,15 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) if (task->thread.fsindex == 0) return task->thread.fsbase; else - return task_seg_base(task, task->thread.fsindex); + return x86_fsgsbase_read_task(task, + task->thread.fsindex); } case offsetof(struct user_regs_struct, gs_base): { if (task->thread.gsindex == 0) return task->thread.gsbase; else - return task_seg_base(task, task->thread.gsindex); + return x86_fsgsbase_read_task(task, + task->thread.gsindex); } #endif } From e696c231bebf5f17fe0c5e465c01511320668054 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:54 -0700 Subject: [PATCH 05/12] x86/fsgsbase/64: Make ptrace use the new FS/GS base helpers Use the new FS/GS base helper functions in in the platform specific ptrace implementation of the following APIs: PTRACE_ARCH_PRCTL, PTRACE_SETREG, PTRACE_GETREG, etc. The fsgsbase code is more abstracted out this way and the FS/GS-update mechanism will be easier to change this way. [ mingo: Wrote new changelog. ] Based-on-code-from: Andy Lutomirski Signed-off-by: Chang S. Bae Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537312139-5580-4-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fsgsbase.h | 3 -- arch/x86/kernel/process_64.c | 49 +++++++-------------------------- arch/x86/kernel/ptrace.c | 27 ++++++------------ 3 files changed, 18 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 1ab465ee23fe..5e9cbcce318a 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -8,9 +8,6 @@ #include -unsigned long x86_fsgsbase_read_task(struct task_struct *task, - unsigned short selector); - /* * Read/write a task's fsbase or gsbase. This returns the value that * the FS/GS base would have (if the task were to be resumed). 
These diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2a53ff8d1baf..e5fb0c3dee4d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -287,8 +287,8 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } -unsigned long x86_fsgsbase_read_task(struct task_struct *task, - unsigned short selector) +static unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; @@ -751,54 +751,25 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; - int doit = task == current; - int cpu; switch (option) { - case ARCH_SET_GS: - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.gsindex = 0; - task->thread.gsbase = arg2; - if (doit) { - load_gs_index(0); - ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2); - } - put_cpu(); + case ARCH_SET_GS: { + ret = x86_gsbase_write_task(task, arg2); break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (arg2 >= TASK_SIZE_MAX) - return -EPERM; - cpu = get_cpu(); - task->thread.fsindex = 0; - task->thread.fsbase = arg2; - if (doit) { - /* set the selector to 0 to not confuse __switch_to */ - loadsegment(fs, 0); - ret = wrmsrl_safe(MSR_FS_BASE, arg2); - } - put_cpu(); + } + case ARCH_SET_FS: { + ret = x86_fsbase_write_task(task, arg2); break; + } case ARCH_GET_FS: { - unsigned long base; + unsigned long base = x86_fsbase_read_task(task); - if (doit) - rdmsrl(MSR_FS_BASE, base); - else - base = task->thread.fsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { - unsigned long base; + unsigned long base = x86_gsbase_read_task(task); - if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); - else - base = task->thread.gsbase; ret = put_user(base, (unsigned long __user *)arg2); break; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fbde2a7ce377..d8f49c7384a3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -397,12 +397,11 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; /* - * When changing the segment base, use do_arch_prctl_64 - * to set either thread.fs or thread.fsindex and the - * corresponding GDT slot. + * When changing the FS base, use the same + * mechanism as for do_arch_prctl_64(). 
*/ if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + return x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): /* @@ -411,7 +410,7 @@ static int putreg(struct task_struct *child, if (value >= TASK_SIZE_MAX) return -EIO; if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + return x86_gsbase_write_task(child, value); return 0; #endif } @@ -435,20 +434,10 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) return get_flags(task); #ifdef CONFIG_X86_64 - case offsetof(struct user_regs_struct, fs_base): { - if (task->thread.fsindex == 0) - return task->thread.fsbase; - else - return x86_fsgsbase_read_task(task, - task->thread.fsindex); - } - case offsetof(struct user_regs_struct, gs_base): { - if (task->thread.gsindex == 0) - return task->thread.gsbase; - else - return x86_fsgsbase_read_task(task, - task->thread.gsindex); - } + case offsetof(struct user_regs_struct, fs_base): + return x86_fsbase_read_task(task); + case offsetof(struct user_regs_struct, gs_base): + return x86_gsbase_read_task(task); #endif } From 824eea38d239fb2a6027e65e18a5daef23019b00 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:55 -0700 Subject: [PATCH 06/12] x86/fsgsbase/64: Convert the ELF core dump code to the new FSGSBASE helpers Replace open-coded rdmsr()'s with their API counterparts. No change in functionality intended. [ mingo: Wrote new changelog. ] Based-on-code-from: Andy Lutomirski Signed-off-by: Chang S. Bae Reviewed-by: Andi Kleen Reviewed-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Link: http://lkml.kernel.org/r/1537312139-5580-5-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/elf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 0d157d2a1e2a..1527ec351036 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -10,6 +10,7 @@ #include #include #include +#include typedef unsigned long elf_greg_t; @@ -205,7 +206,6 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ - unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -228,8 +228,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ - rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ + (pr_reg)[21] = x86_fsbase_read_cpu(); \ + (pr_reg)[22] = x86_gsbase_read_cpu_inactive(); \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ From f4550b52e495e1b634d1f2c1004bcea5dc3321ea Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:56 -0700 Subject: [PATCH 07/12] x86/fsgsbase/64: Factor out FS/GS segment loading from __switch_to() Instead of open coding the calls to load_seg_legacy(), introduce x86_fsgsbase_load() to load FS/GS segments. This makes it more explicit that this is part of FSGSBASE functionality, and the new helper can be updated when FSGSBASE instructions are enabled. [ mingo: Wrote new changelog. ] Signed-off-by: Chang S. 
Bae Reviewed-by: Andi Kleen Reviewed-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Link: http://lkml.kernel.org/r/1537312139-5580-6-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e5fb0c3dee4d..d6674a425714 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -287,6 +287,15 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } } +static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, + struct thread_struct *next) +{ + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); +} + static unsigned long x86_fsgsbase_read_task(struct task_struct *task, unsigned short selector) { @@ -597,10 +606,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + x86_fsgsbase_load(prev, next); switch_fpu_finish(next_fpu, cpu); From c4755613a1339ea77dbb15de75c9f74217209265 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:57 -0700 Subject: [PATCH 08/12] x86/segments/64: Rename the GDT PER_CPU entry to CPU_NUMBER The old 'per CPU' naming was misleading: 64-bit kernels don't use this GDT entry for per CPU data, but to store the CPU (and node) ID. [ mingo: Wrote new changelog. ] Suggested-by: H. Peter Anvin Signed-off-by: Chang S. 
Bae Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Link: http://lkml.kernel.org/r/1537312139-5580-7-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/include/asm/segment.h | 5 ++--- arch/x86/include/asm/vgtod.h | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b8b556dbb12..0b114aafcedc 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -359,7 +359,7 @@ static void vgetcpu_cpu_init(void *arg) d.p = 1; /* Present */ d.d = 1; /* 32-bit */ - write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S); } static int vgetcpu_online(unsigned int cpu) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index e293c122d0d5..e3e788ea52e5 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -186,8 +186,7 @@ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -/* Abused to load per CPU data from limit */ -#define GDT_ENTRY_PER_CPU 15 +#define GDT_ENTRY_CPU_NUMBER 15 /* * Number of entries in the GDT table: @@ -207,7 +206,7 @@ #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) +#define __CPU_NUMBER_SEG (GDT_ENTRY_CPU_NUMBER*8 + 3) #endif diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 53748541c487..4e81ea920722 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -86,9 +86,9 @@ static inline unsigned int __getcpu(void) unsigned int p; /* - * Load per CPU data from GDT. LSL is faster than RDTSCP and - * works on all CPUs. This is volatile so that it orders - * correctly wrt barrier() and to keep gcc from cleverly + * Load CPU (and node) number from GDT. LSL is faster than RDTSCP + * and works on all CPUs. This is volatile so that it orders + * correctly with respect to barrier() and to keep GCC from cleverly * hoisting it out of the calling function. * * If RDPID is available, use it. @@ -96,7 +96,7 @@ static inline unsigned int __getcpu(void) alternative_io ("lsl %[seg],%[p]", ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); + [p] "=a" (p), [seg] "r" (__CPU_NUMBER_SEG)); return p; } From ffebbaedc8616cffe648202e364dce6a045d65a2 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:58 -0700 Subject: [PATCH 09/12] x86/vdso: Introduce helper functions for CPU and node number Clean up the CPU/node number related code a bit, to make it more apparent how we are encoding/extracting the CPU and node fields from the segment limit. No change in functionality intended. [ mingo: Wrote new changelog. ] Suggested-by: Andy Lutomirski Suggested-by: Thomas Gleixner Signed-off-by: Chang S. Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Link: http://lkml.kernel.org/r/1537312139-5580-8-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vgetcpu.c | 9 +------- arch/x86/entry/vdso/vma.c | 21 +++++++---------- arch/x86/include/asm/segment.h | 41 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/vgtod.h | 26 --------------------- 4 files changed, 50 insertions(+), 47 deletions(-) diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index 8ec3d1f4ce9a..de78fc9cd963 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -13,14 +13,7 @@ notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - unsigned int p; - - p = __getcpu(); - - if (cpu) - *cpu = p & VGETCPU_CPU_MASK; - if (node) - *node = p >> 12; + vdso_read_cpu_node(cpu, node); return 0; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 0b114aafcedc..39b5584c5808 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -339,20 +339,15 @@ static void vgetcpu_cpu_init(void *arg) { int cpu = smp_processor_id(); struct desc_struct d = { }; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (static_cpu_has(X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); + unsigned long cpudata = vdso_encode_cpu_node(cpu, cpu_to_node(cpu)); + + if (static_cpu_has(X86_FEATURE_RDTSCP)) + write_rdtscp_aux(cpudata); + + /* Store CPU and node number in limit */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; - /* - * Store cpu number in limit so that it can be loaded - * quickly in user space in vgetcpu. (12 bits for the CPU - * and 8 bits for the node) - */ - d.limit0 = cpu | ((node & 0xf) << 12); - d.limit1 = node >> 4; d.type = 5; /* RO data, expand down, accessed */ d.dpl = 3; /* Visible to user code */ d.s = 1; /* Not a system segment */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index e3e788ea52e5..4d1f6cc62e13 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -224,6 +224,47 @@ #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) +#ifdef CONFIG_X86_64 + +/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ +#define VDSO_CPU_SIZE 12 +#define VDSO_CPU_MASK 0xfff + +#ifndef __ASSEMBLY__ + +/* Helper functions to store/load CPU and node numbers */ + +static inline unsigned long vdso_encode_cpu_node(int cpu, unsigned long node) +{ + return ((node << VDSO_CPU_SIZE) | cpu); +} + +static inline void vdso_read_cpu_node(unsigned *cpu, unsigned *node) +{ + unsigned int p; + + /* + * Load CPU and node number from GDT. LSL is faster than RDTSCP + * and works on all CPUs. This is volatile so that it orders + * correctly with respect to barrier() and to keep GCC from cleverly + * hoisting it out of the calling function. + * + * If RDPID is available, use it. 
+ */ + alternative_io ("lsl %[seg],%[p]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__CPU_NUMBER_SEG)); + + if (cpu) + *cpu = (p & VDSO_CPU_MASK); + if (node) + *node = (p >> VDSO_CPU_SIZE); +} + +#endif /* !__ASSEMBLY__ */ +#endif /* CONFIG_X86_64 */ + #ifdef __KERNEL__ /* diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 4e81ea920722..056a61c8c5c7 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -77,30 +77,4 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s) ++s->seq; } -#ifdef CONFIG_X86_64 - -#define VGETCPU_CPU_MASK 0xfff - -static inline unsigned int __getcpu(void) -{ - unsigned int p; - - /* - * Load CPU (and node) number from GDT. LSL is faster than RDTSCP - * and works on all CPUs. This is volatile so that it orders - * correctly with respect to barrier() and to keep GCC from cleverly - * hoisting it out of the calling function. - * - * If RDPID is available, use it. - */ - alternative_io ("lsl %[seg],%[p]", - ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ - X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__CPU_NUMBER_SEG)); - - return p; -} - -#endif /* CONFIG_X86_64 */ - #endif /* _ASM_X86_VGTOD_H */ From b2e2ba578e016a091eb31565849990fe68c7c599 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 18 Sep 2018 16:08:59 -0700 Subject: [PATCH 10/12] x86/vdso: Initialize the CPU/node NR segment descriptor earlier Currently the CPU/node NR segment descriptor (GDT_ENTRY_CPU_NUMBER) is initialized relatively late during CPU init, from the vCPU code, which has a number of disadvantages, such as hotplug CPU notifiers and SMP cross-calls. Instead just initialize it much earlier, directly in cpu_init(). This reduces complexity and increases robustness. [ mingo: Wrote new changelog. ] Suggested-by: H. Peter Anvin Suggested-by: Andy Lutomirski Signed-off-by: Chang S. 
Bae Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1537312139-5580-9-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 33 +-------------------------------- arch/x86/kernel/cpu/common.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 39b5584c5808..3f9d43f26f63 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -332,35 +332,6 @@ static __init int vdso_setup(char *s) return 0; } __setup("vdso=", vdso_setup); -#endif - -#ifdef CONFIG_X86_64 -static void vgetcpu_cpu_init(void *arg) -{ - int cpu = smp_processor_id(); - struct desc_struct d = { }; - unsigned long cpudata = vdso_encode_cpu_node(cpu, cpu_to_node(cpu)); - - if (static_cpu_has(X86_FEATURE_RDTSCP)) - write_rdtscp_aux(cpudata); - - /* Store CPU and node number in limit */ - d.limit0 = cpudata; - d.limit1 = cpudata >> 16; - - d.type = 5; /* RO data, expand down, accessed */ - d.dpl = 3; /* Visible to user code */ - d.s = 1; /* Not a system segment */ - d.p = 1; /* Present */ - d.d = 1; /* 32-bit */ - - write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S); -} - -static int vgetcpu_online(unsigned int cpu) -{ - return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); -} static int __init init_vdso(void) { @@ -370,9 +341,7 @@ static int __init init_vdso(void) init_vdso_image(&vdso_image_x32); #endif - /* notifier priority > KVM */ - return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, - "x86/vdso/vma:online", vgetcpu_online, NULL); + return 0; } subsys_initcall(init_vdso); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 44c4ef3d989b..a148d18a1ef0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1669,6 +1669,29 @@ static void wait_for_master_cpu(int cpu) #endif } +#ifdef CONFIG_X86_64 +static void setup_getcpu(int cpu) +{ + unsigned long cpudata = vdso_encode_cpu_node(cpu, early_cpu_to_node(cpu)); + struct desc_struct d = { }; + + if (static_cpu_has(X86_FEATURE_RDTSCP)) + write_rdtscp_aux(cpudata); + + /* Store CPU and node number in limit. */ + d.limit0 = cpudata; + d.limit1 = cpudata >> 16; + + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S); +} +#endif + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1706,6 +1729,7 @@ void cpu_init(void) early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif + setup_getcpu(cpu); me = current; From 22245bdf0ad805d6c29f82b6d5e977ee94bb2166 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Oct 2018 10:41:59 +0200 Subject: [PATCH 11/12] x86/segments: Introduce the 'CPUNODE' naming to better document the segment limit CPU/node NR trick We have a special segment descriptor entry in the GDT, whose sole purpose is to encode the CPU and node numbers in its limit (size) field. 
There are user-space instructions that allow the reading of the limit field, which gives us a really fast way to read the CPU and node IDs from the vDSO for example. But the naming of related functionality does not make this clear, at all: VDSO_CPU_SIZE VDSO_CPU_MASK __CPU_NUMBER_SEG GDT_ENTRY_CPU_NUMBER vdso_encode_cpu_node vdso_read_cpu_node There's a number of problems: - The 'VDSO_CPU_SIZE' doesn't really make it clear that these are number of bits, nor does it make it clear which 'CPU' this refers to, i.e. that this is about a GDT entry whose limit encodes the CPU and node number. - Furthermore, the 'CPU_NUMBER' naming is actively misleading as well, because the segment limit encodes not just the CPU number but the node ID as well ... So use a better nomenclature all around: name everything related to this trick as 'CPUNODE', to make it clear that this is something special, and add _BITS to make it clear that these are number of bits, and propagate this to every affected name: VDSO_CPU_SIZE => VDSO_CPUNODE_BITS VDSO_CPU_MASK => VDSO_CPUNODE_MASK __CPU_NUMBER_SEG => __CPUNODE_SEG GDT_ENTRY_CPU_NUMBER => GDT_ENTRY_CPUNODE vdso_encode_cpu_node => vdso_encode_cpunode vdso_read_cpu_node => vdso_read_cpunode This, beyond being less confusing, also makes it easier to grep for all related functionality: $ git grep -i cpunode arch/x86 Also, while at it, fix "return is not a function" style sloppiness in vdso_encode_cpunode(). Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chang S. Bae Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1537312139-5580-2-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vgetcpu.c | 2 +- arch/x86/include/asm/segment.h | 22 +++++++++++----------- arch/x86/kernel/cpu/common.c | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index de78fc9cd963..edd214f5264d 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -13,7 +13,7 @@ notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - vdso_read_cpu_node(cpu, node); + vdso_read_cpunode(cpu, node); return 0; } diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 4d1f6cc62e13..a314087add07 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -186,7 +186,7 @@ #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 -#define GDT_ENTRY_CPU_NUMBER 15 +#define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: @@ -206,7 +206,7 @@ #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER32_DS __USER_DS #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) -#define __CPU_NUMBER_SEG (GDT_ENTRY_CPU_NUMBER*8 + 3) +#define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif @@ -227,24 +227,24 @@ #ifdef CONFIG_X86_64 /* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ -#define VDSO_CPU_SIZE 12 -#define VDSO_CPU_MASK 0xfff +#define VDSO_CPUNODE_BITS 12 +#define VDSO_CPUNODE_MASK 0xfff #ifndef __ASSEMBLY__ /* Helper functions to store/load CPU and node numbers */ -static inline unsigned long vdso_encode_cpu_node(int cpu, unsigned long node) +static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) { - return ((node << VDSO_CPU_SIZE) | cpu); + 
return (node << VDSO_CPUNODE_BITS) | cpu; } -static inline void vdso_read_cpu_node(unsigned *cpu, unsigned *node) +static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { unsigned int p; /* - * Load CPU and node number from GDT. LSL is faster than RDTSCP + * Load CPU and node number from the GDT. LSL is faster than RDTSCP * and works on all CPUs. This is volatile so that it orders * correctly with respect to barrier() and to keep GCC from cleverly * hoisting it out of the calling function. @@ -254,12 +254,12 @@ static inline void vdso_read_cpu_node(unsigned *cpu, unsigned *node) alternative_io ("lsl %[seg],%[p]", ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ X86_FEATURE_RDPID, - [p] "=a" (p), [seg] "r" (__CPU_NUMBER_SEG)); + [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) - *cpu = (p & VDSO_CPU_MASK); + *cpu = (p & VDSO_CPUNODE_MASK); if (node) - *node = (p >> VDSO_CPU_SIZE); + *node = (p >> VDSO_CPUNODE_BITS); } #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a148d18a1ef0..7da587f4af52 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1672,7 +1672,7 @@ static void wait_for_master_cpu(int cpu) #ifdef CONFIG_X86_64 static void setup_getcpu(int cpu) { - unsigned long cpudata = vdso_encode_cpu_node(cpu, early_cpu_to_node(cpu)); + unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; if (static_cpu_has(X86_FEATURE_RDTSCP)) @@ -1688,7 +1688,7 @@ static void setup_getcpu(int cpu) d.p = 1; /* Present */ d.d = 1; /* 32-bit */ - write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPUNODE, &d, DESCTYPE_S); } #endif From ec3a94188df7d28b374868d9a2a0face910e62ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Oct 2018 10:41:59 +0200 Subject: [PATCH 12/12] x86/fsgsbase/64: Clean up various details So: - use 'extern' consistently for APIs - fix weird header guard - clarify code comments - reorder APIs by type Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chang S. Bae Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Markus T Metzger Cc: Peter Zijlstra Cc: Ravi Shankar Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1537312139-5580-2-git-send-email-chang.seok.bae@intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vgetcpu.c | 1 + arch/x86/include/asm/fsgsbase.h | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c index edd214f5264d..f86ab0ae1777 100644 --- a/arch/x86/entry/vdso/vgetcpu.c +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -14,6 +14,7 @@ notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { vdso_read_cpunode(cpu, node); + return 0; } diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index 5e9cbcce318a..eb377b6e9eed 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_FSGSBASE_H -#define _ASM_FSGSBASE_H 1 +#define _ASM_FSGSBASE_H #ifndef __ASSEMBLY__ @@ -9,14 +9,15 @@ #include /* - * Read/write a task's fsbase or gsbase. This returns the value that + * Read/write a task's FSBASE or GSBASE. This returns the value that * the FS/GS base would have (if the task were to be resumed). 
These
- * work on current or on a different non-running task.
+ * work on the current task or on a non-running (typically stopped
+ * ptrace child) task.
  */
-unsigned long x86_fsbase_read_task(struct task_struct *task);
-unsigned long x86_gsbase_read_task(struct task_struct *task);
-int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
-int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+extern unsigned long x86_fsbase_read_task(struct task_struct *task);
+extern unsigned long x86_gsbase_read_task(struct task_struct *task);
+extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
 
 /* Helper functions for reading/writing FS/GS base */
 
@@ -25,20 +26,21 @@ static inline unsigned long x86_fsbase_read_cpu(void)
 	unsigned long fsbase;
 
 	rdmsrl(MSR_FS_BASE, fsbase);
+
 	return fsbase;
 }
 
-void x86_fsbase_write_cpu(unsigned long fsbase);
-
 static inline unsigned long x86_gsbase_read_cpu_inactive(void)
 {
 	unsigned long gsbase;
 
 	rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+
 	return gsbase;
 }
 
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+extern void x86_fsbase_write_cpu(unsigned long fsbase);
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
 
 #endif /* CONFIG_X86_64 */
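
[ Editorial note: the sketch below is not part of the series. It is a
  minimal user-space illustration of the ptrace interface that patches
  03-05 make accurate: reading a stopped child's FS base through
  PTRACE_PEEKUSER at offsetof(struct user_regs_struct, fs_base), the same
  offset that getreg() in arch/x86/kernel/ptrace.c decodes. It assumes an
  x86-64 Linux host; names other than the kernel ones are arbitrary. ]

/*
 * Example only (not part of this series): read a stopped child's FS base
 * via PTRACE_PEEKUSER, the path that the fixed getreg() now answers with
 * the real base even when fsindex != 0.
 */
#include <errno.h>
#include <signal.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: request tracing and stop so the parent can peek. */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);
		_exit(0);
	}

	waitpid(pid, NULL, 0);

	errno = 0;
	long fsbase = ptrace(PTRACE_PEEKUSER, pid,
			     offsetof(struct user_regs_struct, fs_base), NULL);
	if (errno)
		perror("PTRACE_PEEKUSER");
	else
		printf("child FS base: %#lx\n", (unsigned long)fsbase);

	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}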
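
[ Editorial note: a second illustrative sketch, not part of the series,
  showing the CPUNODE segment-limit trick that patches 08-11 document:
  LSL on GDT entry 15 (RPL 3) returns a limit encoding "node << 12 | cpu".
  It assumes an x86-64 kernel that sets up this GDT entry (this series, or
  the earlier GDT_ENTRY_PER_CPU equivalent); the constants mirror
  __CPUNODE_SEG, VDSO_CPUNODE_BITS and VDSO_CPUNODE_MASK. ]

/*
 * Example only: read the CPU/node numbers the way the vDSO does, by
 * loading the segment limit of the CPUNODE GDT entry with LSL.
 */
#include <stdio.h>

int main(void)
{
	unsigned int p;
	unsigned int seg = 15 * 8 + 3;	/* __CPUNODE_SEG: GDT entry 15, RPL 3 */

	/* LSL reads the descriptor's limit field without loading the segment. */
	asm volatile ("lsl %1, %0" : "=r" (p) : "r" (seg));

	printf("cpu %u, node %u\n", p & 0xfff, p >> 12);
	return 0;
}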