Commit 8f966064 authored by Oleksandr Natalenko

Merge branch 'fsgsbase-5.6' into pf-5.6

parents 83da834f e5799b8b
Documentation/x86/entry_64.rst
@@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors
that absolutely need the more expensive check for the GS base - and we
generate all 'normal' entry points with the regular (faster) paranoid=0
variant.
On FSGSBASE systems, however, user space can set GS without kernel
interaction. It means the value of GS base itself does not imply anything,
whether a kernel value or a user space value. So, there is no longer a safe
way to check whether the exception is entering from user mode or kernel
mode in the paranoid entry code path. So the GS base value needs to be read
out, saved and the kernel GS base value written. On exit, the saved GS base
value needs to be restored unconditionally. The non-paranoid entry/exit
code still uses SWAPGS unconditionally as the state is known.
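A minimal C sketch of the save/switch/restore protocol described above, using the kernel's rdgsbase()/wrgsbase() wrappers. The helper names and free-standing form are illustrative only; the real logic is the paranoid_entry/paranoid_exit assembly further down in this diff:

```c
/*
 * Illustrative sketch only; the real implementation is the paranoid
 * entry/exit assembly in arch/x86/entry/entry_64.S below. Assumes it
 * runs with interrupts disabled. Note that the real entry code cannot
 * use ordinary per-CPU accessors at this point, because those
 * themselves go through the GS segment.
 */
static unsigned long paranoid_gsbase_enter(void)
{
	unsigned long saved = rdgsbase();	/* user or kernel value, unknown */

	wrgsbase(cpu_kernelmode_gs_base(smp_processor_id()));
	return saved;				/* handed to the exit path */
}

static void paranoid_gsbase_exit(unsigned long saved)
{
	wrgsbase(saved);			/* restore unconditionally */
}
```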
arch/x86/entry/calling.h
@@ -368,7 +368,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
/*
* Fetch the per-CPU GS base value for this processor and put it in @reg.
* Fetch the per-CPU GSBASE value for this processor and put it in @reg.
* We normally use %gs for accessing per-CPU data, but we are setting up
* %gs here and obviously can not use %gs itself to access per-CPU data.
*/
......
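For context on the comment just above: "accessing per-CPU data" through %gs is what the ordinary accessors compile to, which is exactly what is unavailable while GSBASE is still being set up. A small illustration (the variable name is made up):

```c
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_count);	/* illustrative only */

static void demo_bump(void)
{
	/*
	 * Compiles to a single %gs-relative instruction, roughly
	 * "incq %gs:demo_count" -- usable only once the kernel GSBASE
	 * has been established.
	 */
	this_cpu_inc(demo_count);
}
```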
arch/x86/entry/entry_64.S
@@ -922,7 +922,6 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
.endif
.if \paranoid
/* this procedure expect "no swapgs" flag in ebx */
jmp paranoid_exit
.else
jmp error_exit
@@ -1212,14 +1211,14 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
/*
* Save all registers in pt_regs. Return GS base related information
* Save all registers in pt_regs. Return GSBASE related information
* in EBX depending on the availability of the FSGSBASE instructions:
*
* FSGSBASE R/EBX
* N 0 -> SWAPGS on exit
* 1 -> no SWAPGS on exit
*
* Y GS base value at entry, must be restored in paranoid_exit
* Y GSBASE value at entry, must be restored in paranoid_exit
*/
SYM_CODE_START_LOCAL(paranoid_entry)
UNWIND_HINT_FUNC
@@ -1237,39 +1236,39 @@ SYM_CODE_START_LOCAL(paranoid_entry)
* hardware at entry) can not be used: this may be a return
* to kernel code, but with a user CR3 value.
*
* Switching CR3 does not depend on kernel GS base so it can
* be done before switching to the kernel GS base. This is
* required for FSGSBASE because the kernel GS base has to
* Switching CR3 does not depend on kernel GSBASE so it can
* be done before switching to the kernel GSBASE. This is
* required for FSGSBASE because the kernel GSBASE has to
* be retrieved from a kernel internal table.
*/
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
/*
* Handling GS base depends on the availability of FSGSBASE.
* Handling GSBASE depends on the availability of FSGSBASE.
*
* Without FSGSBASE the kernel enforces that negative GS base
* values indicate kernel GS base. With FSGSBASE no assumptions
* can be made about the GS base value when entering from user
* Without FSGSBASE the kernel enforces that negative GSBASE
* values indicate kernel GSBASE. With FSGSBASE no assumptions
* can be made about the GSBASE value when entering from user
* space.
*/
ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
/*
* Read the current GS base and store it in %rbx unconditionally,
* retrieve and set the current CPUs kernel GS base. The stored value
* Read the current GSBASE and store it in %rbx unconditionally,
* retrieve and set the current CPUs kernel GSBASE. The stored value
* has to be restored in paranoid_exit unconditionally.
*
* This unconditional write of GS base ensures no subsequent load
* based on a mispredicted GS base.
* The MSR write ensures that no subsequent load is based on a
* mispredicted GSBASE. No extra FENCE required.
*/
SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
ret
.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GS base active, no restore required */
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl $1, %ebx
/*
* The kernel-enforced convention is a negative GS base indicates
* The kernel-enforced convention is a negative GSBASE indicates
* a kernel value. No SWAPGS needed on entry and exit.
*/
movl $MSR_GS_BASE, %ecx
@@ -1280,7 +1279,14 @@ SYM_CODE_START_LOCAL(paranoid_entry)
.Lparanoid_entry_swapgs:
SWAPGS
/*
* The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an
* unconditional CR3 write, even in the PTI case. So do an lfence
* to prevent GS speculation, regardless of whether PTI is enabled.
*/
FENCE_SWAPGS_KERNEL_ENTRY
/* EBX = 0 -> SWAPGS required on exit */
xorl %ebx, %ebx
ret
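The FENCE_SWAPGS_KERNEL_ENTRY used above is an ALTERNATIVE patched in at boot, not a runtime branch; a C model of its effect, written as an if() purely for readability:

```c
/*
 * Conceptual model of FENCE_SWAPGS_KERNEL_ENTRY. The real macro is an
 * ALTERNATIVE that patches the LFENCE in at boot on affected CPUs.
 */
static __always_inline void fence_swapgs_kernel_entry_model(void)
{
	if (boot_cpu_has(X86_FEATURE_FENCE_SWAPGS_KERNEL))
		asm volatile("lfence" ::: "memory");	/* stop GS speculation */
}
```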
@@ -1296,45 +1302,45 @@ SYM_CODE_END(paranoid_entry)
* be complicated. Fortunately, there's no good reason to try
* to handle preemption here.
*
* R/EBX contains the GS base related information depending on the
* R/EBX contains the GSBASE related information depending on the
* availability of the FSGSBASE instructions:
*
* FSGSBASE R/EBX
* N 0 -> SWAPGS on exit
* 1 -> no SWAPGS on exit
*
* Y User space GS base, must be restored unconditionally
* Y User space GSBASE, must be restored unconditionally
*/
SYM_CODE_START_LOCAL(paranoid_exit)
UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
/*
* The order of operations is important. IRQ tracing requires
* kernel GS base and CR3. RESTORE_CR3 requires kernel GS base.
* kernel GSBASE and CR3. RESTORE_CR3 requires kernel GSBASE.
*
* NB to anyone to try to optimize this code: this code does
* not execute at all for exceptions from user mode. Those
* exceptions go through error_exit instead.
*/
TRACE_IRQS_OFF_DEBUG
TRACE_IRQS_IRETQ_DEBUG
RESTORE_CR3 scratch_reg=%rax save_reg=%r14
/* Handle the three GS base cases */
/* Handle the three GSBASE cases */
ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE
/* With FSGSBASE enabled, unconditionally resotre GS base */
/* With FSGSBASE enabled, unconditionally restore GSBASE */
wrgsbase %rbx
jmp restore_regs_and_return_to_kernel
.Lparanoid_exit_checkgs:
/* On non-FSGSBASE systems, conditionally do SWAPGS */
testl %ebx, %ebx
jnz restore_regs_and_return_to_kernel
/* We are returning to a context with user GS base */
/* We are returning to a context with user GSBASE */
SWAPGS_UNSAFE_STACK
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(paranoid_exit)
/*
@@ -1743,11 +1749,11 @@ end_repeat_nmi:
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
/*
* The above invocation of paranoid_entry stored the GS base
* The above invocation of paranoid_entry stored the GSBASE
* related information in R/EBX depending on the availability
* of FSGSBASE.
*
* If FSGSBASE is enabled, restore the saved GS base value
* If FSGSBASE is enabled, restore the saved GSBASE value
* unconditionally, otherwise take the conditional SWAPGS path.
*/
ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
......
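Restating the R/EBX contract between paranoid_entry and the exit paths above in C may help; this is an illustrative model, not kernel code:

```c
/* Model of the three exit cases; rbx is whatever paranoid_entry left there. */
static void paranoid_exit_model(bool fsgsbase, unsigned long rbx)
{
	if (fsgsbase) {
		wrgsbase(rbx);		/* RBX = GSBASE at entry, restored always */
		return;
	}
	if (rbx == 1)
		return;			/* kernel GSBASE was already active */
	native_swapgs();		/* RBX = 0: undo the entry SWAPGS */
}
```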
arch/x86/include/asm/processor.h
@@ -455,10 +455,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void);
#if IS_ENABLED(CONFIG_KVM)
/* Save actual FS/GS selectors and bases to current->thread */
void save_fsgs_for_kvm(void);
#endif
void current_save_fsgs(void);
#else /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
/*
......
arch/x86/kernel/process.c
@@ -141,11 +141,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
#ifdef CONFIG_X86_64
save_fsgs(current);
current_save_fsgs();
p->thread.fsindex = current->thread.fsindex;
p->thread.fsbase = current->thread.fsbase;
p->thread.gsindex = current->thread.gsindex;
p->thread.gsbase = current->thread.gsbase;
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
#else
......
arch/x86/include/asm/switch_to.h
@@ -37,75 +37,3 @@ static inline void switch_to_extra(struct task_struct *prev,
prev_tif & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev, next);
}
#ifdef CONFIG_X86_64
enum which_selector {
FS,
GS
};
/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
* is hot.
*/
static __always_inline void save_base_legacy(struct task_struct *prev_p,
unsigned short selector,
enum which_selector which)
{
if (likely(selector == 0)) {
/*
* On Intel (without X86_BUG_NULL_SEG), the segment base could
* be the pre-existing saved base or it could be zero. On AMD
* (with X86_BUG_NULL_SEG), the segment base could be almost
* anything.
*
* This branch is very hot (it's hit twice on almost every
* context switch between 64-bit programs), and avoiding
* the RDMSR helps a lot, so we just assume that whatever
* value is already saved is correct. This matches historical
* Linux behavior, so it won't break existing applications.
*
* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
* report that the base is zero, it needs to actually be zero:
* see the corresponding logic in load_seg_legacy.
*/
} else {
/*
* If the selector is 1, 2, or 3, then the base is zero on
* !X86_BUG_NULL_SEG CPUs and could be anything on
* X86_BUG_NULL_SEG CPUs. In the latter case, Linux
* has never attempted to preserve the base across context
* switches.
*
* If selector > 3, then it refers to a real segment, and
* saving the base isn't necessary.
*/
if (which == FS)
prev_p->thread.fsbase = 0;
else
prev_p->thread.gsbase = 0;
}
}
static __always_inline void save_fsgs(struct task_struct *task)
{
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
/*
* If FSGSBASE is enabled, we can't make any useful guesses
* about the base, and user code expects us to save the current
* value. Fortunately, reading the base directly is efficient.
*/
task->thread.fsbase = rdfsbase();
task->thread.gsbase = x86_gsbase_read_cpu_inactive();
} else {
save_base_legacy(task, task->thread.fsindex, FS);
save_base_legacy(task, task->thread.gsindex, GS);
}
}
#endif
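The reason save_fsgs() must read the live registers when FSGSBASE is enabled (see the moved copy in process_64.c below) is that user space can now change both bases with no kernel involvement. A user-space demonstration using the GCC/Clang intrinsics; it assumes a kernel that actually sets CR4.FSGSBASE, such as one carrying this series, and takes SIGILL otherwise:

```c
/* Build with: gcc -mfsgsbase demo.c */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	unsigned long long base = _readgsbase_u64();	/* RDGSBASE, no syscall */

	printf("gsbase: %#llx\n", base);
	_writegsbase_u64(base);				/* WRGSBASE, no syscall */
	return 0;
}
```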
arch/x86/kernel/process_64.c
@@ -146,18 +146,123 @@ void release_thread(struct task_struct *dead_task)
WARN_ON(dead_task->mm);
}
#if IS_ENABLED(CONFIG_KVM)
enum which_selector {
FS,
GS
};
/*
* Out of line to be protected from kprobes. It is not used on Xen
* paravirt. When paravirt support is needed, it needs to be renamed
* with native_ prefix.
*/
static noinline unsigned long __rdgsbase_inactive(void)
{
unsigned long gsbase;
lockdep_assert_irqs_disabled();
native_swapgs();
gsbase = rdgsbase();
native_swapgs();
return gsbase;
}
NOKPROBE_SYMBOL(__rdgsbase_inactive);
/*
* Out of line to be protected from kprobes. It is not used on Xen
* paravirt. When paravirt support is needed, it needs to be renamed
* with native_ prefix.
*/
static noinline void __wrgsbase_inactive(unsigned long gsbase)
{
lockdep_assert_irqs_disabled();
native_swapgs();
wrgsbase(gsbase);
native_swapgs();
}
NOKPROBE_SYMBOL(__wrgsbase_inactive);
/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
* is hot.
*/
static __always_inline void save_base_legacy(struct task_struct *prev_p,
unsigned short selector,
enum which_selector which)
{
if (likely(selector == 0)) {
/*
* On Intel (without X86_BUG_NULL_SEG), the segment base could
* be the pre-existing saved base or it could be zero. On AMD
* (with X86_BUG_NULL_SEG), the segment base could be almost
* anything.
*
* This branch is very hot (it's hit twice on almost every
* context switch between 64-bit programs), and avoiding
* the RDMSR helps a lot, so we just assume that whatever
* value is already saved is correct. This matches historical
* Linux behavior, so it won't break existing applications.
*
* To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
* report that the base is zero, it needs to actually be zero:
* see the corresponding logic in load_seg_legacy.
*/
} else {
/*
* If the selector is 1, 2, or 3, then the base is zero on
* !X86_BUG_NULL_SEG CPUs and could be anything on
* X86_BUG_NULL_SEG CPUs. In the latter case, Linux
* has never attempted to preserve the base across context
* switches.
*
* If selector > 3, then it refers to a real segment, and
* saving the base isn't necessary.
*/
if (which == FS)
prev_p->thread.fsbase = 0;
else
prev_p->thread.gsbase = 0;
}
}
static __always_inline void save_fsgs(struct task_struct *task)
{
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
/*
* If FSGSBASE is enabled, we can't make any useful guesses
* about the base, and user code expects us to save the current
* value. Fortunately, reading the base directly is efficient.
*/
task->thread.fsbase = rdfsbase();
task->thread.gsbase = __rdgsbase_inactive();
} else {
save_base_legacy(task, task->thread.fsindex, FS);
save_base_legacy(task, task->thread.gsindex, GS);
}
}
/*
* While a process is running, current->thread.fsbase and current->thread.gsbase
* may not match the corresponding CPU registers (see save_base_legacy()). KVM
* wants an efficient way to save and restore FSBASE and GSBASE.
* When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
* may not match the corresponding CPU registers (see save_base_legacy()).
*/
void save_fsgs_for_kvm(void)
void current_save_fsgs(void)
{
unsigned long flags;
/* Interrupts need to be off for FSGSBASE */
local_irq_save(flags);
save_fsgs(current);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif
static __always_inline void loadseg(enum which_selector which,
@@ -231,7 +336,7 @@ static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
/* Update the bases. */
wrfsbase(next->fsbase);
x86_gsbase_write_cpu_inactive(next->gsbase);
__wrgsbase_inactive(next->gsbase);
} else {
load_seg_legacy(prev->fsindex, prev->fsbase,
next->fsindex, next->fsbase, FS);
@@ -288,25 +393,11 @@ unsigned long x86_gsbase_read_cpu_inactive(void)
unsigned long gsbase;
if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
bool need_restore = false;
unsigned long flags;
/*
* We read the inactive GS base value by swapping
* to make it the active one. But we cannot allow
* an interrupt while we switch to and from.
*/
if (!irqs_disabled()) {
local_irq_save(flags);
need_restore = true;
}
native_swapgs();
gsbase = rdgsbase();
native_swapgs();
if (need_restore)
local_irq_restore(flags);
local_irq_save(flags);
gsbase = __rdgsbase_inactive();
local_irq_restore(flags);
} else {
rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
}
@@ -317,25 +408,11 @@ unsigned long x86_gsbase_read_cpu_inactive(void)
void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
bool need_restore = false;
unsigned long flags;
/*
* We write the inactive GS base value by swapping
* to make it the active one. But we cannot allow
* an interrupt while we switch to and from.
*/
if (!irqs_disabled()) {
local_irq_save(flags);
need_restore = true;
}
native_swapgs();
wrgsbase(gsbase);
native_swapgs();
if (need_restore)
local_irq_restore(flags);
local_irq_save(flags);
__wrgsbase_inactive(gsbase);
local_irq_restore(flags);
} else {
wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
}
......
arch/x86/kvm/vmx/vmx.c
@@ -1175,7 +1175,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
save_fsgs_for_kvm();
current_save_fsgs();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
......