From 2f74759056797054122cdc70844137f70bb3f626 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 7 Jun 2012 22:20:18 +0900 Subject: x86/alternatives: Use atomic_xchg() instead atomic_dec_and_test() for stop_machine_text_poke() stop_machine_text_poke() uses atomic_dec_and_test() to select one of the CPUs executing that function to actually modify the code. Since the variable is initialized to 1, subsequent CPUs will make the variable go negative. Since going negative is uncommon/unexpected in typical dec_and_test usage change this user to atomic_xchg(). This was found using a patch that warns on dec_and_test going negative. Signed-off-by: OGAWA Hirofumi Acked-by: Steven Rostedt [ Rewrote changelog ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/87zk8fgsx9.fsf@devron.myhome.or.jp Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f0759..53231a045d3d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -664,7 +664,7 @@ static int __kprobes stop_machine_text_poke(void *data) struct text_poke_param *p; int i; - if (atomic_dec_and_test(&stop_machine_first)) { + if (atomic_xchg(&stop_machine_first, 0)) { for (i = 0; i < tpp->nparams; i++) { p = &tpp->params[i]; text_poke(p->addr, p->opcode, p->len); -- cgit v1.2.3 From 954e482bde20b0e208fd4d34ef26e10afd194600 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 24 May 2012 18:19:45 -0700 Subject: x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature According to Intel 64 and IA-32 SDM and Optimization Reference Manual, beginning with Ivybridge, REG string operation using MOVSB and STOSB can provide both flexible and high-performance REG string operations in cases like memory copy. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/ STOSB). If CPU erms feature is detected, patch copy_user_generic with enhanced fast string version of copy_user_generic. A few new macros are defined to reduce duplicate code in ALTERNATIVE and ALTERNATIVE_2. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1337908785-14015-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 74 ++++++++++++++++++++++++++++++-------- arch/x86/include/asm/uaccess_64.h | 11 +++++- arch/x86/kernel/x8664_ksyms_64.c | 1 + 3 files changed, 70 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 49331bedc158..70780689599a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number) "663"#number +#define e_replacement(number) "664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_slen "\n" /* source len */ \ + " .byte " alt_rlen(number) "\n" /* replacement len */ + +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" + /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - \ - "661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .long 661b - .\n" /* label */ \ - " .long 663f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ - " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".previous" + +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + DISCARD_ENTRY(2) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ".previous" /* * This must be included *after* the definition of ALTERNATIVE due to @@ -139,6 +170,19 @@ static inline int alternatives_text_reserved(void *start, void *end) asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) +/* + * Like alternative_call, but there are two features and respective functions. + * If CPU has feature2, function2 is used. + * Otherwise, if CPU has feature1, function1 is used. + * Otherwise, old function is used. + */ +#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ + output, input...) \ + asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ + "call %P[new2]", feature2) \ + : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8e796fbbf9c6..d8def8b3dba0 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -17,6 +17,8 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long +copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); +__must_check unsigned long copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - alternative_call(copy_user_generic_unrolled, + /* + * If CPU has ERMS feature, use copy_user_enhanced_fast_string. + * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. + * Otherwise, use copy_user_generic_unrolled. + */ + alternative_call_2(copy_user_generic_unrolled, copy_user_generic_string, X86_FEATURE_REP_GOOD, + copy_user_enhanced_fast_string, + X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f3d074..6020f6f5927c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic_string); EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -- cgit v1.2.3 From 1ba9a294141b106b7247649a5c3372d8284eca80 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 6 Jul 2012 15:07:48 +0100 Subject: x86/mm/mtrr: Fix alignment determination in range_to_mtrr() With the variable operated on being of "unsigned long" type, neither ffs() nor fls() are suitable to use on them, as those truncate their arguments to 32 bits. Using __ffs() and __fls() respectively at once eliminates the need to subtract 1 from their results. Additionally, with the alignment value subsequently used as a shift count, it must be enforced to be less than BITS_PER_LONG (and on 64-bit there's no need for it to be any smaller). Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4FF70D54020000780008E179@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index bdda2e6c673b..35ffda5d0727 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, /* Compute the maximum size with which we can make a range: */ if (range_startk) - max_align = ffs(range_startk) - 1; + max_align = __ffs(range_startk); else - max_align = 32; + max_align = BITS_PER_LONG - 1; - align = fls(range_sizek) - 1; + align = __fls(range_sizek); if (align > max_align) align = max_align; -- cgit v1.2.3 From a7101d152665817bf7cafc47e7f5dcb1f54664fe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 6 Jul 2012 15:20:35 +0100 Subject: x86/mm/mtrr: Slightly simplify print_mtrr_state() high_width can be easily calculated in a single expression when making use of __ffs64(). Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Yinghai Lu Link: http://lkml.kernel.org/r/4FF71053020000780008E1B5@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae6c65f..e9fe907cd249 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -361,11 +361,7 @@ static void __init print_mtrr_state(void) } pr_debug("MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); - if (size_or_mask & 0xffffffffUL) - high_width = ffs(size_or_mask & 0xffffffffUL) - 1; - else - high_width = ffs(size_or_mask>>32) + 32 - 1; - high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) -- cgit v1.2.3