x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line

On CPUs with 64-byte last-level cache lines, prefetching only once per
line yields roughly 10% better performance, independent of CPU vendor
or specific model (as far as I was able to test).
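
For illustration only, here is a minimal plain-C sketch of the idea
(a hypothetical helper, not the code added by this patch; it uses
__builtin_prefetch where the real SSE routines use prefetchnta in
inline assembly, and assumes a 64-byte cache line):

	/*
	 * Illustrative sketch: XOR one source into a destination,
	 * issuing a single software prefetch per assumed 64-byte
	 * cache line instead of one per smaller block.
	 */
	static void xor_pf64_sketch(unsigned long bytes,
				    unsigned long *p1,
				    const unsigned long *p2)
	{
		unsigned long words_per_line = 64 / sizeof(*p1);
		unsigned long i, j;

		for (i = 0; i < bytes / sizeof(*p1); i += words_per_line) {
			/* One prefetch per line, a few lines ahead. */
			__builtin_prefetch(&p2[i + 4 * words_per_line], 0, 0);

			/* XOR the whole 64-byte line. */
			for (j = 0; j < words_per_line; j++)
				p1[i + j] ^= p2[i + j];
		}
	}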

Signed-off-by: Jan Beulich <[email protected]>
Acked-by: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87..ce05722 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
-#undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
-do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
-	AVX_XOR_SPEED;					\
-	if (cpu_has_xmm)				\
-		xor_speed(&xor_block_pIII_sse);		\
-	if (cpu_has_mmx) {				\
-		xor_speed(&xor_block_pII_mmx);		\
-		xor_speed(&xor_block_p5_mmx);		\
-	}						\
-} while (0)
-
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST)			\
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+#undef XOR_TRY_TEMPLATES
+#define XOR_TRY_TEMPLATES				\
+do {							\
+	AVX_XOR_SPEED;					\
+	if (cpu_has_xmm) {				\
+		xor_speed(&xor_block_pIII_sse);		\
+		xor_speed(&xor_block_sse_pf64);		\
+	} else if (cpu_has_mmx) {			\
+		xor_speed(&xor_block_pII_mmx);		\
+		xor_speed(&xor_block_p5_mmx);		\
+	} else {					\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
+	}						\
+} while (0)
 
 #endif /* _ASM_X86_XOR_32_H */