From 4b8fa1173cdc33a1cb53e4cac869b1b7aac917a9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 15 Jun 2024 12:21:44 -0700
Subject: x86-64: word-at-a-time: improve byte count calculations

This switches x86-64 over to using 'tzcount' instead of the integer
multiply trick to turn the bytemask information into actual byte
counts.

We even had a comment saying that a fast bit count instruction is
better than a multiply, but x86 bit counting has traditionally been
"questionably fast", and so avoiding it was the right thing back in
the day.

Now, on any half-way modern core, using bit counting is cheaper and
smaller than the large constant multiply, so let's just switch over.

Note that as part of switching over to counting bits, we also do it at
a different point.  We used to create the byte count from the final
byte mask, but once you use the 'tzcount' instruction (aka 'bsf' on
older CPUs), you can actually count the trailing zeroes using a value
we have available earlier.

In fact, we can just use the very first mask of bits that tells us
whether we have any zero bytes at all.  In that mask, every byte that
was zero in the word has its high bit set, so just doing 'tzcount' on
that value and dividing by 8 will give the number of bytes that
precede the first NUL character, which is exactly what we want.

Note also that the input value to the tzcount is by definition not
zero, since that is exactly the condition we already checked with the
whole "do we have any zero bytes at all" test.  So we don't need to
worry about the legacy instruction behavior of pre-tzcount days, when
'bsf' didn't have a defined result for zero input.

The 32-bit code continues to use the simple bit op trick that is
faster even on newer cores, but particularly on the older 32-bit-only
ones.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/word-at-a-time.h | 57 ++++++++++++++---------------------
 1 file changed, 23 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/word-at-a-time.h b/arch/x86/include/asm/word-at-a-time.h
index e8d7d4941c4c..422a47746657 100644
--- a/arch/x86/include/asm/word-at-a-time.h
+++ b/arch/x86/include/asm/word-at-a-time.h
@@ -5,45 +5,12 @@
 #include <linux/bitops.h>
 #include <linux/wordpart.h>
 
-/*
- * This is largely generic for little-endian machines, but the
- * optimal byte mask counting is probably going to be something
- * that is architecture-specific. If you have a reliably fast
- * bit count instruction, that might be better than the multiply
- * and shift, for example.
- */
 struct word_at_a_time {
 	const unsigned long one_bits, high_bits;
 };
 
 #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }
 
-#ifdef CONFIG_64BIT
-
-/*
- * Jan Achrenius on G+: microoptimized version of
- * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56"
- * that works for the bytemasks without having to
- * mask them first.
- */
-static inline long count_masked_bytes(unsigned long mask)
-{
-	return mask*0x0001020304050608ul >> 56;
-}
-
-#else /* 32-bit case */
-
-/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
-static inline long count_masked_bytes(long mask)
-{
-	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
-	long a = (0x0ff0001+mask) >> 23;
-	/* Fix the 1 for 00 case */
-	return a & mask;
-}
-
-#endif
-
 /* Return nonzero if it has a zero */
 static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c)
 {
@@ -57,6 +24,22 @@ static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits,
 	return bits;
 }
 
+#ifdef CONFIG_64BIT
+
+/* Keep the initial has_zero() value for both bitmask and size calc */
+#define create_zero_mask(bits) (bits)
+
+static inline unsigned long zero_bytemask(unsigned long bits)
+{
+	bits = (bits - 1) & ~bits;
+	return bits >> 7;
+}
+
+#define find_zero(bits) (__ffs(bits) >> 3)
+
+#else
+
+/* Create the final mask for both bytemask and size */
 static inline unsigned long create_zero_mask(unsigned long bits)
 {
 	bits = (bits - 1) & ~bits;
@@ -66,11 +49,17 @@ static inline unsigned long create_zero_mask(unsigned long bits)
 /* The mask we created is directly usable as a bytemask */
 #define zero_bytemask(mask) (mask)
 
+/* Carl Chatfield / Jan Achrenius G+ version for 32-bit */
 static inline unsigned long find_zero(unsigned long mask)
 {
-	return count_masked_bytes(mask);
+	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
+	long a = (0x0ff0001+mask) >> 23;
+	/* Fix the 1 for 00 case */
+	return a & mask;
 }
 
+#endif
+
 /*
  * Load an unaligned word from kernel space.
  *
-- 
cgit v1.2.3


From f915a3e5b0182dd7376f11337e231500a157e1f4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 18 Jun 2024 18:14:48 -0700
Subject: arm64: word-at-a-time: improve byte count calculations for LE

Do the same optimization as x86-64: do __ffs() on the intermediate
value that found whether there is a zero byte, before we've actually
computed the final byte mask.

The logic is:

 has_zero():
	Check if the word has a zero byte in it, which indicates the
	end of the loop, and prepare a value to be used for the rest of
	the sequence.

	The standard LE implementation just creates a word that has the
	high bit set in each byte of the word that was zero.

	Example: 0xaa00bbccdd00eeff -> 0x0080000000800000

 prep_zero_mask():
	Possibly do more prep to then clean up the initial fast result
	from has_zero, so that it can be combined with another zero
	mask with a simple logical "or" to create a final mask.

	This is only used on big-endian machines that use a different
	algorithm, and is a no-op here.

 create_zero_mask():
	This is "step 1" of creating the count and the mask, and is
	meant for any common operations between the two.

	In the old implementation, this actually created the zero mask
	that was then used for masking and for counting the number of
	bits in the mask.  In the new implementation, this is a no-op.

 find_zero():
	This takes the mask bits, and counts the number of bytes before
	the first zero byte.

	In the old implementation, it counted the number of bits in the
	final byte mask (which was the same as the C standard "find
	last set bit" that uses the silly "starts at one" counting) and
	shifted the value down by three.

	In the new implementation, we know the intermediate mask isn't
	zero, and it just does "find first set" with the sane semantics
	without any off-by-one issues, and again shifts by three (which
	also masks off the bit offset in the zero byte itself).
	Example: 0x0080000000800000 -> 2

 zero_bytemask():
	This takes the mask bits, and turns them into an actual byte
	mask of the bytes preceding the first zero byte.

	In the old implementation, this was a no-op, because the work
	had already been done by create_zero_mask().  In the new
	implementation, this does what create_zero_mask() used to do.

	Example: 0x0080000000800000 -> 0x000000000000ffff

The difference between the old and the new implementation is that
"find_zero()" ends up scheduling better because it is being done on a
value that is available earlier (before the final mask).

But more importantly, it can be implemented without the insane
semantics of the standard bit finding helpers that have the off-by-one
issue and have to special-case the zero mask situation.

On arm64, the new "find_zero()" ends up as just "rbit + clz", plus the
shift right that then gets subsumed by the "add to final length".

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/word-at-a-time.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/word-at-a-time.h b/arch/arm64/include/asm/word-at-a-time.h
index 14251abee23c..824ca6987a51 100644
--- a/arch/arm64/include/asm/word-at-a-time.h
+++ b/arch/arm64/include/asm/word-at-a-time.h
@@ -27,20 +27,15 @@ static inline unsigned long has_zero(unsigned long a, unsigned long *bits,
 }
 
 #define prep_zero_mask(a, bits, c) (bits)
+#define create_zero_mask(bits) (bits)
+#define find_zero(bits) (__ffs(bits) >> 3)
 
-static inline unsigned long create_zero_mask(unsigned long bits)
+static inline unsigned long zero_bytemask(unsigned long bits)
 {
 	bits = (bits - 1) & ~bits;
 	return bits >> 7;
 }
 
-static inline unsigned long find_zero(unsigned long mask)
-{
-	return fls64(mask) >> 3;
-}
-
-#define zero_bytemask(mask) (mask)
-
 #else /* __AARCH64EB__ */
 #include <asm-generic/word-at-a-time.h>
 #endif
-- 
cgit v1.2.3
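
As a sanity check on the sequence both patches converge on, here is a small
standalone userspace sketch.  It is not kernel code: it assumes a 64-bit
little-endian unsigned long (LP64), writes out the REPEAT_BYTE() constants by
hand, and uses the GCC/Clang __builtin_ctzl() where the kernel uses __ffs().
The values it verifies are the worked examples from the arm64 commit message
above.

/*
 * Userspace sketch of the little-endian word-at-a-time helpers as they
 * look after the two patches above.  Assumes LP64 and a little-endian
 * host; __builtin_ctzl() stands in for the kernel's __ffs().
 */
#include <assert.h>
#include <stdio.h>

#define ONE_BITS	0x0101010101010101ul	/* REPEAT_BYTE(0x01) */
#define HIGH_BITS	0x8080808080808080ul	/* REPEAT_BYTE(0x80) */

/* has_zero(): high bit set in every byte of 'a' that was zero */
static unsigned long has_zero(unsigned long a)
{
	return (a - ONE_BITS) & ~a & HIGH_BITS;
}

/* find_zero(): bytes before the first zero byte; 'bits' must be nonzero */
static unsigned long find_zero(unsigned long bits)
{
	return __builtin_ctzl(bits) >> 3;	/* kernel: __ffs(bits) >> 3 */
}

/* zero_bytemask(): 0xff in every byte that precedes the first zero byte */
static unsigned long zero_bytemask(unsigned long bits)
{
	bits = (bits - 1) & ~bits;
	return bits >> 7;
}

int main(void)
{
	unsigned long a = 0xaa00bbccdd00eeffUL;
	unsigned long bits = has_zero(a);

	assert(bits == 0x0080000000800000UL);	/* mask of zero bytes */
	assert(find_zero(bits) == 2);		/* bytes before first NUL */
	assert(zero_bytemask(bits) == 0x000000000000ffffUL);

	printf("mask %#lx, len %lu, bytemask %#lx\n",
	       bits, find_zero(bits), zero_bytemask(bits));
	return 0;
}

Note that both find_zero() and zero_bytemask() consume the has_zero() output
directly, which is the scheduling win the commit messages describe: neither
has to wait for a separately computed final mask.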
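
The 32-bit path that the x86 patch keeps (the Carl Chatfield / Jan Achrenius
arithmetic) can be checked the same way.  A minimal sketch, assuming 'mask' is
what the 32-bit create_zero_mask() produces, i.e. 0xff in each byte that
precedes the first zero byte, so only four values are possible; find_zero32()
is just a local copy of the kernel's 32-bit find_zero() body:

/* Standalone copy of the 32-bit find_zero() arithmetic, for checking */
#include <assert.h>
#include <stdint.h>

static long find_zero32(uint32_t mask)
{
	/* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */
	long a = (0x0ff0001 + mask) >> 23;
	/* Fix the 1 for 00 case */
	return a & mask;
}

int main(void)
{
	assert(find_zero32(0x00000000) == 0);	/* NUL in byte 0 */
	assert(find_zero32(0x000000ff) == 1);	/* NUL in byte 1 */
	assert(find_zero32(0x0000ffff) == 2);	/* NUL in byte 2 */
	assert(find_zero32(0x00ffffff) == 3);	/* NUL in byte 3 */
	return 0;
}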