summaryrefslogtreecommitdiff
path: root/lib/raid/raid6
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2026-05-18 07:17:46 +0200
committerAndrew Morton <akpm@linux-foundation.org>2026-05-28 21:24:53 -0700
commit3626738bc7147d52cb49f3994a9846aa2d34810a (patch)
tree683cf2b4f8028ae1d915dcd5a549047d0971e311 /lib/raid/raid6
parent3d6beb659ddf0664612bafacb0bd9030ba6ec7e6 (diff)
downloadlwn-3626738bc7147d52cb49f3994a9846aa2d34810a.tar.gz
lwn-3626738bc7147d52cb49f3994a9846aa2d34810a.zip
raid6: move to lib/raid/
Move the raid6 code to live in lib/raid/ with the XOR code, and change the internal organization so that each architecture has a subdirectory similar to the CRC, crypto and XOR libraries, and fix up the Makefile to only build files actually needed. Also move the kunit test case from the history test/ subdirectory to tests/ and use the normal naming scheme for it. Link: https://lore.kernel.org/20260518051804.462141-4-hch@lst.de Signed-off-by: Christoph Hellwig <hch@lst.de> Acked-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Ard Biesheuvel <ardb@kernel.org> # kunit only on arm64 Cc: Albert Ou <aou@eecs.berkeley.edu> Cc: Alexander Gordeev <agordeev@linux.ibm.com> Cc: Alexandre Ghiti <alex@ghiti.fr> Cc: Arnd Bergmann <arnd@arndb.de> Cc: "Borislav Petkov (AMD)" <bp@alien8.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Mason <clm@fb.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Sterba <dsterba@suse.com> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Huacai Chen <chenhuacai@kernel.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Li Nan <linan122@huawei.com> Cc: Madhavan Srinivasan <maddy@linux.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Palmer Dabbelt <palmer@dabbelt.com> Cc: Song Liu <song@kernel.org> Cc: Sven Schnelle <svens@linux.ibm.com> Cc: Vasily Gorbik <gor@linux.ibm.com> Cc: WANG Xuerui <kernel@xen0n.name> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'lib/raid/raid6')
-rw-r--r--lib/raid/raid6/.gitignore8
-rw-r--r--lib/raid/raid6/Makefile122
-rw-r--r--lib/raid/raid6/algos.c282
-rw-r--r--lib/raid/raid6/arm/neon.c58
-rw-r--r--lib/raid/raid6/arm/neon.h22
-rw-r--r--lib/raid/raid6/arm/neon.uc153
-rw-r--r--lib/raid/raid6/arm/recov_neon.c91
-rw-r--r--lib/raid/raid6/arm/recov_neon_inner.c111
-rw-r--r--lib/raid/raid6/int.uc147
-rw-r--r--lib/raid/raid6/loongarch/loongarch_simd.c423
-rw-r--r--lib/raid/raid6/loongarch/recov_loongarch_simd.c514
-rw-r--r--lib/raid/raid6/mktables.c163
-rw-r--r--lib/raid/raid6/powerpc/altivec.uc122
-rw-r--r--lib/raid/raid6/powerpc/vpermxor.uc95
-rw-r--r--lib/raid/raid6/recov.c101
-rw-r--r--lib/raid/raid6/riscv/recov_rvv.c222
-rw-r--r--lib/raid/raid6/riscv/rvv.c1228
-rw-r--r--lib/raid/raid6/riscv/rvv.h47
-rw-r--r--lib/raid/raid6/s390/recov_s390xc.c116
-rw-r--r--lib/raid/raid6/s390/s390vx.uc135
-rw-r--r--lib/raid/raid6/tests/Makefile3
-rw-r--r--lib/raid/raid6/tests/raid6_kunit.c160
-rw-r--r--lib/raid/raid6/unroll.awk20
-rw-r--r--lib/raid/raid6/x86/avx2.c470
-rw-r--r--lib/raid/raid6/x86/avx512.c560
-rw-r--r--lib/raid/raid6/x86/mmx.c135
-rw-r--r--lib/raid/raid6/x86/recov_avx2.c313
-rw-r--r--lib/raid/raid6/x86/recov_avx512.c377
-rw-r--r--lib/raid/raid6/x86/recov_ssse3.c328
-rw-r--r--lib/raid/raid6/x86/sse1.c155
-rw-r--r--lib/raid/raid6/x86/sse2.c480
31 files changed, 7161 insertions, 0 deletions
diff --git a/lib/raid/raid6/.gitignore b/lib/raid/raid6/.gitignore
new file mode 100644
index 000000000000..6be57745afd1
--- /dev/null
+++ b/lib/raid/raid6/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mktables
+altivec*.c
+int*.c
+tables.c
+neon?.c
+s390vx?.c
+vpermxor*.c
diff --git a/lib/raid/raid6/Makefile b/lib/raid/raid6/Makefile
new file mode 100644
index 000000000000..7cb31b8a5c17
--- /dev/null
+++ b/lib/raid/raid6/Makefile
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I $(src)
+
+obj-$(CONFIG_RAID6_PQ) += raid6_pq.o tests/
+
+raid6_pq-y += algos.o tables.o
+
+# generic integer generation and recovery implementation
+raid6_pq-y += int1.o int2.o int4.o int8.o
+raid6_pq-y += recov.o
+
+# architecture-specific generation and recovery implementations:
+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += arm/neon.o \
+ arm/neon1.o \
+ arm/neon2.o \
+ arm/neon4.o \
+ arm/neon8.o \
+ arm/recov_neon.o \
+ arm/recov_neon_inner.o
+raid6_pq-$(CONFIG_LOONGARCH) += loongarch/loongarch_simd.o \
+ loongarch/recov_loongarch_simd.o
+raid6_pq-$(CONFIG_ALTIVEC) += powerpc/altivec1.o \
+ powerpc/altivec2.o \
+ powerpc/altivec4.o \
+ powerpc/altivec8.o \
+ powerpc/vpermxor1.o \
+ powerpc/vpermxor2.o \
+ powerpc/vpermxor4.o \
+ powerpc/vpermxor8.o
+raid6_pq-$(CONFIG_RISCV_ISA_V) += riscv/rvv.o \
+ riscv/recov_rvv.o
+raid6_pq-$(CONFIG_S390) += s390/s390vx8.o \
+ s390/recov_s390xc.o
+ifeq ($(CONFIG_X86),y)
+raid6_pq-$(CONFIG_X86_32) += x86/mmx.o \
+ x86/sse1.o
+endif
+raid6_pq-$(CONFIG_X86) += x86/sse2.o \
+ x86/avx2.o \
+ x86/avx512.o \
+ x86/recov_ssse3.o \
+ x86/recov_avx2.o \
+ x86/recov_avx512.o
+
+hostprogs += mktables
+
+CFLAGS_arm/neon1.o += $(CC_FLAGS_FPU)
+CFLAGS_arm/neon2.o += $(CC_FLAGS_FPU)
+CFLAGS_arm/neon4.o += $(CC_FLAGS_FPU)
+CFLAGS_arm/neon8.o += $(CC_FLAGS_FPU)
+CFLAGS_arm/recov_neon_inner.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_arm/neon1.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_arm/neon2.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_arm/neon4.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_arm/neon8.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_arm/recov_neon_inner.o += $(CC_FLAGS_NO_FPU)
+
+ifeq ($(CONFIG_ALTIVEC),y)
+altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
+# Enable <altivec.h>
+altivec_flags += -isystem $(shell $(CC) -print-file-name=include)
+
+CFLAGS_powerpc/altivec1.o += $(altivec_flags)
+CFLAGS_powerpc/altivec2.o += $(altivec_flags)
+CFLAGS_powerpc/altivec4.o += $(altivec_flags)
+CFLAGS_powerpc/altivec8.o += $(altivec_flags)
+CFLAGS_powerpc/vpermxor1.o += $(altivec_flags)
+CFLAGS_powerpc/vpermxor2.o += $(altivec_flags)
+CFLAGS_powerpc/vpermxor4.o += $(altivec_flags)
+CFLAGS_powerpc/vpermxor8.o += $(altivec_flags)
+
+ifdef CONFIG_CC_IS_CLANG
+# clang ppc port does not yet support -maltivec when -msoft-float is
+# enabled. A future release of clang will resolve this
+# https://llvm.org/pr31177
+CFLAGS_REMOVE_powerpc/altivec1.o += -msoft-float
+CFLAGS_REMOVE_powerpc/altivec2.o += -msoft-float
+CFLAGS_REMOVE_powerpc/altivec4.o += -msoft-float
+CFLAGS_REMOVE_powerpc/altivec8.o += -msoft-float
+CFLAGS_REMOVE_powerpc/vpermxor1.o += -msoft-float
+CFLAGS_REMOVE_powerpc/vpermxor2.o += -msoft-float
+CFLAGS_REMOVE_powerpc/vpermxor4.o += -msoft-float
+CFLAGS_REMOVE_powerpc/vpermxor8.o += -msoft-float
+endif # CONFIG_CC_IS_CLANG
+endif # CONFIG_ALTIVEC
+
+quiet_cmd_mktable = TABLE $@
+ cmd_mktable = $(obj)/mktables > $@
+
+targets += tables.c
+$(obj)/tables.c: $(obj)/mktables FORCE
+ $(call if_changed,mktable)
+
+quiet_cmd_unroll = UNROLL $@
+ cmd_unroll = $(AWK) -v N=$* -f $(src)/unroll.awk < $< > $@
+
+targets += int1.c int2.c int4.c int8.c
+$(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE
+ $(call if_changed,unroll)
+
+targets += arm/neon1.c arm/neon2.c arm/neon4.c arm/neon8.c
+$(obj)/arm/neon%.c: $(src)/arm/neon.uc $(src)/unroll.awk FORCE
+ $(call if_changed,unroll)
+
+targets += powerpc/altivec1.c \
+ powerpc/altivec2.c \
+ powerpc/altivec4.c \
+ powerpc/altivec8.c
+$(obj)/powerpc/altivec%.c: $(src)/powerpc/altivec.uc $(src)/unroll.awk FORCE
+ $(call if_changed,unroll)
+
+targets += powerpc/vpermxor1.c \
+ powerpc/vpermxor2.c \
+ powerpc/vpermxor4.c \
+ powerpc/vpermxor8.c
+$(obj)/powerpc/vpermxor%.c: $(src)/powerpc/vpermxor.uc $(src)/unroll.awk FORCE
+ $(call if_changed,unroll)
+
+targets += s390/s390vx8.c
+$(obj)/s390/s390vx%.c: $(src)/s390/s390vx.uc $(src)/unroll.awk FORCE
+ $(call if_changed,unroll)
diff --git a/lib/raid/raid6/algos.c b/lib/raid/raid6/algos.c
new file mode 100644
index 000000000000..985c60bb00a4
--- /dev/null
+++ b/lib/raid/raid6/algos.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6/algos.c
+ *
+ * Algorithm list and algorithm selection for RAID-6
+ */
+
+#include <linux/raid/pq.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <kunit/visibility.h>
+
+struct raid6_calls raid6_call;
+EXPORT_SYMBOL_GPL(raid6_call);
+
+/*
+ * NULL-terminated table of the syndrome generation implementations built for
+ * this configuration. raid6_choose_gen() walks it front to back, so the order
+ * here is the probe order; the generic integer versions always come last.
+ */
+const struct raid6_calls * const raid6_algos[] = {
+#if defined(__i386__) && !defined(__arch_um__)
+ &raid6_avx512x2,
+ &raid6_avx512x1,
+ &raid6_avx2x2,
+ &raid6_avx2x1,
+ &raid6_sse2x2,
+ &raid6_sse2x1,
+ &raid6_sse1x2,
+ &raid6_sse1x1,
+ &raid6_mmxx2,
+ &raid6_mmxx1,
+#endif
+#if defined(__x86_64__) && !defined(__arch_um__)
+ &raid6_avx512x4,
+ &raid6_avx512x2,
+ &raid6_avx512x1,
+ &raid6_avx2x4,
+ &raid6_avx2x2,
+ &raid6_avx2x1,
+ &raid6_sse2x4,
+ &raid6_sse2x2,
+ &raid6_sse2x1,
+#endif
+#ifdef CONFIG_ALTIVEC
+ &raid6_vpermxor8,
+ &raid6_vpermxor4,
+ &raid6_vpermxor2,
+ &raid6_vpermxor1,
+ &raid6_altivec8,
+ &raid6_altivec4,
+ &raid6_altivec2,
+ &raid6_altivec1,
+#endif
+#if defined(CONFIG_S390)
+ &raid6_s390vx8,
+#endif
+#ifdef CONFIG_KERNEL_MODE_NEON
+ &raid6_neonx8,
+ &raid6_neonx4,
+ &raid6_neonx2,
+ &raid6_neonx1,
+#endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+ &raid6_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+ &raid6_lsx,
+#endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+ &raid6_rvvx1,
+ &raid6_rvvx2,
+ &raid6_rvvx4,
+ &raid6_rvvx8,
+#endif
+ &raid6_intx8,
+ &raid6_intx4,
+ &raid6_intx2,
+ &raid6_intx1,
+ NULL
+};
+EXPORT_SYMBOL_IF_KUNIT(raid6_algos);
+
+void (*raid6_2data_recov)(int, size_t, int, int, void **);
+EXPORT_SYMBOL_GPL(raid6_2data_recov);
+
+void (*raid6_datap_recov)(int, size_t, int, void **);
+EXPORT_SYMBOL_GPL(raid6_datap_recov);
+
+/*
+ * NULL-terminated table of the recovery implementations built for this
+ * configuration. raid6_choose_recov() scans all of it and keeps the usable
+ * entry with the highest priority; the generic version is always listed last.
+ */
+const struct raid6_recov_calls *const raid6_recov_algos[] = {
+#ifdef CONFIG_X86
+ &raid6_recov_avx512,
+ &raid6_recov_avx2,
+ &raid6_recov_ssse3,
+#endif
+#ifdef CONFIG_S390
+ &raid6_recov_s390xc,
+#endif
+#if defined(CONFIG_KERNEL_MODE_NEON)
+ &raid6_recov_neon,
+#endif
+#ifdef CONFIG_LOONGARCH
+#ifdef CONFIG_CPU_HAS_LASX
+ &raid6_recov_lasx,
+#endif
+#ifdef CONFIG_CPU_HAS_LSX
+ &raid6_recov_lsx,
+#endif
+#endif
+#ifdef CONFIG_RISCV_ISA_V
+ &raid6_recov_rvv,
+#endif
+ &raid6_recov_intx1,
+ NULL
+};
+EXPORT_SYMBOL_IF_KUNIT(raid6_recov_algos);
+
+#define RAID6_TIME_JIFFIES_LG2 4
+#define RAID6_TEST_DISKS 8
+#define RAID6_TEST_DISKS_ORDER 3
+
+/*
+ * Pick the recovery implementation and install its handlers.
+ *
+ * Scans raid6_recov_algos and keeps the usable entry (no ->valid hook, or the
+ * hook returns non-zero) with the strictly highest priority, so the earliest
+ * listed entry wins ties. On success raid6_2data_recov and raid6_datap_recov
+ * are pointed at the winner's handlers.
+ *
+ * Returns the chosen entry, or NULL if no entry was usable.
+ */
+static inline const struct raid6_recov_calls *raid6_choose_recov(void)
+{
+ const struct raid6_recov_calls *const *algo;
+ const struct raid6_recov_calls *best;
+
+ for (best = NULL, algo = raid6_recov_algos; *algo; algo++)
+ if (!best || (*algo)->priority > best->priority)
+ if (!(*algo)->valid || (*algo)->valid())
+ best = *algo;
+
+ if (best) {
+ raid6_2data_recov = best->data2;
+ raid6_datap_recov = best->datap;
+
+ pr_info("raid6: using %s recovery algorithm\n", best->name);
+ } else
+ pr_err("raid6: Yikes! No recovery algorithm found!\n");
+
+ return best;
+}
+
+/*
+ * Pick the syndrome generation implementation and copy it into raid6_call.
+ *
+ * @dptrs: pointer to the array of RAID6_TEST_DISKS page-sized test buffers
+ * @disks: number of buffers to use (data pages followed by P and Q)
+ *
+ * Without CONFIG_RAID6_PQ_BENCHMARK the first usable entry of raid6_algos is
+ * taken as is. Otherwise every usable entry whose priority is not lower than
+ * the current best is timed, and the one completing the most gen_syndrome()
+ * calls in the measurement window wins; its xor_syndrome(), if present, is
+ * then timed for reporting only.
+ *
+ * Returns the chosen entry, or NULL if no entry was usable.
+ */
+static inline const struct raid6_calls *raid6_choose_gen(
+ void *(*const dptrs)[RAID6_TEST_DISKS], const int disks)
+{
+ unsigned long perf, bestgenperf, j0, j1;
+ int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */
+ const struct raid6_calls *const *algo;
+ const struct raid6_calls *best;
+
+ for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) {
+ if (!best || (*algo)->priority >= best->priority) {
+ if ((*algo)->valid && !(*algo)->valid())
+ continue;
+
+ if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) {
+ best = *algo;
+ break;
+ }
+
+ perf = 0;
+
+ /*
+ * Spin until jiffies ticks over so the window starts on a
+ * tick boundary, then count calls for
+ * 1 << RAID6_TIME_JIFFIES_LG2 jiffies.
+ */
+ preempt_disable();
+ j0 = jiffies;
+ while ((j1 = jiffies) == j0)
+ cpu_relax();
+ while (time_before(jiffies,
+ j1 + (1<<RAID6_TIME_JIFFIES_LG2))) {
+ (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs);
+ perf++;
+ }
+ preempt_enable();
+
+ if (perf > bestgenperf) {
+ bestgenperf = perf;
+ best = *algo;
+ }
+ /*
+ * Each call covers (disks - 2) data pages; the shift folds
+ * together the page size, the window length and the
+ * bytes-to-MB conversion.
+ */
+ pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name,
+ (perf * HZ * (disks-2)) >>
+ (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+ }
+ }
+
+ if (!best) {
+ pr_err("raid6: Yikes! No algorithm found!\n");
+ goto out;
+ }
+
+ /* Structure copy: raid6_call holds its own copy of the winner. */
+ raid6_call = *best;
+
+ if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) {
+ pr_info("raid6: skipped pq benchmark and selected %s\n",
+ best->name);
+ goto out;
+ }
+
+ pr_info("raid6: using algorithm %s gen() %ld MB/s\n",
+ best->name,
+ (bestgenperf * HZ * (disks - 2)) >>
+ (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2));
+
+ if (best->xor_syndrome) {
+ perf = 0;
+
+ preempt_disable();
+ j0 = jiffies;
+ while ((j1 = jiffies) == j0)
+ cpu_relax();
+ while (time_before(jiffies,
+ j1 + (1 << RAID6_TIME_JIFFIES_LG2))) {
+ best->xor_syndrome(disks, start, stop,
+ PAGE_SIZE, *dptrs);
+ perf++;
+ }
+ preempt_enable();
+
+ /*
+ * NOTE(review): the extra "+ 1" in the shift presumably accounts
+ * for only the start..stop half of the data disks being touched.
+ */
+ pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n",
+ (perf * HZ * (disks - 2)) >>
+ (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1));
+ }
+
+out:
+ return best;
+}
+
+
+/* Try to pick the best algorithm */
+/* This code uses the gfmul table as convenient data set to abuse */
+
+/*
+ * Initcall that selects the generation and recovery implementations.
+ *
+ * Allocates 2^RAID6_TEST_DISKS_ORDER contiguous pages and slices them into
+ * RAID6_TEST_DISKS page-sized buffers. The first (disks - 2) buffers are
+ * filled with repeated copies of raid6_gfmul as test data; the last two are
+ * left as they are.
+ *
+ * Returns 0 when both a generation and a recovery implementation were found,
+ * -ENOMEM if the test buffer cannot be allocated, -EINVAL otherwise.
+ */
+static int __init raid6_select_algo(void)
+{
+ const int disks = RAID6_TEST_DISKS;
+
+ const struct raid6_calls *gen_best;
+ const struct raid6_recov_calls *rec_best;
+ char *disk_ptr, *p;
+ void *dptrs[RAID6_TEST_DISKS];
+ int i, cycle;
+
+ /* prepare the buffer and fill it circularly with gfmul table */
+ disk_ptr = (char *)__get_free_pages(GFP_KERNEL, RAID6_TEST_DISKS_ORDER);
+ if (!disk_ptr) {
+ pr_err("raid6: Yikes! No memory available.\n");
+ return -ENOMEM;
+ }
+
+ p = disk_ptr;
+ for (i = 0; i < disks; i++)
+ dptrs[i] = p + PAGE_SIZE * i;
+
+ /* whole 64 KiB copies of the table first ... */
+ cycle = ((disks - 2) * PAGE_SIZE) / 65536;
+ for (i = 0; i < cycle; i++) {
+ memcpy(p, raid6_gfmul, 65536);
+ p += 65536;
+ }
+
+ /* ... then a partial copy for whatever is left of the data area */
+ if ((disks - 2) * PAGE_SIZE % 65536)
+ memcpy(p, raid6_gfmul, (disks - 2) * PAGE_SIZE % 65536);
+
+ /* select raid gen_syndrome function */
+ gen_best = raid6_choose_gen(&dptrs, disks);
+
+ /* select raid recover functions */
+ rec_best = raid6_choose_recov();
+
+ free_pages((unsigned long)disk_ptr, RAID6_TEST_DISKS_ORDER);
+
+ return gen_best && rec_best ? 0 : -EINVAL;
+}
+
+/*
+ * Module exit handler. Deliberately does nothing: the test buffer is already
+ * freed in raid6_select_algo(), so nothing is left to release here.
+ */
+static void raid6_exit(void)
+{
+ do { } while (0);
+}
+
+subsys_initcall(raid6_select_algo);
+module_exit(raid6_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID6 Q-syndrome calculations");
diff --git a/lib/raid/raid6/arm/neon.c b/lib/raid/raid6/arm/neon.c
new file mode 100644
index 000000000000..47b8bb0afc65
--- /dev/null
+++ b/lib/raid/raid6/arm/neon.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * lib/raid/raid6/arm/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/simd.h>
+
+/*
+ * There are 2 reasons these wrappers are kept in a separate compilation unit
+ * from the actual implementations in neonN.c (generated from neon.uc by
+ * unroll.awk):
+ * - the actual implementations use NEON intrinsics, and the GCC support header
+ * (arm_neon.h) is not fully compatible (type wise) with the kernel;
+ * - the neonN.c files are compiled with -mfpu=neon and optimization enabled,
+ * and we have to make sure that we never use *any* NEON/VFP instructions
+ * outside a kernel_neon_begin()/kernel_neon_end() pair.
+ */
+
+/*
+ * RAID6_NEON_WRAPPER(n) - emit the kernel-facing entry points for the n-way
+ * unrolled NEON implementation.
+ *
+ * Generates static gen_syndrome/xor_syndrome wrappers that call the matching
+ * *_real() routine inside a scoped_ksimd() section, plus the raid6_neonx<n>
+ * descriptor. The descriptor uses positional initializers: the two wrappers,
+ * raid6_have_neon as the validity check, the name string and a trailing 0
+ * (presumably the priority - confirm against struct raid6_calls).
+ */
+#define RAID6_NEON_WRAPPER(_n) \
+ static void raid6_neon ## _n ## _gen_syndrome(int disks, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_neon ## _n ## _gen_syndrome_real(int, \
+ unsigned long, void**); \
+ scoped_ksimd() \
+ raid6_neon ## _n ## _gen_syndrome_real(disks, \
+ (unsigned long)bytes, ptrs); \
+ } \
+ static void raid6_neon ## _n ## _xor_syndrome(int disks, \
+ int start, int stop, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_neon ## _n ## _xor_syndrome_real(int, \
+ int, int, unsigned long, void**); \
+ scoped_ksimd() \
+ raid6_neon ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs);\
+ } \
+ struct raid6_calls const raid6_neonx ## _n = { \
+ raid6_neon ## _n ## _gen_syndrome, \
+ raid6_neon ## _n ## _xor_syndrome, \
+ raid6_have_neon, \
+ "neonx" #_n, \
+ 0 \
+ }
+
+/*
+ * Validity hook for the raid6_neonx* descriptors: returns whatever
+ * cpu_has_neon() reports.
+ */
+static int raid6_have_neon(void)
+{
+ return cpu_has_neon();
+}
+
+RAID6_NEON_WRAPPER(1);
+RAID6_NEON_WRAPPER(2);
+RAID6_NEON_WRAPPER(4);
+RAID6_NEON_WRAPPER(8);
diff --git a/lib/raid/raid6/arm/neon.h b/lib/raid/raid6/arm/neon.h
new file mode 100644
index 000000000000..2ca41ee9b499
--- /dev/null
+++ b/lib/raid/raid6/arm/neon.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon1_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon2_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon4_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs);
+void raid6_neon8_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs);
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+ uint8_t *dq, const uint8_t *pbmul,
+ const uint8_t *qmul);
+
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+ const uint8_t *qmul);
+
+
diff --git a/lib/raid/raid6/arm/neon.uc b/lib/raid/raid6/arm/neon.uc
new file mode 100644
index 000000000000..14a9fc2c60fa
--- /dev/null
+++ b/lib/raid/raid6/arm/neon.uc
@@ -0,0 +1,153 @@
+/* -----------------------------------------------------------------------
+ *
+ * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
+ *
+ * Copyright (C) 2012 Rob Herring
+ * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * Based on altivec.uc:
+ * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * neon$#.c
+ *
+ * $#-way unrolled NEON intrinsics math RAID-6 instruction set
+ *
+ * This file is postprocessed using unroll.awk
+ */
+
+#include <arm_neon.h>
+#include "arm/neon.h"
+
+typedef uint8x16_t unative_t;
+
+#define NSIZE sizeof(unative_t)
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline unative_t SHLBYTE(unative_t v)
+{
+ /* shift applied to each 8-bit lane of the vector */
+ return vshlq_n_u8(v, 1);
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline unative_t MASK(unative_t v)
+{
+ /* signed shift right by 7 smears each lane's top bit over the lane */
+ return (unative_t)vshrq_n_s8((int8x16_t)v, 7);
+}
+
+/*
+ * The PMUL() operation multiplies v by u lane by lane using the 8-bit
+ * polynomial multiply intrinsic; the casts only reinterpret the vectors as
+ * polynomial lanes and back.
+ */
+static inline unative_t PMUL(unative_t v, unative_t u)
+{
+ return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u);
+}
+
+/*
+ * Compute P (XOR parity) and Q (RS syndrome) over all data pages.
+ *
+ * ptrs holds disks page pointers: the data pages first, then the P page at
+ * index disks-2 and the Q page at index disks-1. bytes is the length of each
+ * page that is processed. Uses NEON registers, so the caller is responsible
+ * for making SIMD use legal (see the wrappers in neon.c).
+ */
+void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{
+ uint8_t **dptr = (uint8_t **)ptrs;
+ uint8_t *p, *q;
+ int d, z, z0;
+
+ register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+ const unative_t x1d = vdupq_n_u8(0x1d);
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+ wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
+ /* walk down from the highest data disk: Q is doubled, then the next disk is added */
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
+ wp$$ = veorq_u8(wp$$, wd$$);
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+
+ w2$$ = vandq_u8(w2$$, x1d);
+ w1$$ = veorq_u8(w1$$, w2$$);
+ wq$$ = veorq_u8(w1$$, wd$$);
+ }
+ vst1q_u8(&p[d+NSIZE*$$], wp$$);
+ vst1q_u8(&q[d+NSIZE*$$], wq$$);
+ }
+}
+
+/*
+ * Fold the contribution of data pages start..stop into existing P and Q.
+ *
+ * P and Q (ptrs[disks-2] and ptrs[disks-1]) are read, XORed with the values
+ * computed from the pages in the range, and written back. Disks above stop
+ * are never touched; disks below start only advance Q, which is done without
+ * loading their data. Uses NEON registers, so the caller is responsible for
+ * making SIMD use legal (see the wrappers in neon.c).
+ */
+void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{
+ uint8_t **dptr = (uint8_t **)ptrs;
+ uint8_t *p, *q;
+ int d, z, z0;
+
+ register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+ const unative_t x1d = vdupq_n_u8(0x1d);
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+ wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
+ wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
+
+ /* P/Q data pages */
+ for ( z = z0-1 ; z >= start ; z-- ) {
+ wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
+ wp$$ = veorq_u8(wp$$, wd$$);
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+
+ w2$$ = vandq_u8(w2$$, x1d);
+ w1$$ = veorq_u8(w1$$, w2$$);
+ wq$$ = veorq_u8(w1$$, wd$$);
+ }
+ /* P/Q left side optimization */
+ /* four disks per pass: shift by 4 and fold the shifted-out bits back via PMUL */
+ for ( z = start-1 ; z >= 3 ; z -= 4 ) {
+ w2$$ = vshrq_n_u8(wq$$, 4);
+ w1$$ = vshlq_n_u8(wq$$, 4);
+
+ w2$$ = PMUL(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ }
+
+ /* z is now the index of the highest remaining disk: z+1 steps are left (none if z < 0) */
+ switch (z) {
+ case 2:
+ w2$$ = vshrq_n_u8(wq$$, 5);
+ w1$$ = vshlq_n_u8(wq$$, 3);
+
+ w2$$ = PMUL(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ break;
+ case 1:
+ w2$$ = vshrq_n_u8(wq$$, 6);
+ w1$$ = vshlq_n_u8(wq$$, 2);
+
+ w2$$ = PMUL(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ break;
+ case 0:
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+
+ w2$$ = vandq_u8(w2$$, x1d);
+ wq$$ = veorq_u8(w1$$, w2$$);
+ }
+ w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
+ wq$$ = veorq_u8(wq$$, w1$$);
+
+ vst1q_u8(&p[d+NSIZE*$$], wp$$);
+ vst1q_u8(&q[d+NSIZE*$$], wq$$);
+ }
+}
diff --git a/lib/raid/raid6/arm/recov_neon.c b/lib/raid/raid6/arm/recov_neon.c
new file mode 100644
index 000000000000..5a48fcc762e8
--- /dev/null
+++ b/lib/raid/raid6/arm/recov_neon.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/simd.h>
+#include "arm/neon.h"
+
+/* Validity hook for raid6_recov_neon: returns whatever cpu_has_neon() reports. */
+static int raid6_has_neon(void)
+{
+ return cpu_has_neon();
+}
+
+/*
+ * Rebuild two failed data pages, faila and failb, in place.
+ *
+ * ptrs holds disks page pointers with P at index disks-2 and Q at index
+ * disks-1; bytes is the length of each page. The pointer table is modified
+ * temporarily while the syndrome is regenerated and restored before the
+ * NEON inner loop runs. The table lookup uses failb - faila as an index, so
+ * callers are expected to pass faila < failb (TODO confirm against callers).
+ */
+static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ /* only the inner loop uses NEON, so only it runs inside the SIMD section */
+ scoped_ksimd()
+ __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
+}
+
+/*
+ * Rebuild one failed data page, faila, together with the P page, in place.
+ *
+ * ptrs holds disks page pointers with P at index disks-2 and Q at index
+ * disks-1; bytes is the length of each page. The pointer table is modified
+ * temporarily while the syndrome is regenerated and restored before the
+ * NEON inner loop runs.
+ */
+static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ /* only the inner loop uses NEON, so only it runs inside the SIMD section */
+ scoped_ksimd()
+ __raid6_datap_recov_neon(bytes, p, q, dq, qmul);
+}
+
+/*
+ * NEON recovery descriptor, referenced from raid6_recov_algos when
+ * CONFIG_KERNEL_MODE_NEON is set. It is only usable when raid6_has_neon()
+ * returns non-zero.
+ */
+const struct raid6_recov_calls raid6_recov_neon = {
+ .data2 = raid6_2data_recov_neon,
+ .datap = raid6_datap_recov_neon,
+ .valid = raid6_has_neon,
+ .name = "neon",
+ .priority = 10,
+};
diff --git a/lib/raid/raid6/arm/recov_neon_inner.c b/lib/raid/raid6/arm/recov_neon_inner.c
new file mode 100644
index 000000000000..53c355efa7ff
--- /dev/null
+++ b/lib/raid/raid6/arm/recov_neon_inner.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <arm_neon.h>
+#include "arm/neon.h"
+
+#ifdef CONFIG_ARM
+/*
+ * AArch32 does not provide this intrinsic natively because it does not
+ * implement the underlying instruction. AArch32 only provides a 64-bit
+ * wide vtbl.8 instruction, so use that instead.
+ */
+/*
+ * Stand-in for the 16-byte table lookup: a is the table, b the byte indices.
+ * The union reinterprets the table as a pair of 64-bit halves for vtbl2_u8,
+ * which is applied to the low and high halves of b before recombining.
+ */
+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
+{
+ union {
+ uint8x16_t val;
+ uint8x8x2_t pair;
+ } __a = { a };
+
+ return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)),
+ vtbl2_u8(__a.pair, vget_high_u8(b)));
+}
+#endif
+
+/*
+ * NEON inner loop of the two-data-page recovery.
+ *
+ * p/q are the stored P and Q pages; dp/dq hold the regenerated values on
+ * entry and receive the two recovered data pages. pbmul and qmul each point
+ * at 32 bytes of lookup data: 16 entries indexed by the low nibble followed
+ * by 16 entries indexed by the high nibble. Works on 16 bytes per pass, so
+ * bytes must be a multiple of 16 for the loop to terminate.
+ */
+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
+ uint8_t *dq, const uint8_t *pbmul,
+ const uint8_t *qmul)
+{
+ uint8x16_t pm0 = vld1q_u8(pbmul);
+ uint8x16_t pm1 = vld1q_u8(pbmul + 16);
+ uint8x16_t qm0 = vld1q_u8(qmul);
+ uint8x16_t qm1 = vld1q_u8(qmul + 16);
+ uint8x16_t x0f = vdupq_n_u8(0x0f);
+
+ /*
+ * while ( bytes-- ) {
+ * uint8_t px, qx, db;
+ *
+ * px = *p ^ *dp;
+ * qx = qmul[*q ^ *dq];
+ * *dq++ = db = pbmul[px] ^ qx;
+ * *dp++ = db ^ px;
+ * p++; q++;
+ * }
+ */
+
+ while (bytes) {
+ uint8x16_t vx, vy, px, qx, db;
+
+ px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
+ vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+ /* qx = qmul[...]: look up low and high nibbles separately, then combine */
+ vy = vshrq_n_u8(vx, 4);
+ vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+ vy = vqtbl1q_u8(qm1, vy);
+ qx = veorq_u8(vx, vy);
+
+ /* same split lookup for pbmul[px] */
+ vy = vshrq_n_u8(px, 4);
+ vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
+ vy = vqtbl1q_u8(pm1, vy);
+ vx = veorq_u8(vx, vy);
+ db = veorq_u8(vx, qx);
+
+ vst1q_u8(dq, db);
+ vst1q_u8(dp, veorq_u8(db, px));
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dp += 16;
+ dq += 16;
+ }
+}
+
+/*
+ * NEON inner loop of the data+P recovery.
+ *
+ * q is the stored Q page; dq holds the regenerated Q on entry and receives
+ * the recovered data page; p is updated in place by XORing the recovered data
+ * into it. qmul points at 32 bytes of lookup data: 16 entries indexed by the
+ * low nibble followed by 16 entries indexed by the high nibble. Works on 16
+ * bytes per pass, so bytes must be a multiple of 16 for the loop to
+ * terminate.
+ */
+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
+ const uint8_t *qmul)
+{
+ uint8x16_t qm0 = vld1q_u8(qmul);
+ uint8x16_t qm1 = vld1q_u8(qmul + 16);
+ uint8x16_t x0f = vdupq_n_u8(0x0f);
+
+ /*
+ * while (bytes--) {
+ * *p++ ^= *dq = qmul[*q ^ *dq];
+ * q++; dq++;
+ * }
+ */
+
+ while (bytes) {
+ uint8x16_t vx, vy;
+
+ vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
+
+ /* look up low and high nibbles separately, then combine */
+ vy = vshrq_n_u8(vx, 4);
+ vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
+ vy = vqtbl1q_u8(qm1, vy);
+ vx = veorq_u8(vx, vy);
+ vy = veorq_u8(vx, vld1q_u8(p));
+
+ vst1q_u8(dq, vx);
+ vst1q_u8(p, vy);
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dq += 16;
+ }
+}
diff --git a/lib/raid/raid6/int.uc b/lib/raid/raid6/int.uc
new file mode 100644
index 000000000000..1ba56c3fa482
--- /dev/null
+++ b/lib/raid/raid6/int.uc
@@ -0,0 +1,147 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * int$#.c
+ *
+ * $#-way unrolled portable integer math RAID-6 instruction set
+ *
+ * This file is postprocessed using unroll.awk
+ */
+
+#include <linux/raid/pq.h>
+
+/*
+ * This is the C data type to use
+ */
+
+/* Change this from BITS_PER_LONG if there is something better... */
+#if BITS_PER_LONG == 64
+# define NBYTES(x) ((x) * 0x0101010101010101UL)
+# define NSIZE 8
+# define NSHIFT 3
+# define NSTRING "64"
+typedef u64 unative_t;
+#else
+# define NBYTES(x) ((x) * 0x01010101U)
+# define NSIZE 4
+# define NSHIFT 2
+# define NSTRING "32"
+typedef u32 unative_t;
+#endif
+
+
+
+/*
+ * These sub-operations are separate inlines since they can sometimes be
+ * specially optimized using architecture-specific hacks.
+ */
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+{
+ unative_t vv;
+
+ /* masking with 0xfe per byte drops the bit shifted in from the byte below */
+ vv = (v << 1) & NBYTES(0xfe);
+ return vv;
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ unative_t MASK(unative_t v)
+{
+ unative_t vv;
+
+ /* keep only the top bit of every byte */
+ vv = v & NBYTES(0x80);
+ /* per flagged byte: 0x100 - 0x01 = 0xff, all within one word-wide subtract */
+ vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
+ return vv;
+}
+
+
+/*
+ * Compute P (XOR parity) and Q (RS syndrome) over all data pages using plain
+ * integer arithmetic on native words.
+ *
+ * ptrs holds disks page pointers: the data pages first, then the P page at
+ * index disks-2 and the Q page at index disks-1. bytes is the length of each
+ * page that is processed.
+ */
+static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+ wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+ /* walk down from the highest data disk: Q is doubled, then the next disk is added */
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ wp$$ ^= wd$$;
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+ w2$$ &= NBYTES(0x1d);
+ w1$$ ^= w2$$;
+ wq$$ = w1$$ ^ wd$$;
+ }
+ *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ }
+}
+
+/*
+ * Fold the contribution of data pages start..stop into existing P and Q
+ * using plain integer arithmetic on native words.
+ *
+ * P and Q (ptrs[disks-2] and ptrs[disks-1]) are XORed in place with the
+ * values computed from the pages in the range. Disks above stop are never
+ * touched; disks below start only advance Q, one doubling step per disk,
+ * without loading their data.
+ */
+static void raid6_int$#_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+ /* P/Q data pages */
+ wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+ for ( z = z0-1 ; z >= start ; z-- ) {
+ wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ wp$$ ^= wd$$;
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+ w2$$ &= NBYTES(0x1d);
+ w1$$ ^= w2$$;
+ wq$$ = w1$$ ^ wd$$;
+ }
+ /* P/Q left side optimization */
+ for ( z = start-1 ; z >= 0 ; z-- ) {
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+ w2$$ &= NBYTES(0x1d);
+ wq$$ = w1$$ ^ w2$$;
+ }
+ *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ }
+
+}
+
+/*
+ * Descriptor for this unroll factor of the generic integer implementation.
+ * Positional initializers: the two routines above, no validity hook, the
+ * name string (word size from NSTRING plus the unroll factor) and a trailing
+ * 0 (presumably the priority - confirm against struct raid6_calls).
+ */
+const struct raid6_calls raid6_intx$# = {
+ raid6_int$#_gen_syndrome,
+ raid6_int$#_xor_syndrome,
+ NULL, /* always valid */
+ "int" NSTRING "x$#",
+ 0
+};
diff --git a/lib/raid/raid6/loongarch/loongarch_simd.c b/lib/raid/raid6/loongarch/loongarch_simd.c
new file mode 100644
index 000000000000..72f4d92d4876
--- /dev/null
+++ b/lib/raid/raid6/loongarch/loongarch_simd.c
@@ -0,0 +1,423 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Based on the generic RAID-6 code (int.uc):
+ *
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/cpu-features.h>
+#include <asm/fpu.h>
+
+/*
+ * The vector algorithms are currently priority 0, which means the generic
+ * scalar algorithms are not being disabled if vector support is present.
+ * This is like the similar LoongArch RAID5 XOR code, with the main reason
+ * repeated here: it cannot be ruled out at this point in time that some
+ * future (maybe reduced) models could run the vector algorithms slower than
+ * the scalar ones, maybe for errata or micro-op reasons. It may be
+ * appropriate to revisit this after one or two more uarch generations.
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+#define NSIZE 16
+
+static int raid6_has_lsx(void)
+{
+ return cpu_has_lsx; /* hook listed in the raid6_lsx table; cpu_has_lsx comes from <asm/cpu-features.h> */
+}
+
+static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* Fills p and q from data blocks ptrs[0..disks-3], four 16-byte LSX vectors (64 bytes) per pass */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+ /* The asm below names registers directly and has no clobber lists: keep the statement order as is */
+ kernel_fpu_begin();
+
+ /*
+ * $vr0, $vr1, $vr2, $vr3: wp (running XOR parity)
+ * $vr4, $vr5, $vr6, $vr7: wq (running Q syndrome)
+ * $vr8, $vr9, $vr10, $vr11: wd (data just loaded from disk z)
+ * $vr12, $vr13, $vr14, $vr15: w2 (scratch: byte mask, then & 0x1d)
+ * $vr16, $vr17, $vr18, $vr19: w1 (scratch: wq shifted left by one)
+ */
+ for (d = 0; d < bytes; d += NSIZE*4) { /* bytes must be a multiple of 64 or the last pass overruns */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+ asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+ asm volatile("vori.b $vr4, $vr0, 0"); /* OR with 0 acts as a register copy: wq = wp */
+ asm volatile("vori.b $vr5, $vr1, 0");
+ asm volatile("vori.b $vr6, $vr2, 0");
+ asm volatile("vori.b $vr7, $vr3, 0");
+ for (z = z0-1; z >= 0; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+ asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("vxor.v $vr0, $vr0, $vr8");
+ asm volatile("vxor.v $vr1, $vr1, $vr9");
+ asm volatile("vxor.v $vr2, $vr2, $vr10");
+ asm volatile("vxor.v $vr3, $vr3, $vr11");
+ /* w2$$ = MASK(wq$$); signed per-byte compare against 0 flags bytes whose top bit is set */
+ asm volatile("vslti.b $vr12, $vr4, 0");
+ asm volatile("vslti.b $vr13, $vr5, 0");
+ asm volatile("vslti.b $vr14, $vr6, 0");
+ asm volatile("vslti.b $vr15, $vr7, 0");
+ /* w1$$ = SHLBYTE(wq$$); per-byte shift left by one */
+ asm volatile("vslli.b $vr16, $vr4, 1");
+ asm volatile("vslli.b $vr17, $vr5, 1");
+ asm volatile("vslli.b $vr18, $vr6, 1");
+ asm volatile("vslli.b $vr19, $vr7, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("vandi.b $vr12, $vr12, 0x1d");
+ asm volatile("vandi.b $vr13, $vr13, 0x1d");
+ asm volatile("vandi.b $vr14, $vr14, 0x1d");
+ asm volatile("vandi.b $vr15, $vr15, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("vxor.v $vr16, $vr16, $vr12");
+ asm volatile("vxor.v $vr17, $vr17, $vr13");
+ asm volatile("vxor.v $vr18, $vr18, $vr14");
+ asm volatile("vxor.v $vr19, $vr19, $vr15");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("vxor.v $vr4, $vr16, $vr8");
+ asm volatile("vxor.v $vr5, $vr17, $vr9");
+ asm volatile("vxor.v $vr6, $vr18, $vr10");
+ asm volatile("vxor.v $vr7, $vr19, $vr11");
+ }
+ /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+ asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
+ asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
+ asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
+ asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
+ /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+ asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
+ asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
+ asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
+ asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* XORs the P/Q contribution of data blocks start..stop into the existing p and q, 64 bytes per pass */
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+ /* The asm below names registers directly and has no clobber lists: keep the statement order as is */
+ kernel_fpu_begin();
+
+ /*
+ * $vr0, $vr1, $vr2, $vr3: wp (running XOR parity delta)
+ * $vr4, $vr5, $vr6, $vr7: wq (running Q syndrome delta)
+ * $vr8, $vr9, $vr10, $vr11: wd (data just loaded from disk z)
+ * $vr12, $vr13, $vr14, $vr15: w2 (scratch: byte mask, then & 0x1d)
+ * $vr16, $vr17, $vr18, $vr19: w1 (scratch: wq shifted left by one)
+ */
+ for (d = 0; d < bytes; d += NSIZE*4) { /* bytes must be a multiple of 64 or the last pass overruns */
+ /* P/Q data pages */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
+ asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
+ asm volatile("vori.b $vr4, $vr0, 0"); /* OR with 0 acts as a register copy: wq = wp */
+ asm volatile("vori.b $vr5, $vr1, 0");
+ asm volatile("vori.b $vr6, $vr2, 0");
+ asm volatile("vori.b $vr7, $vr3, 0");
+ for (z = z0-1; z >= start; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
+ asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("vxor.v $vr0, $vr0, $vr8");
+ asm volatile("vxor.v $vr1, $vr1, $vr9");
+ asm volatile("vxor.v $vr2, $vr2, $vr10");
+ asm volatile("vxor.v $vr3, $vr3, $vr11");
+ /* w2$$ = MASK(wq$$); signed per-byte compare against 0 flags bytes whose top bit is set */
+ asm volatile("vslti.b $vr12, $vr4, 0");
+ asm volatile("vslti.b $vr13, $vr5, 0");
+ asm volatile("vslti.b $vr14, $vr6, 0");
+ asm volatile("vslti.b $vr15, $vr7, 0");
+ /* w1$$ = SHLBYTE(wq$$); per-byte shift left by one */
+ asm volatile("vslli.b $vr16, $vr4, 1");
+ asm volatile("vslli.b $vr17, $vr5, 1");
+ asm volatile("vslli.b $vr18, $vr6, 1");
+ asm volatile("vslli.b $vr19, $vr7, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("vandi.b $vr12, $vr12, 0x1d");
+ asm volatile("vandi.b $vr13, $vr13, 0x1d");
+ asm volatile("vandi.b $vr14, $vr14, 0x1d");
+ asm volatile("vandi.b $vr15, $vr15, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("vxor.v $vr16, $vr16, $vr12");
+ asm volatile("vxor.v $vr17, $vr17, $vr13");
+ asm volatile("vxor.v $vr18, $vr18, $vr14");
+ asm volatile("vxor.v $vr19, $vr19, $vr15");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("vxor.v $vr4, $vr16, $vr8");
+ asm volatile("vxor.v $vr5, $vr17, $vr9");
+ asm volatile("vxor.v $vr6, $vr18, $vr10");
+ asm volatile("vxor.v $vr7, $vr19, $vr11");
+ }
+
+ /* P/Q left side optimization: disks below start are not read, wq is only advanced once per disk */
+ for (z = start-1; z >= 0; z--) {
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("vslti.b $vr12, $vr4, 0");
+ asm volatile("vslti.b $vr13, $vr5, 0");
+ asm volatile("vslti.b $vr14, $vr6, 0");
+ asm volatile("vslti.b $vr15, $vr7, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("vslli.b $vr16, $vr4, 1");
+ asm volatile("vslli.b $vr17, $vr5, 1");
+ asm volatile("vslli.b $vr18, $vr6, 1");
+ asm volatile("vslli.b $vr19, $vr7, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("vandi.b $vr12, $vr12, 0x1d");
+ asm volatile("vandi.b $vr13, $vr13, 0x1d");
+ asm volatile("vandi.b $vr14, $vr14, 0x1d");
+ asm volatile("vandi.b $vr15, $vr15, 0x1d");
+ /* wq$$ = w1$$ ^ w2$$; */
+ asm volatile("vxor.v $vr4, $vr16, $vr12");
+ asm volatile("vxor.v $vr5, $vr17, $vr13");
+ asm volatile("vxor.v $vr6, $vr18, $vr14");
+ asm volatile("vxor.v $vr7, $vr19, $vr15");
+ }
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; (read-modify-write via scratch $vr20 - $vr23)
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; (read-modify-write via scratch $vr24 - $vr27)
+ */
+ asm volatile(
+ "vld $vr20, %0\n\t"
+ "vld $vr21, %1\n\t"
+ "vld $vr22, %2\n\t"
+ "vld $vr23, %3\n\t"
+ "vld $vr24, %4\n\t"
+ "vld $vr25, %5\n\t"
+ "vld $vr26, %6\n\t"
+ "vld $vr27, %7\n\t"
+ "vxor.v $vr20, $vr20, $vr0\n\t"
+ "vxor.v $vr21, $vr21, $vr1\n\t"
+ "vxor.v $vr22, $vr22, $vr2\n\t"
+ "vxor.v $vr23, $vr23, $vr3\n\t"
+ "vxor.v $vr24, $vr24, $vr4\n\t"
+ "vxor.v $vr25, $vr25, $vr5\n\t"
+ "vxor.v $vr26, $vr26, $vr6\n\t"
+ "vxor.v $vr27, $vr27, $vr7\n\t"
+ "vst $vr20, %0\n\t"
+ "vst $vr21, %1\n\t"
+ "vst $vr22, %2\n\t"
+ "vst $vr23, %3\n\t"
+ "vst $vr24, %4\n\t"
+ "vst $vr25, %5\n\t"
+ "vst $vr26, %6\n\t"
+ "vst $vr27, %7\n\t"
+ : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+ "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
+ "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
+ "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
+ );
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lsx = { /* first four members are positional: order must match struct raid6_calls */
+ raid6_lsx_gen_syndrome, /* full P/Q generation */
+ raid6_lsx_xor_syndrome, /* partial P/Q update for disks start..stop */
+ raid6_has_lsx, /* availability check */
+ "lsx", /* algorithm name string */
+ .priority = 0 /* see the comment near the top of the file for reason */
+};
+
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+#define NSIZE 32
+
+static int raid6_has_lasx(void)
+{
+ return cpu_has_lasx; /* hook listed in the raid6_lasx table; cpu_has_lasx comes from <asm/cpu-features.h> */
+}
+
+static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* Fills p and q from data blocks ptrs[0..disks-3], two 32-byte LASX vectors (64 bytes) per pass */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+ /* The asm below names registers directly and has no clobber lists: keep the statement order as is */
+ kernel_fpu_begin();
+
+ /*
+ * $xr0, $xr1: wp (running XOR parity)
+ * $xr2, $xr3: wq (running Q syndrome)
+ * $xr4, $xr5: wd (data just loaded from disk z)
+ * $xr6, $xr7: w2 (scratch: byte mask, then & 0x1d)
+ * $xr8, $xr9: w1 (scratch: wq shifted left by one)
+ */
+ for (d = 0; d < bytes; d += NSIZE*2) { /* bytes must be a multiple of 64 or the last pass overruns */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("xvori.b $xr2, $xr0, 0"); /* OR with 0 acts as a register copy: wq = wp */
+ asm volatile("xvori.b $xr3, $xr1, 0");
+ for (z = z0-1; z >= 0; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+ /* w2$$ = MASK(wq$$); signed per-byte compare against 0 flags bytes whose top bit is set */
+ asm volatile("xvslti.b $xr6, $xr2, 0");
+ asm volatile("xvslti.b $xr7, $xr3, 0");
+ /* w1$$ = SHLBYTE(wq$$); per-byte shift left by one */
+ asm volatile("xvslli.b $xr8, $xr2, 1");
+ asm volatile("xvslli.b $xr9, $xr3, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+ asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("xvxor.v $xr8, $xr8, $xr6");
+ asm volatile("xvxor.v $xr9, $xr9, $xr7");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("xvxor.v $xr2, $xr8, $xr4");
+ asm volatile("xvxor.v $xr3, $xr9, $xr5");
+ }
+ /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
+ asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
+ asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
+ /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
+ asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
+ asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* XORs the P/Q contribution of data blocks start..stop into the existing p and q, 64 bytes per pass */
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+ /* The asm below names registers directly and has no clobber lists: keep the statement order as is */
+ kernel_fpu_begin();
+
+ /*
+ * $xr0, $xr1: wp (running XOR parity delta)
+ * $xr2, $xr3: wq (running Q syndrome delta)
+ * $xr4, $xr5: wd (data just loaded from disk z)
+ * $xr6, $xr7: w2 (scratch: byte mask, then & 0x1d)
+ * $xr8, $xr9: w1 (scratch: wq shifted left by one)
+ */
+ for (d = 0; d < bytes; d += NSIZE*2) { /* bytes must be a multiple of 64 or the last pass overruns */
+ /* P/Q data pages */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
+ asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
+ asm volatile("xvori.b $xr2, $xr0, 0"); /* OR with 0 acts as a register copy: wq = wp */
+ asm volatile("xvori.b $xr3, $xr1, 0");
+ for (z = z0-1; z >= start; z--) {
+ /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
+ asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
+ asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
+ /* wp$$ ^= wd$$; */
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+ /* w2$$ = MASK(wq$$); signed per-byte compare against 0 flags bytes whose top bit is set */
+ asm volatile("xvslti.b $xr6, $xr2, 0");
+ asm volatile("xvslti.b $xr7, $xr3, 0");
+ /* w1$$ = SHLBYTE(wq$$); per-byte shift left by one */
+ asm volatile("xvslli.b $xr8, $xr2, 1");
+ asm volatile("xvslli.b $xr9, $xr3, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+ asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+ /* w1$$ ^= w2$$; */
+ asm volatile("xvxor.v $xr8, $xr8, $xr6");
+ asm volatile("xvxor.v $xr9, $xr9, $xr7");
+ /* wq$$ = w1$$ ^ wd$$; */
+ asm volatile("xvxor.v $xr2, $xr8, $xr4");
+ asm volatile("xvxor.v $xr3, $xr9, $xr5");
+ }
+
+ /* P/Q left side optimization: disks below start are not read, wq is only advanced once per disk */
+ for (z = start-1; z >= 0; z--) {
+ /* w2$$ = MASK(wq$$); */
+ asm volatile("xvslti.b $xr6, $xr2, 0");
+ asm volatile("xvslti.b $xr7, $xr3, 0");
+ /* w1$$ = SHLBYTE(wq$$); */
+ asm volatile("xvslli.b $xr8, $xr2, 1");
+ asm volatile("xvslli.b $xr9, $xr3, 1");
+ /* w2$$ &= NBYTES(0x1d); */
+ asm volatile("xvandi.b $xr6, $xr6, 0x1d");
+ asm volatile("xvandi.b $xr7, $xr7, 0x1d");
+ /* wq$$ = w1$$ ^ w2$$; */
+ asm volatile("xvxor.v $xr2, $xr8, $xr6");
+ asm volatile("xvxor.v $xr3, $xr9, $xr7");
+ }
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; (read-modify-write via scratch $xr10, $xr11)
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; (read-modify-write via scratch $xr12, $xr13)
+ */
+ asm volatile(
+ "xvld $xr10, %0\n\t"
+ "xvld $xr11, %1\n\t"
+ "xvld $xr12, %2\n\t"
+ "xvld $xr13, %3\n\t"
+ "xvxor.v $xr10, $xr10, $xr0\n\t"
+ "xvxor.v $xr11, $xr11, $xr1\n\t"
+ "xvxor.v $xr12, $xr12, $xr2\n\t"
+ "xvxor.v $xr13, $xr13, $xr3\n\t"
+ "xvst $xr10, %0\n\t"
+ "xvst $xr11, %1\n\t"
+ "xvst $xr12, %2\n\t"
+ "xvst $xr13, %3\n\t"
+ : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
+ "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
+ );
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_lasx = { /* first four members are positional: order must match struct raid6_calls */
+ raid6_lasx_gen_syndrome, /* full P/Q generation */
+ raid6_lasx_xor_syndrome, /* partial P/Q update for disks start..stop */
+ raid6_has_lasx, /* availability check */
+ "lasx", /* algorithm name string */
+ .priority = 0 /* see the comment near the top of the file for reason */
+};
+#undef NSIZE
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/lib/raid/raid6/loongarch/recov_loongarch_simd.c b/lib/raid/raid6/loongarch/recov_loongarch_simd.c
new file mode 100644
index 000000000000..eb3a1e79f01f
--- /dev/null
+++ b/lib/raid/raid6/loongarch/recov_loongarch_simd.c
@@ -0,0 +1,514 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX)
+ *
+ * Copyright (C) 2023 WANG Xuerui <git@xen0n.name>
+ *
+ * Originally based on recov_avx2.c and recov_ssse3.c:
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/cpu-features.h>
+#include <asm/fpu.h>
+
+/*
+ * Unlike with the syndrome calculation algorithms, there's no boot-time
+ * selection of recovery algorithms by benchmarking, so we have to specify
+ * the priorities and hope the future cores will all have decent vector
+ * support (i.e. no LASX slower than LSX, or even scalar code).
+ */
+
+#ifdef CONFIG_CPU_HAS_LSX
+static int raid6_has_lsx(void)
+{
+ return cpu_has_lsx; /* .valid hook of raid6_recov_lsx; cpu_has_lsx comes from <asm/cpu-features.h> */
+}
+
+static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ /* Rebuilds the two failed data blocks ptrs[faila] and ptrs[failb] in place from P, Q and the other data */
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+ /* gen_syndrome now writes its P output into dp and its Q output into dq */
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables; NOTE(review): failb - faila as an index assumes faila < failb - confirm */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /*
+ * vr20, vr21: qmul (bytes 0-15: low-nibble table, bytes 16-31: high-nibble table)
+ * vr22, vr23: pbmul (same layout)
+ */
+ asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+ asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+ asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+ asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+ /* 64 bytes per pass; bytes must be a multiple of 64 or the unsigned counter wraps past zero */
+ while (bytes) {
+ /* vr4 - vr7: Q */
+ asm volatile("vld $vr4, %0" : : "m" (q[0]));
+ asm volatile("vld $vr5, %0" : : "m" (q[16]));
+ asm volatile("vld $vr6, %0" : : "m" (q[32]));
+ asm volatile("vld $vr7, %0" : : "m" (q[48]));
+ /* vr4 - vr7: Q + Qxy */
+ asm volatile("vld $vr8, %0" : : "m" (dq[0]));
+ asm volatile("vld $vr9, %0" : : "m" (dq[16]));
+ asm volatile("vld $vr10, %0" : : "m" (dq[32]));
+ asm volatile("vld $vr11, %0" : : "m" (dq[48]));
+ asm volatile("vxor.v $vr4, $vr4, $vr8");
+ asm volatile("vxor.v $vr5, $vr5, $vr9");
+ asm volatile("vxor.v $vr6, $vr6, $vr10");
+ asm volatile("vxor.v $vr7, $vr7, $vr11");
+ /* vr0 - vr3: P */
+ asm volatile("vld $vr0, %0" : : "m" (p[0]));
+ asm volatile("vld $vr1, %0" : : "m" (p[16]));
+ asm volatile("vld $vr2, %0" : : "m" (p[32]));
+ asm volatile("vld $vr3, %0" : : "m" (p[48]));
+ /* vr0 - vr3: P + Pxy */
+ asm volatile("vld $vr8, %0" : : "m" (dp[0]));
+ asm volatile("vld $vr9, %0" : : "m" (dp[16]));
+ asm volatile("vld $vr10, %0" : : "m" (dp[32]));
+ asm volatile("vld $vr11, %0" : : "m" (dp[48]));
+ asm volatile("vxor.v $vr0, $vr0, $vr8");
+ asm volatile("vxor.v $vr1, $vr1, $vr9");
+ asm volatile("vxor.v $vr2, $vr2, $vr10");
+ asm volatile("vxor.v $vr3, $vr3, $vr11");
+ /* Multiply (Q + Qxy) by the qmul constant: one 16-entry table lookup per nibble, results XORed */
+ /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */
+ asm volatile("vsrli.b $vr8, $vr4, 4");
+ asm volatile("vsrli.b $vr9, $vr5, 4");
+ asm volatile("vsrli.b $vr10, $vr6, 4");
+ asm volatile("vsrli.b $vr11, $vr7, 4");
+ /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */
+ asm volatile("vandi.b $vr4, $vr4, 0x0f");
+ asm volatile("vandi.b $vr5, $vr5, 0x0f");
+ asm volatile("vandi.b $vr6, $vr6, 0x0f");
+ asm volatile("vandi.b $vr7, $vr7, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4");
+ asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5");
+ asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6");
+ asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7");
+ /* lookup from qmul[16] */
+ asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8");
+ asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9");
+ asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10");
+ asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11");
+ /* vr16 - vr19: B(Q + Qxy) */
+ asm volatile("vxor.v $vr16, $vr8, $vr4");
+ asm volatile("vxor.v $vr17, $vr9, $vr5");
+ asm volatile("vxor.v $vr18, $vr10, $vr6");
+ asm volatile("vxor.v $vr19, $vr11, $vr7");
+ /* Same nibble-table multiply for (P + Pxy), this time with the pbmul constant; vr0 - vr3 stay intact */
+ /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */
+ asm volatile("vsrli.b $vr4, $vr0, 4");
+ asm volatile("vsrli.b $vr5, $vr1, 4");
+ asm volatile("vsrli.b $vr6, $vr2, 4");
+ asm volatile("vsrli.b $vr7, $vr3, 4");
+ /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */
+ asm volatile("vandi.b $vr12, $vr0, 0x0f");
+ asm volatile("vandi.b $vr13, $vr1, 0x0f");
+ asm volatile("vandi.b $vr14, $vr2, 0x0f");
+ asm volatile("vandi.b $vr15, $vr3, 0x0f");
+ /* lookup from pbmul[0] */
+ asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12");
+ asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13");
+ asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14");
+ asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15");
+ /* lookup from pbmul[16] */
+ asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4");
+ asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5");
+ asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6");
+ asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7");
+ /* vr4 - vr7: A(P + Pxy) */
+ asm volatile("vxor.v $vr4, $vr4, $vr12");
+ asm volatile("vxor.v $vr5, $vr5, $vr13");
+ asm volatile("vxor.v $vr6, $vr6, $vr14");
+ asm volatile("vxor.v $vr7, $vr7, $vr15");
+
+ /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx, stored over the temporary held in dq */
+ asm volatile("vxor.v $vr4, $vr4, $vr16");
+ asm volatile("vxor.v $vr5, $vr5, $vr17");
+ asm volatile("vxor.v $vr6, $vr6, $vr18");
+ asm volatile("vxor.v $vr7, $vr7, $vr19");
+ asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+ asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+ asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+ asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+ /* vr0 - vr3: P + Pxy + Dx = Dy, stored over the temporary held in dp */
+ asm volatile("vxor.v $vr0, $vr0, $vr4");
+ asm volatile("vxor.v $vr1, $vr1, $vr5");
+ asm volatile("vxor.v $vr2, $vr2, $vr6");
+ asm volatile("vxor.v $vr3, $vr3, $vr7");
+ asm volatile("vst $vr0, %0" : "=m" (dp[0]));
+ asm volatile("vst $vr1, %0" : "=m" (dp[16]));
+ asm volatile("vst $vr2, %0" : "=m" (dp[32]));
+ asm volatile("vst $vr3, %0" : "=m" (dp[48]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ /* Rebuilds the failed data block ptrs[faila] from Q and the other data, then corrects the P block in place */
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+ /* gen_syndrome now writes its Q output into dq and its P output into the real P block */
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ /* vr22, vr23: qmul (bytes 0-15: low-nibble table, bytes 16-31: high-nibble table) */
+ asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+ asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+ /* 64 bytes per pass; bytes must be a multiple of 64 or the unsigned counter wraps past zero */
+ while (bytes) {
+ /* vr0 - vr3: P + Dx */
+ asm volatile("vld $vr0, %0" : : "m" (p[0]));
+ asm volatile("vld $vr1, %0" : : "m" (p[16]));
+ asm volatile("vld $vr2, %0" : : "m" (p[32]));
+ asm volatile("vld $vr3, %0" : : "m" (p[48]));
+ /* vr4 - vr7: Qx */
+ asm volatile("vld $vr4, %0" : : "m" (dq[0]));
+ asm volatile("vld $vr5, %0" : : "m" (dq[16]));
+ asm volatile("vld $vr6, %0" : : "m" (dq[32]));
+ asm volatile("vld $vr7, %0" : : "m" (dq[48]));
+ /* vr4 - vr7: Q + Qx */
+ asm volatile("vld $vr8, %0" : : "m" (q[0]));
+ asm volatile("vld $vr9, %0" : : "m" (q[16]));
+ asm volatile("vld $vr10, %0" : : "m" (q[32]));
+ asm volatile("vld $vr11, %0" : : "m" (q[48]));
+ asm volatile("vxor.v $vr4, $vr4, $vr8");
+ asm volatile("vxor.v $vr5, $vr5, $vr9");
+ asm volatile("vxor.v $vr6, $vr6, $vr10");
+ asm volatile("vxor.v $vr7, $vr7, $vr11");
+ /* Multiply (Q + Qx) by the qmul constant: one 16-entry table lookup per nibble, results XORed */
+ /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */
+ asm volatile("vsrli.b $vr8, $vr4, 4");
+ asm volatile("vsrli.b $vr9, $vr5, 4");
+ asm volatile("vsrli.b $vr10, $vr6, 4");
+ asm volatile("vsrli.b $vr11, $vr7, 4");
+ /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */
+ asm volatile("vandi.b $vr4, $vr4, 0x0f");
+ asm volatile("vandi.b $vr5, $vr5, 0x0f");
+ asm volatile("vandi.b $vr6, $vr6, 0x0f");
+ asm volatile("vandi.b $vr7, $vr7, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4");
+ asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5");
+ asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6");
+ asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7");
+ /* lookup from qmul[16] */
+ asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8");
+ asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9");
+ asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10");
+ asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11");
+ /* vr4 - vr7: qmul(Q + Qx) = Dx, stored over the temporary held in dq */
+ asm volatile("vxor.v $vr4, $vr4, $vr8");
+ asm volatile("vxor.v $vr5, $vr5, $vr9");
+ asm volatile("vxor.v $vr6, $vr6, $vr10");
+ asm volatile("vxor.v $vr7, $vr7, $vr11");
+ asm volatile("vst $vr4, %0" : "=m" (dq[0]));
+ asm volatile("vst $vr5, %0" : "=m" (dq[16]));
+ asm volatile("vst $vr6, %0" : "=m" (dq[32]));
+ asm volatile("vst $vr7, %0" : "=m" (dq[48]));
+
+ /* vr0 - vr3: P + Dx + Dx = P, written back to the P block */
+ asm volatile("vxor.v $vr0, $vr0, $vr4");
+ asm volatile("vxor.v $vr1, $vr1, $vr5");
+ asm volatile("vxor.v $vr2, $vr2, $vr6");
+ asm volatile("vxor.v $vr3, $vr3, $vr7");
+ asm volatile("vst $vr0, %0" : "=m" (p[0]));
+ asm volatile("vst $vr1, %0" : "=m" (p[16]));
+ asm volatile("vst $vr2, %0" : "=m" (p[32]));
+ asm volatile("vst $vr3, %0" : "=m" (p[48]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lsx = { /* LSX recovery routines */
+ .data2 = raid6_2data_recov_lsx, /* two failed data blocks */
+ .datap = raid6_datap_recov_lsx, /* one failed data block plus P */
+ .valid = raid6_has_lsx,
+ .name = "lsx",
+ .priority = 1, /* NOTE(review): fixed by hand, no benchmarking; presumably higher wins - confirm */
+};
+#endif /* CONFIG_CPU_HAS_LSX */
+
+#ifdef CONFIG_CPU_HAS_LASX
+static int raid6_has_lasx(void)
+{
+ return cpu_has_lasx; /* .valid hook of raid6_recov_lasx; cpu_has_lasx comes from <asm/cpu-features.h> */
+}
+
+static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ /* Rebuilds the two failed data blocks ptrs[faila] and ptrs[failb] in place from P, Q and the other data */
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+ /* gen_syndrome now writes its P output into dp and its Q output into dq */
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks - 2] = p;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables; NOTE(review): failb - faila as an index assumes faila < failb - confirm */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /*
+ * xr20, xr21: qmul (16-byte tables loaded via the vr name, then replicated across the xr register)
+ * xr22, xr23: pbmul (same treatment)
+ */
+ asm volatile("vld $vr20, %0" : : "m" (qmul[0]));
+ asm volatile("vld $vr21, %0" : : "m" (qmul[16]));
+ asm volatile("vld $vr22, %0" : : "m" (pbmul[0]));
+ asm volatile("vld $vr23, %0" : : "m" (pbmul[16]));
+ asm volatile("xvreplve0.q $xr20, $xr20");
+ asm volatile("xvreplve0.q $xr21, $xr21");
+ asm volatile("xvreplve0.q $xr22, $xr22");
+ asm volatile("xvreplve0.q $xr23, $xr23");
+ /* 64 bytes per pass; bytes must be a multiple of 64 or the unsigned counter wraps past zero */
+ while (bytes) {
+ /* xr0, xr1: Q */
+ asm volatile("xvld $xr0, %0" : : "m" (q[0]));
+ asm volatile("xvld $xr1, %0" : : "m" (q[32]));
+ /* xr0, xr1: Q + Qxy */
+ asm volatile("xvld $xr4, %0" : : "m" (dq[0]));
+ asm volatile("xvld $xr5, %0" : : "m" (dq[32]));
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+ /* xr2, xr3: P */
+ asm volatile("xvld $xr2, %0" : : "m" (p[0]));
+ asm volatile("xvld $xr3, %0" : : "m" (p[32]));
+ /* xr2, xr3: P + Pxy */
+ asm volatile("xvld $xr4, %0" : : "m" (dp[0]));
+ asm volatile("xvld $xr5, %0" : : "m" (dp[32]));
+ asm volatile("xvxor.v $xr2, $xr2, $xr4");
+ asm volatile("xvxor.v $xr3, $xr3, $xr5");
+ /* Multiply (Q + Qxy) by the qmul constant: one 16-entry table lookup per nibble, results XORed */
+ /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */
+ asm volatile("xvsrli.b $xr4, $xr0, 4");
+ asm volatile("xvsrli.b $xr5, $xr1, 4");
+ /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */
+ asm volatile("xvandi.b $xr0, $xr0, 0x0f");
+ asm volatile("xvandi.b $xr1, $xr1, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0");
+ asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1");
+ /* lookup from qmul[16] */
+ asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4");
+ asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5");
+ /* xr6, xr7: B(Q + Qxy) */
+ asm volatile("xvxor.v $xr6, $xr4, $xr0");
+ asm volatile("xvxor.v $xr7, $xr5, $xr1");
+ /* Same nibble-table multiply for (P + Pxy), this time with the pbmul constant; xr2, xr3 stay intact */
+ /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */
+ asm volatile("xvsrli.b $xr4, $xr2, 4");
+ asm volatile("xvsrli.b $xr5, $xr3, 4");
+ /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */
+ asm volatile("xvandi.b $xr0, $xr2, 0x0f");
+ asm volatile("xvandi.b $xr1, $xr3, 0x0f");
+ /* lookup from pbmul[0] */
+ asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0");
+ asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1");
+ /* lookup from pbmul[16] */
+ asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+ asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+ /* xr0, xr1: A(P + Pxy) */
+ asm volatile("xvxor.v $xr0, $xr0, $xr4");
+ asm volatile("xvxor.v $xr1, $xr1, $xr5");
+
+ /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */
+ asm volatile("xvxor.v $xr0, $xr0, $xr6");
+ asm volatile("xvxor.v $xr1, $xr1, $xr7");
+
+ /* xr2, xr3: P + Pxy + Dx = Dy */
+ asm volatile("xvxor.v $xr2, $xr2, $xr0");
+ asm volatile("xvxor.v $xr3, $xr3, $xr1");
+ /* Dx replaces the temporary held in dq, Dy the temporary held in dp */
+ asm volatile("xvst $xr0, %0" : "=m" (dq[0]));
+ asm volatile("xvst $xr1, %0" : "=m" (dq[32]));
+ asm volatile("xvst $xr2, %0" : "=m" (dp[0]));
+ asm volatile("xvst $xr3, %0" : "=m" (dp[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ /* Rebuilds the failed data block ptrs[faila] from Q and the other data, then corrects the P block in place */
+ p = (u8 *)ptrs[disks - 2];
+ q = (u8 *)ptrs[disks - 1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks - 1] = dq;
+ /* gen_syndrome now writes its Q output into dq and its P output into the real P block */
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks - 1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ /* xr22, xr23: qmul (16-byte tables loaded via the vr name, then replicated across the xr register) */
+ asm volatile("vld $vr22, %0" : : "m" (qmul[0]));
+ asm volatile("xvreplve0.q $xr22, $xr22");
+ asm volatile("vld $vr23, %0" : : "m" (qmul[16]));
+ asm volatile("xvreplve0.q $xr23, $xr23");
+ /* 64 bytes per pass; bytes must be a multiple of 64 or the unsigned counter wraps past zero */
+ while (bytes) {
+ /* xr0, xr1: P + Dx */
+ asm volatile("xvld $xr0, %0" : : "m" (p[0]));
+ asm volatile("xvld $xr1, %0" : : "m" (p[32]));
+ /* xr2, xr3: Qx */
+ asm volatile("xvld $xr2, %0" : : "m" (dq[0]));
+ asm volatile("xvld $xr3, %0" : : "m" (dq[32]));
+ /* xr2, xr3: Q + Qx */
+ asm volatile("xvld $xr4, %0" : : "m" (q[0]));
+ asm volatile("xvld $xr5, %0" : : "m" (q[32]));
+ asm volatile("xvxor.v $xr2, $xr2, $xr4");
+ asm volatile("xvxor.v $xr3, $xr3, $xr5");
+ /* Multiply (Q + Qx) by the qmul constant: one 16-entry table lookup per nibble, results XORed */
+ /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */
+ asm volatile("xvsrli.b $xr4, $xr2, 4");
+ asm volatile("xvsrli.b $xr5, $xr3, 4");
+ /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */
+ asm volatile("xvandi.b $xr2, $xr2, 0x0f");
+ asm volatile("xvandi.b $xr3, $xr3, 0x0f");
+ /* lookup from qmul[0] */
+ asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2");
+ asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3");
+ /* lookup from qmul[16] */
+ asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4");
+ asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5");
+ /* xr2, xr3: qmul(Q + Qx) = Dx */
+ asm volatile("xvxor.v $xr2, $xr2, $xr4");
+ asm volatile("xvxor.v $xr3, $xr3, $xr5");
+
+ /* xr0, xr1: P + Dx + Dx = P */
+ asm volatile("xvxor.v $xr0, $xr0, $xr2");
+ asm volatile("xvxor.v $xr1, $xr1, $xr3");
+ /* Dx replaces the temporary held in dq; the corrected P goes back to the P block */
+ asm volatile("xvst $xr2, %0" : "=m" (dq[0]));
+ asm volatile("xvst $xr3, %0" : "=m" (dq[32]));
+ asm volatile("xvst $xr0, %0" : "=m" (p[0]));
+ asm volatile("xvst $xr1, %0" : "=m" (p[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_lasx = { /* LASX recovery routines */
+ .data2 = raid6_2data_recov_lasx, /* two failed data blocks */
+ .datap = raid6_datap_recov_lasx, /* one failed data block plus P */
+ .valid = raid6_has_lasx,
+ .name = "lasx",
+ .priority = 2, /* NOTE(review): fixed by hand, no benchmarking; presumably higher wins - confirm */
+};
+#endif /* CONFIG_CPU_HAS_LASX */
diff --git a/lib/raid/raid6/mktables.c b/lib/raid/raid6/mktables.c
new file mode 100644
index 000000000000..3de1dbf6846c
--- /dev/null
+++ b/lib/raid/raid6/mktables.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * mktables.c
+ *
+ * Make RAID-6 tables. This is a host user space program to be run at
+ * compile time.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <time.h>
+
+ /*
+ * Multiply two elements of GF(2^8), reducing with the constant 0x1d.
+ *
+ * Shift-and-add: for every set bit of @b (lowest first) the current
+ * multiple of @a is XORed into the product, then @a is doubled in the
+ * field for the next bit.
+ */
+ static uint8_t gfmul(uint8_t a, uint8_t b)
+ {
+	uint8_t product = 0;
+
+	for (; b != 0; b >>= 1) {
+		if (b & 1)
+			product ^= a;
+
+		/* Double a; fold the bit shifted out of the top back in */
+		if (a & 0x80)
+			a = (a << 1) ^ 0x1d;
+		else
+			a = a << 1;
+	}
+
+	return product;
+ }
+
+ /*
+ * Raise @a to the power @b in GF(2^8) by square-and-multiply.
+ *
+ * The exponent is first reduced modulo 255 into the range 0..254, so
+ * negative exponents are accepted as well.
+ */
+ static uint8_t gfpow(uint8_t a, int b)
+ {
+	uint8_t result = 1;
+	int exponent = b % 255;
+
+	if (exponent < 0)
+		exponent += 255;
+
+	for (; exponent != 0; exponent >>= 1) {
+		if (exponent & 1)
+			result = gfmul(result, a);
+		a = gfmul(a, a);
+	}
+
+	return result;
+ }
+
+ /*
+ * Print the RAID-6 Galois field lookup tables as C source on stdout:
+ * raid6_gfmul, raid6_vgfmul, raid6_gfexp, raid6_gflog, raid6_gfinv and
+ * raid6_gfexi, each followed by its EXPORT_SYMBOL() line.
+ *
+ * The command line arguments are not used.
+ *
+ * Returns 0 on success and 1 if the output stream reported an error, so
+ * that whoever runs the generator can detect a truncated table file.
+ */
+ int main(int argc, char *argv[])
+ {
+	int i, j, k;
+	uint8_t v;
+	uint8_t exptbl[256], invtbl[256];
+
+	printf("#include <linux/export.h>\n");
+	printf("#include <linux/raid/pq.h>\n");
+
+	/* Compute multiplication table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfmul[256][256] =\n"
+	       "{\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t{\n");
+		for (j = 0; j < 256; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, j + k),
+				       (k == 7) ? '\n' : ' ');
+		}
+		printf("\t},\n");
+	}
+	printf("};\n");
+	printf("EXPORT_SYMBOL(raid6_gfmul);\n");
+
+	/*
+	 * Compute vector multiplication table: for each multiplier i, the
+	 * first 16 bytes are i times the low-nibble values 0..15 and the
+	 * next 16 bytes are i times the high-nibble values (0..15) << 4.
+	 */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_vgfmul[256][32] =\n"
+	       "{\n");
+	for (i = 0; i < 256; i++) {
+		printf("\t{\n");
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, j + k),
+				       (k == 7) ? '\n' : ' ');
+		}
+		for (j = 0; j < 16; j += 8) {
+			printf("\t\t");
+			for (k = 0; k < 8; k++)
+				printf("0x%02x,%c", gfmul(i, (j + k) << 4),
+				       (k == 7) ? '\n' : ' ');
+		}
+		printf("\t},\n");
+	}
+	printf("};\n");
+	printf("EXPORT_SYMBOL(raid6_vgfmul);\n");
+
+	/* Compute power-of-2 table (exponent) */
+	v = 1;
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfexp[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			exptbl[i + j] = v;
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+			v = gfmul(v, 2);
+			if (v == 1)
+				v = 0;	/* For entry 255, not a real entry */
+		}
+	}
+	printf("};\n");
+	printf("EXPORT_SYMBOL(raid6_gfexp);\n");
+
+	/*
+	 * Compute log-of-2 table: the first index k with exptbl[k] == value,
+	 * or 255 when the value does not occur in exptbl.
+	 */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gflog[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			v = 255;
+			for (k = 0; k < 256; k++)
+				if (exptbl[k] == (i + j)) {
+					v = k;
+					break;
+				}
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("EXPORT_SYMBOL(raid6_gflog);\n");
+
+	/* Compute inverse table x^-1 == x^254 */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfinv[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++) {
+			invtbl[i + j] = v = gfpow(i + j, 254);
+			printf("0x%02x,%c", v, (j == 7) ? '\n' : ' ');
+		}
+	}
+	printf("};\n");
+	printf("EXPORT_SYMBOL(raid6_gfinv);\n");
+
+	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
+	printf("\nconst u8 __attribute__((aligned(256)))\n"
+	       "raid6_gfexi[256] =\n" "{\n");
+	for (i = 0; i < 256; i += 8) {
+		printf("\t");
+		for (j = 0; j < 8; j++)
+			printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1],
+			       (j == 7) ? '\n' : ' ');
+	}
+	printf("};\n");
+	printf("EXPORT_SYMBOL(raid6_gfexi);\n");
+
+	/*
+	 * The tables are only useful if all of them were written; report a
+	 * failed or short write instead of exiting successfully.
+	 */
+	if (fflush(stdout) != 0 || ferror(stdout))
+		return 1;
+
+	return 0;
+ }
diff --git a/lib/raid/raid6/powerpc/altivec.uc b/lib/raid/raid6/powerpc/altivec.uc
new file mode 100644
index 000000000000..130d3d3dd42c
--- /dev/null
+++ b/lib/raid/raid6/powerpc/altivec.uc
@@ -0,0 +1,122 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6altivec$#.c
+ *
+ * $#-way unrolled portable integer math RAID-6 instruction set
+ *
+ * This file is postprocessed using unroll.awk
+ *
+ * <benh> hpa: in process,
+ * you can just "steal" the vec unit with enable_kernel_altivec() (but
+ * bracked this with preempt_disable/enable or in a lock)
+ */
+
+#include <linux/raid/pq.h>
+
+#include <altivec.h>
+#include <asm/cputable.h>
+#include <asm/switch_to.h>
+
+/*
+ * This is the C data type to use. We use a vector of
+ * signed char so vec_cmpgt() will generate the right
+ * instruction.
+ */
+
+ typedef vector signed char unative_t;
+
+ /* Build a vector whose 16 byte elements all hold the value x. */
+ #define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
+ /* Size in bytes of one unative_t vector. */
+ #define NSIZE sizeof(unative_t)
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+ static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+ {
+ /* Adding every byte element to itself doubles it, i.e. shifts it left by 1 */
+ return vec_add(v,v);
+ }
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+ static inline __attribute_const__ unative_t MASK(unative_t v)
+ {
+	/*
+	 * Compare an all-zero vector against v with vec_cmpgt(); the result
+	 * is a vector bool char, hence the cast back to unative_t.
+	 */
+	return (unative_t)vec_cmpgt(NBYTES(0), v);
+ }
+
+
+ /*
+ * Compute the P (XOR parity) and Q (RS syndrome) blocks from the data
+ * blocks using Altivec vector operations.
+ *
+ * disks: number of entries in ptrs; the last two are the P and Q blocks
+ * bytes: number of bytes to process in each block
+ * ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * The data disks are walked from the highest index down. For every
+ * further disk the running Q value is multiplied by 2 in GF(2^8) (shift
+ * each byte left, then XOR 0x1d into the bytes whose high bit was set)
+ * before the disk's data is XORed in; P is the plain XOR of all data.
+ *
+ * NOTE(review): the per-lane variables and statements are presumably
+ * replicated for each unrolled lane by unroll.awk - confirm against the
+ * script. The loop has no tail handling, so bytes is presumably a multiple
+ * of the per-iteration step - confirm against the callers.
+ *
+ * This is noinline to make damned sure that gcc doesn't move any of the
+ * Altivec code around the enable/disable code
+ */
+ static void noinline
+ raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs)
+ {
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+ unative_t x1d = NBYTES(0x1d);
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+ /* Seed both P and Q with the data of the highest data disk */
+ wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ wp$$ = vec_xor(wp$$, wd$$);
+ /* Q = (Q * 2) ^ data: shift, then reduce the overflowed bytes */
+ w2$$ = MASK(wq$$);
+ w1$$ = SHLBYTE(wq$$);
+ w2$$ = vec_and(w2$$, x1d);
+ w1$$ = vec_xor(w1$$, w2$$);
+ wq$$ = vec_xor(w1$$, wd$$);
+ }
+ *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ }
+ }
+
+ /*
+ * Entry point stored in the method table: runs the real syndrome
+ * computation with preemption disabled and kernel Altivec use enabled,
+ * then undoes both in reverse order.
+ */
+ static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+ {
+ preempt_disable();
+ enable_kernel_altivec();
+
+ raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs);
+
+ disable_kernel_altivec();
+ preempt_enable();
+ }
+
+ /*
+ * Availability check referenced by the method table below. The prototype
+ * is always visible, but the body is only emitted when the unroll factor
+ * is 1; NOTE(review): presumably so that the variants generated from this
+ * template share a single definition - confirm against the Makefile.
+ */
+ int raid6_have_altivec(void);
+ #if $# == 1
+ int raid6_have_altivec(void)
+ {
+ /* This assumes either all CPUs have Altivec or none does */
+ return cpu_has_feature(CPU_FTR_ALTIVEC);
+ }
+ #endif
+
+ /*
+ * Method table for this Altivec variant. The initializer is positional:
+ * syndrome generator, no XOR routine, availability check, name string and
+ * a trailing 0. NOTE(review): the last value is presumably the priority
+ * field - confirm against the definition of struct raid6_calls.
+ */
+ const struct raid6_calls raid6_altivec$# = {
+ raid6_altivec$#_gen_syndrome,
+ NULL, /* XOR not yet implemented */
+ raid6_have_altivec,
+ "altivecx$#",
+ 0
+ };
diff --git a/lib/raid/raid6/powerpc/vpermxor.uc b/lib/raid/raid6/powerpc/vpermxor.uc
new file mode 100644
index 000000000000..595f20aaf4cf
--- /dev/null
+++ b/lib/raid/raid6/powerpc/vpermxor.uc
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2017, Matt Brown, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * vpermxor$#.c
+ *
+ * Based on H. Peter Anvin's paper - The mathematics of RAID-6
+ *
+ * $#-way unrolled vpermxor-based RAID-6 syndrome generation.
+ * This file is postprocessed using unroll.awk
+ *
+ * vpermxor$#.c makes use of the vpermxor instruction to optimise the RAID6 Q
+ * syndrome calculations.
+ * This can be run on systems which have both Altivec and vpermxor instruction.
+ *
+ * This instruction was introduced in POWER8 - ISA v2.07.
+ */
+
+#include <linux/raid/pq.h>
+#include <altivec.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+#include <asm/switch_to.h>
+
+ /* Native vector type for this file and its size in bytes. */
+ typedef vector unsigned char unative_t;
+ #define NSIZE sizeof(unative_t)
+
+ /*
+ * Lookup tables passed to the vpermxor instruction below. Read from the
+ * last element to the first, gf_low holds 2 * n and gf_high holds
+ * 2 * (n << 4) for n = 0..15, with the doubling done in GF(2^8) using the
+ * reduction constant 0x1d (e.g. 2 * 0x80 = 0x1d, 2 * 0xf0 = 0xfd).
+ */
+ static const vector unsigned char gf_low = {0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14,
+ 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08,
+ 0x06, 0x04, 0x02,0x00};
+ static const vector unsigned char gf_high = {0xfd, 0xdd, 0xbd, 0x9d, 0x7d, 0x5d,
+ 0x3d, 0x1d, 0xe0, 0xc0, 0xa0, 0x80,
+ 0x60, 0x40, 0x20, 0x00};
+
+ /*
+ * Compute the P (XOR parity) and Q (RS syndrome) blocks from the data
+ * blocks, using the vpermxor instruction for the Q update.
+ *
+ * disks: number of entries in ptrs; the last two are the P and Q blocks
+ * bytes: number of bytes to process in each block
+ * ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * The data disks are walked from the highest index down; P accumulates
+ * the XOR of all data, and Q is passed through VPERMXOR with the gf_high
+ * and gf_low tables before each disk's data is XORed in.
+ *
+ * NOTE(review): the VPERMXOR step presumably multiplies every byte of Q
+ * by 2 in GF(2^8), as the table contents suggest - confirm against the
+ * instruction definition. The loop has no tail handling, so bytes is
+ * presumably a multiple of the per-iteration step - confirm with callers.
+ */
+ static void noinline raid6_vpermxor$#_gen_syndrome_real(int disks, size_t bytes,
+ void **ptrs)
+ {
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ unative_t wp$$, wq$$, wd$$;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ for (d = 0; d < bytes; d += NSIZE*$#) {
+ /* Seed both P and Q with the data of the highest data disk */
+ wp$$ = wq$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
+
+ for (z = z0-1; z>=0; z--) {
+ wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ /* P syndrome */
+ wp$$ = vec_xor(wp$$, wd$$);
+
+ /* Q syndrome */
+ asm(VPERMXOR(%0,%1,%2,%3):"=v"(wq$$):"v"(gf_high), "v"(gf_low), "v"(wq$$));
+ wq$$ = vec_xor(wq$$, wd$$);
+ }
+ *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ }
+ }
+
+ /*
+ * Entry point stored in the method table: runs the real syndrome
+ * computation with preemption disabled and kernel Altivec use enabled,
+ * then undoes both in reverse order.
+ */
+ static void raid6_vpermxor$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+ {
+ preempt_disable();
+ enable_kernel_altivec();
+
+ raid6_vpermxor$#_gen_syndrome_real(disks, bytes, ptrs);
+
+ disable_kernel_altivec();
+ preempt_enable();
+ }
+
+ /*
+ * Availability check referenced by the method table below. The prototype
+ * is always visible, but the body is only emitted when the unroll factor
+ * is 1; NOTE(review): presumably so that the variants generated from this
+ * template share a single definition - confirm against the Makefile.
+ */
+ int raid6_have_altivec_vpermxor(void);
+ #if $# == 1
+ int raid6_have_altivec_vpermxor(void)
+ {
+ /* Check if arch has both altivec and the vpermxor instructions */
+ return (cpu_has_feature(CPU_FTR_ALTIVEC_COMP) &&
+ cpu_has_feature(CPU_FTR_ARCH_207S));
+ }
+ #endif
+
+ /*
+ * Method table for this vpermxor variant. The initializer is positional:
+ * syndrome generator, NULL for the second slot, availability check, name
+ * string and a trailing 0. NOTE(review): the second slot is presumably the
+ * XOR-syndrome routine and the last value the priority - confirm against
+ * the definition of struct raid6_calls.
+ */
+ const struct raid6_calls raid6_vpermxor$# = {
+ raid6_vpermxor$#_gen_syndrome,
+ NULL,
+ raid6_have_altivec_vpermxor,
+ "vpermxor$#",
+ 0
+ };
diff --git a/lib/raid/raid6/recov.c b/lib/raid/raid6/recov.c
new file mode 100644
index 000000000000..8d113196632e
--- /dev/null
+++ b/lib/raid/raid6/recov.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6/recov.c
+ *
+ * RAID-6 data recovery in dual failure mode. In single failure mode,
+ * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct
+ * the syndrome.)
+ */
+
+#include <linux/raid/pq.h>
+
+ /*
+ * Recover two failed data blocks, byte by byte with table lookups.
+ *
+ * disks: number of entries in ptrs; the last two are the P and Q blocks
+ * bytes: number of bytes in each block
+ * faila, failb: indices in ptrs of the two blocks to rebuild
+ * ptrs: block pointer table; temporarily modified, restored before return
+ */
+ static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila,
+		int failb, void **ptrs)
+ {
+	u8 *const p = (u8 *)ptrs[disks-2];
+	u8 *const q = (u8 *)ptrs[disks-1];
+	u8 *dp, *dq;
+	const u8 *pbmul;	/* P multiplier table for B data */
+	const u8 *qmul;		/* Q multiplier table (for both) */
+	size_t i;
+
+	/*
+	 * Generate the syndrome with zero pages standing in for the two
+	 * missing blocks; the dead data pages take the place of the P and Q
+	 * outputs and so receive delta p and delta q.
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = raid6_get_zero_page();
+	ptrs[disks-2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = raid6_get_zero_page();
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Put the pointer table back the way the caller passed it */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks-2] = p;
+	ptrs[disks-1] = q;
+
+	/* Select the multiplication tables for this pair of failures */
+	pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
+	qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+	for (i = 0; i < bytes; i++) {
+		const u8 px = p[i] ^ dp[i];
+		const u8 qx = qmul[q[i] ^ dq[i]];
+		const u8 db = pbmul[px] ^ qx;
+
+		dq[i] = db;		/* Reconstructed B */
+		dp[i] = db ^ px;	/* Reconstructed A */
+	}
+ }
+
+ /*
+ * Recover one failed data block together with the P block, byte by byte
+ * with table lookups.
+ *
+ * disks: number of entries in ptrs; the last two are the P and Q blocks
+ * bytes: number of bytes in each block
+ * faila: index in ptrs of the data block to rebuild
+ * ptrs: block pointer table; temporarily modified, restored before return
+ */
+ static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila,
+		void **ptrs)
+ {
+	u8 *const p = (u8 *)ptrs[disks-2];
+	u8 *const q = (u8 *)ptrs[disks-1];
+	u8 *dq;
+	const u8 *qmul;		/* Q multiplier table */
+	size_t i;
+
+	/*
+	 * Generate the syndrome with a zero page standing in for the missing
+	 * data block; the dead data page takes the place of the Q output and
+	 * so receives delta q.
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = raid6_get_zero_page();
+	ptrs[disks-1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Put the pointer table back the way the caller passed it */
+	ptrs[faila] = dq;
+	ptrs[disks-1] = q;
+
+	/* Select the multiplication table for this failure */
+	qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+	for (i = 0; i < bytes; i++) {
+		const u8 d = qmul[q[i] ^ dq[i]];
+
+		dq[i] = d;	/* Reconstructed data */
+		p[i] ^= d;	/* Fold it into P */
+	}
+ }
+
+
+ /*
+ * Recovery method table for the plain integer implementation above.
+ * NOTE(review): .valid is NULL, presumably meaning "always usable", and
+ * priority 0 is the lowest value among the tables in this directory -
+ * confirm both against the selection code.
+ */
+ const struct raid6_recov_calls raid6_recov_intx1 = {
+ .data2 = raid6_2data_recov_intx1,
+ .datap = raid6_datap_recov_intx1,
+ .valid = NULL,
+ .name = "intx1",
+ .priority = 0,
+ };
diff --git a/lib/raid/raid6/riscv/recov_rvv.c b/lib/raid/raid6/riscv/recov_rvv.c
new file mode 100644
index 000000000000..40c393206b6a
--- /dev/null
+++ b/lib/raid/raid6/riscv/recov_rvv.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ */
+
+#include <linux/raid/pq.h>
+#include "rvv.h"
+
+ /*
+ * Vector core of the two-data-block recovery: 16 bytes per iteration.
+ *
+ * @bytes: number of bytes to process; consumed in steps of 16
+ * @p, @q: P and Q blocks
+ * @dp, @dq: delta P / delta Q on entry, the two rebuilt blocks on exit
+ * @pbmul, @qmul: 32-byte multiplier tables; bytes 0..15 are indexed by the
+ * low nibble and bytes 16..31 by the high nibble of the input byte
+ *
+ * NOTE(review): the loop only stops when @bytes reaches exactly 0, so
+ * @bytes is presumably always a multiple of 16 - confirm with the callers.
+ *
+ * The asm reads and writes the buffers through plain register operands,
+ * so it carries a "memory" clobber to tell the compiler that memory is
+ * accessed behind its back.
+ */
+ static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp,
+				    u8 *dq, const u8 *pbmul,
+				    const u8 *qmul)
+ {
+	/* Request a vector length of 16 byte elements */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli x0, %[avl], e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : :
+		      [avl]"r"(16)
+	);
+
+	/*
+	 * while ( bytes-- ) {
+	 *	uint8_t px, qx, db;
+	 *
+	 *	px = *p ^ *dp;
+	 *	qx = qmul[*q ^ *dq];
+	 *	*dq++ = db = pbmul[px] ^ qx;
+	 *	*dp++ = db ^ px;
+	 *	p++; q++;
+	 * }
+	 */
+	while (bytes) {
+		/*
+		 * v0:px, v1:dp,
+		 * v2:qx, v3:dq,
+		 * v4:vx, v5:vy,
+		 * v6:qm0, v7:qm1,
+		 * v8:pm0, v9:pm1,
+		 * v14:p/qm[vx], v15:p/qm[vy]
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[px])\n"
+			      "vle8.v v1, (%[dp])\n"
+			      "vxor.vv v0, v0, v1\n"
+			      "vle8.v v2, (%[qx])\n"
+			      "vle8.v v3, (%[dq])\n"
+			      "vxor.vv v4, v2, v3\n"
+			      "vsrl.vi v5, v4, 4\n"
+			      "vand.vi v4, v4, 0xf\n"
+			      "vle8.v v6, (%[qm0])\n"
+			      "vle8.v v7, (%[qm1])\n"
+			      "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */
+			      "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */
+			      "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */
+
+			      "vsrl.vi v5, v0, 4\n"
+			      "vand.vi v4, v0, 0xf\n"
+			      "vle8.v v8, (%[pm0])\n"
+			      "vle8.v v9, (%[pm1])\n"
+			      "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */
+			      "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */
+			      "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */
+			      "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */
+			      "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */
+			      "vse8.v v3, (%[dq])\n"
+			      "vse8.v v1, (%[dp])\n"
+			      ".option pop\n"
+			      : :
+			      [px]"r"(p),
+			      [dp]"r"(dp),
+			      [qx]"r"(q),
+			      [dq]"r"(dq),
+			      [qm0]"r"(qmul),
+			      [qm1]"r"(qmul + 16),
+			      [pm0]"r"(pbmul),
+			      [pm1]"r"(pbmul + 16)
+			      : "memory");
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dp += 16;
+		dq += 16;
+	}
+ }
+
+ /*
+ * Vector core of the data+P recovery: 16 bytes per iteration.
+ *
+ * @bytes: number of bytes to process; consumed in steps of 16
+ * @p: P block, updated in place
+ * @q: Q block
+ * @dq: delta Q on entry, the rebuilt data block on exit
+ * @qmul: 32-byte multiplier table; bytes 0..15 are indexed by the low
+ * nibble and bytes 16..31 by the high nibble of the input byte
+ *
+ * NOTE(review): the loop only stops when @bytes reaches exactly 0, so
+ * @bytes is presumably always a multiple of 16 - confirm with the callers.
+ *
+ * The asm reads and writes the buffers through plain register operands,
+ * so it carries a "memory" clobber to tell the compiler that memory is
+ * accessed behind its back.
+ */
+ static void __raid6_datap_recov_rvv(int bytes, u8 *p, u8 *q,
+				    u8 *dq, const u8 *qmul)
+ {
+	/* Request a vector length of 16 byte elements */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli x0, %[avl], e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : :
+		      [avl]"r"(16)
+	);
+
+	/*
+	 * while (bytes--) {
+	 *	*p++ ^= *dq = qmul[*q ^ *dq];
+	 *	q++; dq++;
+	 * }
+	 */
+	while (bytes) {
+		/*
+		 * v0:vx, v1:vy,
+		 * v2:dq, v3:p,
+		 * v4:qm0, v5:qm1,
+		 * v10:m[vx], v11:m[vy]
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[vx])\n"
+			      "vle8.v v2, (%[dq])\n"
+			      "vxor.vv v0, v0, v2\n"
+			      "vsrl.vi v1, v0, 4\n"
+			      "vand.vi v0, v0, 0xf\n"
+			      "vle8.v v4, (%[qm0])\n"
+			      "vle8.v v5, (%[qm1])\n"
+			      "vrgather.vv v10, v4, v0\n"
+			      "vrgather.vv v11, v5, v1\n"
+			      "vxor.vv v0, v10, v11\n"
+			      "vle8.v v1, (%[vy])\n"
+			      "vxor.vv v1, v0, v1\n"
+			      "vse8.v v0, (%[dq])\n"
+			      "vse8.v v1, (%[vy])\n"
+			      ".option pop\n"
+			      : :
+			      [vx]"r"(q),
+			      [vy]"r"(p),
+			      [dq]"r"(dq),
+			      [qm0]"r"(qmul),
+			      [qm1]"r"(qmul + 16)
+			      : "memory");
+
+		bytes -= 16;
+		p += 16;
+		q += 16;
+		dq += 16;
+	}
+ }
+
+ /*
+ * Recover two failed data blocks with the RVV helper.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @bytes: number of bytes in each block
+ * @faila, @failb: indices in @ptrs of the two blocks to rebuild
+ * @ptrs: block pointer table; temporarily modified, restored before return
+ */
+ static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila,
+				  int failb, void **ptrs)
+ {
+	/* The multiplier tables depend only on the two failed indices */
+	const u8 *pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
+	const u8 *qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+						  raid6_gfexp[failb]]];
+	u8 *const p = (u8 *)ptrs[disks - 2];
+	u8 *const q = (u8 *)ptrs[disks - 1];
+	u8 *dp, *dq;
+
+	/*
+	 * Generate the syndrome with zero pages standing in for the two
+	 * missing blocks; the dead data pages take the place of the P and Q
+	 * outputs and so receive delta p and delta q.
+	 */
+	dp = (u8 *)ptrs[faila];
+	ptrs[faila] = raid6_get_zero_page();
+	ptrs[disks - 2] = dp;
+	dq = (u8 *)ptrs[failb];
+	ptrs[failb] = raid6_get_zero_page();
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Put the pointer table back the way the caller passed it */
+	ptrs[faila] = dp;
+	ptrs[failb] = dq;
+	ptrs[disks - 2] = p;
+	ptrs[disks - 1] = q;
+
+	/* The vector helper runs inside a kernel vector section */
+	kernel_vector_begin();
+	__raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul);
+	kernel_vector_end();
+ }
+
+ /*
+ * Recover one failed data block together with the P block with the RVV
+ * helper.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @bytes: number of bytes in each block
+ * @faila: index in @ptrs of the data block to rebuild
+ * @ptrs: block pointer table; temporarily modified, restored before return
+ */
+ static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila,
+				  void **ptrs)
+ {
+	/* The multiplier table depends only on the failed index */
+	const u8 *qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+	u8 *const p = (u8 *)ptrs[disks - 2];
+	u8 *const q = (u8 *)ptrs[disks - 1];
+	u8 *dq;
+
+	/*
+	 * Generate the syndrome with a zero page standing in for the missing
+	 * data block; the dead data page takes the place of the Q output and
+	 * so receives delta q.
+	 */
+	dq = (u8 *)ptrs[faila];
+	ptrs[faila] = raid6_get_zero_page();
+	ptrs[disks - 1] = dq;
+
+	raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+	/* Put the pointer table back the way the caller passed it */
+	ptrs[faila] = dq;
+	ptrs[disks - 1] = q;
+
+	/* The vector helper runs inside a kernel vector section */
+	kernel_vector_begin();
+	__raid6_datap_recov_rvv(bytes, p, q, dq, qmul);
+	kernel_vector_end();
+ }
+
+ /*
+ * Recovery method table for the RVV implementation: the two-data-block
+ * and data+P recovery routines, the availability check and the name "rvv".
+ * NOTE(review): priority 1 sits above the "intx1" table (0); presumably a
+ * higher value is preferred - confirm against the selection code.
+ */
+ const struct raid6_recov_calls raid6_recov_rvv = {
+ .data2 = raid6_2data_recov_rvv,
+ .datap = raid6_datap_recov_rvv,
+ .valid = rvv_has_vector,
+ .name = "rvv",
+ .priority = 1,
+ };
diff --git a/lib/raid/raid6/riscv/rvv.c b/lib/raid/raid6/riscv/rvv.c
new file mode 100644
index 000000000000..75c9dafedb28
--- /dev/null
+++ b/lib/raid/raid6/riscv/rvv.c
@@ -0,0 +1,1228 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RAID-6 syndrome calculation using RISC-V vector instructions
+ *
+ * Copyright 2024 Institute of Software, CAS.
+ * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+ *
+ * Based on neon.uc:
+ * Copyright 2002-2004 H. Peter Anvin
+ */
+
+#include "rvv.h"
+
+#ifdef __riscv_vector
+#error "This code must be built without compiler support for vector"
+#endif
+
+ /*
+ * Compute the P (XOR parity) and Q (RS syndrome) blocks from the data
+ * blocks, one vector register's worth of bytes per loop iteration.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @bytes: number of bytes to process in each block
+ * @ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * NOTE(review): there is no tail handling, so @bytes is presumably a
+ * multiple of the vector length returned by vsetvli - confirm with the
+ * callers. Vector state is presumably set up by the caller as well.
+ *
+ * Every asm statement that loads or stores through a pointer operand
+ * carries a "memory" clobber, because the buffers are only passed as
+ * register operands and the compiler cannot see the accesses otherwise.
+ */
+ static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+ {
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	unsigned long vl, d, nsize;
+	int z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0 + 1];	/* XOR parity */
+	q = dptr[z0 + 2];	/* RS syndrome */
+
+	/* Select 8-bit elements and read back the resulting vector length */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli %0, x0, e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : "=&r" (vl)
+	);
+
+	nsize = vl;
+
+	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+	for (d = 0; d < bytes; d += nsize * 1) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[wp0])\n"
+			      "vmv.v.v v1, v0\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&dptr[z0][d + 0 * nsize])
+			      : "memory"
+		);
+
+		for (z = z0 - 1 ; z >= 0 ; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v3, v3, v2\n"
+				      "vle8.v v2, (%[wd0])\n"
+				      "vxor.vv v1, v3, v2\n"
+				      "vxor.vv v0, v0, v2\n"
+				      ".option pop\n"
+				      : :
+				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
+				      [x1d]"r"(0x1d)
+				      : "memory"
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vse8.v v0, (%[wp0])\n"
+			      "vse8.v v1, (%[wq0])\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&p[d + nsize * 0]),
+			      [wq0]"r"(&q[d + nsize * 0])
+			      : "memory"
+		);
+	}
+ }
+
+ /*
+ * XOR the P/Q contribution of data disks @start..@stop into the existing
+ * P and Q blocks, one vector register's worth of bytes per iteration.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @start: lowest data disk whose data is read
+ * @stop: highest data disk whose data is read
+ * @bytes: number of bytes to process in each block
+ * @ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * Disks above @stop are skipped entirely; for disks below @start only the
+ * multiplication of Q by 2 is performed and no data is loaded.
+ *
+ * NOTE(review): there is no tail handling, so @bytes is presumably a
+ * multiple of the vector length returned by vsetvli - confirm with the
+ * callers. Vector state is presumably set up by the caller as well.
+ *
+ * Every asm statement that loads or stores through a pointer operand
+ * carries a "memory" clobber, because the buffers are only passed as
+ * register operands and the compiler cannot see the accesses otherwise.
+ */
+ static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop,
+					  unsigned long bytes, void **ptrs)
+ {
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	unsigned long vl, d, nsize;
+	int z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks - 2];	/* XOR parity */
+	q = dptr[disks - 1];	/* RS syndrome */
+
+	/* Select 8-bit elements and read back the resulting vector length */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli %0, x0, e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : "=&r" (vl)
+	);
+
+	nsize = vl;
+
+	/* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */
+	for (d = 0 ; d < bytes ; d += nsize * 1) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[wp0])\n"
+			      "vmv.v.v v1, v0\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&dptr[z0][d + 0 * nsize])
+			      : "memory"
+		);
+
+		/* P/Q data pages */
+		for (z = z0 - 1; z >= start; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v3, v3, v2\n"
+				      "vle8.v v2, (%[wd0])\n"
+				      "vxor.vv v1, v3, v2\n"
+				      "vxor.vv v0, v0, v2\n"
+				      ".option pop\n"
+				      : :
+				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
+				      [x1d]"r"(0x1d)
+				      : "memory"
+			);
+		}
+
+		/* P/Q left side optimization */
+		for (z = start - 1; z >= 0; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * wq$$ = w1$$ ^ w2$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v1, v3, v2\n"
+				      ".option pop\n"
+				      : :
+				      [x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 * v0:wp0, v1:wq0, v2:p0, v3:q0
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v2, (%[wp0])\n"
+			      "vle8.v v3, (%[wq0])\n"
+			      "vxor.vv v2, v2, v0\n"
+			      "vxor.vv v3, v3, v1\n"
+			      "vse8.v v2, (%[wp0])\n"
+			      "vse8.v v3, (%[wq0])\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&p[d + nsize * 0]),
+			      [wq0]"r"(&q[d + nsize * 0])
+			      : "memory"
+		);
+	}
+ }
+
+ /*
+ * Compute the P (XOR parity) and Q (RS syndrome) blocks from the data
+ * blocks, two vector registers' worth of bytes per loop iteration.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @bytes: number of bytes to process in each block
+ * @ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * NOTE(review): there is no tail handling, so @bytes is presumably a
+ * multiple of twice the vector length returned by vsetvli - confirm with
+ * the callers. Vector state is presumably set up by the caller as well.
+ *
+ * Every asm statement that loads or stores through a pointer operand
+ * carries a "memory" clobber, because the buffers are only passed as
+ * register operands and the compiler cannot see the accesses otherwise.
+ */
+ static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+ {
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	unsigned long vl, d, nsize;
+	int z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0 + 1];	/* XOR parity */
+	q = dptr[z0 + 2];	/* RS syndrome */
+
+	/* Select 8-bit elements and read back the resulting vector length */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli %0, x0, e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : "=&r" (vl)
+	);
+
+	nsize = vl;
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 */
+	for (d = 0; d < bytes; d += nsize * 2) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[wp0])\n"
+			      "vmv.v.v v1, v0\n"
+			      "vle8.v v4, (%[wp1])\n"
+			      "vmv.v.v v5, v4\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+			      [wp1]"r"(&dptr[z0][d + 1 * nsize])
+			      : "memory"
+		);
+
+		for (z = z0 - 1; z >= 0; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v3, v3, v2\n"
+				      "vle8.v v2, (%[wd0])\n"
+				      "vxor.vv v1, v3, v2\n"
+				      "vxor.vv v0, v0, v2\n"
+
+				      "vsra.vi v6, v5, 7\n"
+				      "vsll.vi v7, v5, 1\n"
+				      "vand.vx v6, v6, %[x1d]\n"
+				      "vxor.vv v7, v7, v6\n"
+				      "vle8.v v6, (%[wd1])\n"
+				      "vxor.vv v5, v7, v6\n"
+				      "vxor.vv v4, v4, v6\n"
+				      ".option pop\n"
+				      : :
+				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
+				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
+				      [x1d]"r"(0x1d)
+				      : "memory"
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vse8.v v0, (%[wp0])\n"
+			      "vse8.v v1, (%[wq0])\n"
+			      "vse8.v v4, (%[wp1])\n"
+			      "vse8.v v5, (%[wq1])\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&p[d + nsize * 0]),
+			      [wq0]"r"(&q[d + nsize * 0]),
+			      [wp1]"r"(&p[d + nsize * 1]),
+			      [wq1]"r"(&q[d + nsize * 1])
+			      : "memory"
+		);
+	}
+ }
+
+ /*
+ * XOR the P/Q contribution of data disks @start..@stop into the existing
+ * P and Q blocks, two vector registers' worth of bytes per iteration.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @start: lowest data disk whose data is read
+ * @stop: highest data disk whose data is read
+ * @bytes: number of bytes to process in each block
+ * @ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * Disks above @stop are skipped entirely; for disks below @start only the
+ * multiplication of Q by 2 is performed and no data is loaded.
+ *
+ * NOTE(review): there is no tail handling, so @bytes is presumably a
+ * multiple of twice the vector length returned by vsetvli - confirm with
+ * the callers. Vector state is presumably set up by the caller as well.
+ *
+ * Every asm statement that loads or stores through a pointer operand
+ * carries a "memory" clobber, because the buffers are only passed as
+ * register operands and the compiler cannot see the accesses otherwise.
+ */
+ static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop,
+					  unsigned long bytes, void **ptrs)
+ {
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	unsigned long vl, d, nsize;
+	int z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks - 2];	/* XOR parity */
+	q = dptr[disks - 1];	/* RS syndrome */
+
+	/* Select 8-bit elements and read back the resulting vector length */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli %0, x0, e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : "=&r" (vl)
+	);
+
+	nsize = vl;
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 */
+	for (d = 0; d < bytes; d += nsize * 2) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[wp0])\n"
+			      "vmv.v.v v1, v0\n"
+			      "vle8.v v4, (%[wp1])\n"
+			      "vmv.v.v v5, v4\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+			      [wp1]"r"(&dptr[z0][d + 1 * nsize])
+			      : "memory"
+		);
+
+		/* P/Q data pages */
+		for (z = z0 - 1; z >= start; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v3, v3, v2\n"
+				      "vle8.v v2, (%[wd0])\n"
+				      "vxor.vv v1, v3, v2\n"
+				      "vxor.vv v0, v0, v2\n"
+
+				      "vsra.vi v6, v5, 7\n"
+				      "vsll.vi v7, v5, 1\n"
+				      "vand.vx v6, v6, %[x1d]\n"
+				      "vxor.vv v7, v7, v6\n"
+				      "vle8.v v6, (%[wd1])\n"
+				      "vxor.vv v5, v7, v6\n"
+				      "vxor.vv v4, v4, v6\n"
+				      ".option pop\n"
+				      : :
+				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
+				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
+				      [x1d]"r"(0x1d)
+				      : "memory"
+			);
+		}
+
+		/* P/Q left side optimization */
+		for (z = start - 1; z >= 0; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * wq$$ = w1$$ ^ w2$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v1, v3, v2\n"
+
+				      "vsra.vi v6, v5, 7\n"
+				      "vsll.vi v7, v5, 1\n"
+				      "vand.vx v6, v6, %[x1d]\n"
+				      "vxor.vv v5, v7, v6\n"
+				      ".option pop\n"
+				      : :
+				      [x1d]"r"(0x1d)
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+		 * v0:wp0, v1:wq0, v2:p0, v3:q0
+		 * v4:wp1, v5:wq1, v6:p1, v7:q1
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v2, (%[wp0])\n"
+			      "vle8.v v3, (%[wq0])\n"
+			      "vxor.vv v2, v2, v0\n"
+			      "vxor.vv v3, v3, v1\n"
+			      "vse8.v v2, (%[wp0])\n"
+			      "vse8.v v3, (%[wq0])\n"
+
+			      "vle8.v v6, (%[wp1])\n"
+			      "vle8.v v7, (%[wq1])\n"
+			      "vxor.vv v6, v6, v4\n"
+			      "vxor.vv v7, v7, v5\n"
+			      "vse8.v v6, (%[wp1])\n"
+			      "vse8.v v7, (%[wq1])\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&p[d + nsize * 0]),
+			      [wq0]"r"(&q[d + nsize * 0]),
+			      [wp1]"r"(&p[d + nsize * 1]),
+			      [wq1]"r"(&q[d + nsize * 1])
+			      : "memory"
+		);
+	}
+ }
+
+ /*
+ * Compute the P (XOR parity) and Q (RS syndrome) blocks from the data
+ * blocks, four vector registers' worth of bytes per loop iteration.
+ *
+ * @disks: number of entries in @ptrs; the last two are the P and Q blocks
+ * @bytes: number of bytes to process in each block
+ * @ptrs: data block pointers followed by the P and Q block pointers
+ *
+ * NOTE(review): there is no tail handling, so @bytes is presumably a
+ * multiple of four times the vector length returned by vsetvli - confirm
+ * with the callers. Vector state is presumably set up by the caller too.
+ *
+ * Every asm statement that loads or stores through a pointer operand
+ * carries a "memory" clobber, because the buffers are only passed as
+ * register operands and the compiler cannot see the accesses otherwise.
+ */
+ static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+ {
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	unsigned long vl, d, nsize;
+	int z, z0;
+
+	z0 = disks - 3;		/* Highest data disk */
+	p = dptr[z0 + 1];	/* XOR parity */
+	q = dptr[z0 + 2];	/* RS syndrome */
+
+	/* Select 8-bit elements and read back the resulting vector length */
+	asm volatile (".option push\n"
+		      ".option arch,+v\n"
+		      "vsetvli %0, x0, e8, m1, ta, ma\n"
+		      ".option pop\n"
+		      : "=&r" (vl)
+	);
+
+	nsize = vl;
+
+	/*
+	 * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+	 * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+	 * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+	 * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+	 */
+	for (d = 0; d < bytes; d += nsize * 4) {
+		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vle8.v v0, (%[wp0])\n"
+			      "vmv.v.v v1, v0\n"
+			      "vle8.v v4, (%[wp1])\n"
+			      "vmv.v.v v5, v4\n"
+			      "vle8.v v8, (%[wp2])\n"
+			      "vmv.v.v v9, v8\n"
+			      "vle8.v v12, (%[wp3])\n"
+			      "vmv.v.v v13, v12\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+			      [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+			      [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+			      [wp3]"r"(&dptr[z0][d + 3 * nsize])
+			      : "memory"
+		);
+
+		for (z = z0 - 1; z >= 0; z--) {
+			/*
+			 * w2$$ = MASK(wq$$);
+			 * w1$$ = SHLBYTE(wq$$);
+			 * w2$$ &= NBYTES(0x1d);
+			 * w1$$ ^= w2$$;
+			 * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+			 * wq$$ = w1$$ ^ wd$$;
+			 * wp$$ ^= wd$$;
+			 */
+			asm volatile (".option push\n"
+				      ".option arch,+v\n"
+				      "vsra.vi v2, v1, 7\n"
+				      "vsll.vi v3, v1, 1\n"
+				      "vand.vx v2, v2, %[x1d]\n"
+				      "vxor.vv v3, v3, v2\n"
+				      "vle8.v v2, (%[wd0])\n"
+				      "vxor.vv v1, v3, v2\n"
+				      "vxor.vv v0, v0, v2\n"
+
+				      "vsra.vi v6, v5, 7\n"
+				      "vsll.vi v7, v5, 1\n"
+				      "vand.vx v6, v6, %[x1d]\n"
+				      "vxor.vv v7, v7, v6\n"
+				      "vle8.v v6, (%[wd1])\n"
+				      "vxor.vv v5, v7, v6\n"
+				      "vxor.vv v4, v4, v6\n"
+
+				      "vsra.vi v10, v9, 7\n"
+				      "vsll.vi v11, v9, 1\n"
+				      "vand.vx v10, v10, %[x1d]\n"
+				      "vxor.vv v11, v11, v10\n"
+				      "vle8.v v10, (%[wd2])\n"
+				      "vxor.vv v9, v11, v10\n"
+				      "vxor.vv v8, v8, v10\n"
+
+				      "vsra.vi v14, v13, 7\n"
+				      "vsll.vi v15, v13, 1\n"
+				      "vand.vx v14, v14, %[x1d]\n"
+				      "vxor.vv v15, v15, v14\n"
+				      "vle8.v v14, (%[wd3])\n"
+				      "vxor.vv v13, v15, v14\n"
+				      "vxor.vv v12, v12, v14\n"
+				      ".option pop\n"
+				      : :
+				      [wd0]"r"(&dptr[z][d + 0 * nsize]),
+				      [wd1]"r"(&dptr[z][d + 1 * nsize]),
+				      [wd2]"r"(&dptr[z][d + 2 * nsize]),
+				      [wd3]"r"(&dptr[z][d + 3 * nsize]),
+				      [x1d]"r"(0x1d)
+				      : "memory"
+			);
+		}
+
+		/*
+		 * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+		 * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+		 */
+		asm volatile (".option push\n"
+			      ".option arch,+v\n"
+			      "vse8.v v0, (%[wp0])\n"
+			      "vse8.v v1, (%[wq0])\n"
+			      "vse8.v v4, (%[wp1])\n"
+			      "vse8.v v5, (%[wq1])\n"
+			      "vse8.v v8, (%[wp2])\n"
+			      "vse8.v v9, (%[wq2])\n"
+			      "vse8.v v12, (%[wp3])\n"
+			      "vse8.v v13, (%[wq3])\n"
+			      ".option pop\n"
+			      : :
+			      [wp0]"r"(&p[d + nsize * 0]),
+			      [wq0]"r"(&q[d + nsize * 0]),
+			      [wp1]"r"(&p[d + nsize * 1]),
+			      [wq1]"r"(&q[d + nsize * 1]),
+			      [wp2]"r"(&p[d + nsize * 2]),
+			      [wq2]"r"(&q[d + nsize * 2]),
+			      [wp3]"r"(&p[d + nsize * 3]),
+			      [wq3]"r"(&q[d + nsize * 3])
+			      : "memory"
+		);
+	}
+ }
+
+static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{ /* Accumulate P/Q over data disks start..stop and XOR the result into the existing P and Q blocks, four vector registers' worth of bytes per pass. */
+ u8 **dptr = (u8 **)ptrs; /* per-disk byte pointers */
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = stop; /* P/Q right side optimization: disks above 'stop' are never read */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n" /* 8-bit elements, LMUL=1; rs1 = x0 asks for the maximum length, returned in %0 */
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl; /* byte stride between the four chunks handled per pass */
+
+ /* Register use (values stay live in these registers across the separate asm statements below; no clobbers are declared):
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ */
+ for (d = 0; d < bytes; d += nsize * 4) { /* NOTE(review): no tail handling - assumes bytes is a multiple of 4 * nsize; confirm with callers */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize])
+ );
+
+ /* P/Q data pages: disks stop-1 down to start are loaded and folded into both wp and wq */
+ for (z = z0 - 1; z >= start; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [x1d]"r"(0x1d) /* the NBYTES(0x1d) constant of the pseudo-code above */
+ );
+ }
+
+ /* P/Q left side optimization: nothing is loaded for disks below 'start', wq is only shifted and reduced once per disk */
+ for (z = start - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v9, v11, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v13, v15, v14\n"
+ ".option pop\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ * v8:wp2, v9:wq2, v10:p2, v11:q2
+ * v12:wp3, v13:wq3, v14:p3, v15:q3
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+
+ "vle8.v v10, (%[wp2])\n"
+ "vle8.v v11, (%[wq2])\n"
+ "vxor.vv v10, v10, v8\n"
+ "vxor.vv v11, v11, v9\n"
+ "vse8.v v10, (%[wp2])\n"
+ "vse8.v v11, (%[wq2])\n"
+
+ "vle8.v v14, (%[wp3])\n"
+ "vle8.v v15, (%[wq3])\n"
+ "vxor.vv v14, v14, v12\n"
+ "vxor.vv v15, v15, v13\n"
+ "vse8.v v14, (%[wp3])\n"
+ "vse8.v v15, (%[wq3])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3])
+ );
+ }
+}
+
+static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
+{ /* Compute P and Q over data disks 0..disks-3 and store them to dptr[disks-2] and dptr[disks-1], eight vector registers' worth of bytes per pass. */
+ u8 **dptr = (u8 **)ptrs; /* per-disk byte pointers */
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n" /* 8-bit elements, LMUL=1; rs1 = x0 asks for the maximum length, returned in %0 */
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl; /* byte stride between the eight chunks handled per pass */
+
+ /* Register use (all 32 vector registers; values stay live across the separate asm statements below, no clobbers are declared):
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
+ * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
+ * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
+ * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
+ */
+ for (d = 0; d < bytes; d += nsize * 8) { /* NOTE(review): no tail handling - assumes bytes is a multiple of 8 * nsize; confirm with callers */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ "vle8.v v16, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
+ "vle8.v v20, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
+ "vle8.v v24, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
+ "vle8.v v28, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize]),
+ [wp4]"r"(&dptr[z0][d + 4 * nsize]),
+ [wp5]"r"(&dptr[z0][d + 5 * nsize]),
+ [wp6]"r"(&dptr[z0][d + 6 * nsize]),
+ [wp7]"r"(&dptr[z0][d + 7 * nsize])
+ );
+
+ for (z = z0 - 1; z >= 0; z--) { /* remaining data disks, highest first */
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+
+ "vsra.vi v18, v17, 7\n"
+ "vsll.vi v19, v17, 1\n"
+ "vand.vx v18, v18, %[x1d]\n"
+ "vxor.vv v19, v19, v18\n"
+ "vle8.v v18, (%[wd4])\n"
+ "vxor.vv v17, v19, v18\n"
+ "vxor.vv v16, v16, v18\n"
+
+ "vsra.vi v22, v21, 7\n"
+ "vsll.vi v23, v21, 1\n"
+ "vand.vx v22, v22, %[x1d]\n"
+ "vxor.vv v23, v23, v22\n"
+ "vle8.v v22, (%[wd5])\n"
+ "vxor.vv v21, v23, v22\n"
+ "vxor.vv v20, v20, v22\n"
+
+ "vsra.vi v26, v25, 7\n"
+ "vsll.vi v27, v25, 1\n"
+ "vand.vx v26, v26, %[x1d]\n"
+ "vxor.vv v27, v27, v26\n"
+ "vle8.v v26, (%[wd6])\n"
+ "vxor.vv v25, v27, v26\n"
+ "vxor.vv v24, v24, v26\n"
+
+ "vsra.vi v30, v29, 7\n"
+ "vsll.vi v31, v29, 1\n"
+ "vand.vx v30, v30, %[x1d]\n"
+ "vxor.vv v31, v31, v30\n"
+ "vle8.v v30, (%[wd7])\n"
+ "vxor.vv v29, v31, v30\n"
+ "vxor.vv v28, v28, v30\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [wd4]"r"(&dptr[z][d + 4 * nsize]),
+ [wd5]"r"(&dptr[z][d + 5 * nsize]),
+ [wd6]"r"(&dptr[z][d + 6 * nsize]),
+ [wd7]"r"(&dptr[z][d + 7 * nsize]),
+ [x1d]"r"(0x1d) /* the NBYTES(0x1d) constant of the pseudo-code above */
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] = wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] = wq$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vse8.v v0, (%[wp0])\n"
+ "vse8.v v1, (%[wq0])\n"
+ "vse8.v v4, (%[wp1])\n"
+ "vse8.v v5, (%[wq1])\n"
+ "vse8.v v8, (%[wp2])\n"
+ "vse8.v v9, (%[wq2])\n"
+ "vse8.v v12, (%[wp3])\n"
+ "vse8.v v13, (%[wq3])\n"
+ "vse8.v v16, (%[wp4])\n"
+ "vse8.v v17, (%[wq4])\n"
+ "vse8.v v20, (%[wp5])\n"
+ "vse8.v v21, (%[wq5])\n"
+ "vse8.v v24, (%[wp6])\n"
+ "vse8.v v25, (%[wq6])\n"
+ "vse8.v v28, (%[wp7])\n"
+ "vse8.v v29, (%[wq7])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3]),
+ [wp4]"r"(&p[d + nsize * 4]),
+ [wq4]"r"(&q[d + nsize * 4]),
+ [wp5]"r"(&p[d + nsize * 5]),
+ [wq5]"r"(&q[d + nsize * 5]),
+ [wp6]"r"(&p[d + nsize * 6]),
+ [wq6]"r"(&q[d + nsize * 6]),
+ [wp7]"r"(&p[d + nsize * 7]),
+ [wq7]"r"(&q[d + nsize * 7])
+ );
+ }
+}
+
+static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop,
+ unsigned long bytes, void **ptrs)
+{ /* Accumulate P/Q over data disks start..stop and XOR the result into the existing P and Q blocks, eight vector registers' worth of bytes per pass. */
+ u8 **dptr = (u8 **)ptrs; /* per-disk byte pointers */
+ u8 *p, *q;
+ unsigned long vl, d, nsize;
+ int z, z0;
+
+ z0 = stop; /* P/Q right side optimization: disks above 'stop' are never read */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsetvli %0, x0, e8, m1, ta, ma\n" /* 8-bit elements, LMUL=1; rs1 = x0 asks for the maximum length, returned in %0 */
+ ".option pop\n"
+ : "=&r" (vl)
+ );
+
+ nsize = vl; /* byte stride between the eight chunks handled per pass */
+
+ /* Register use (all 32 vector registers; values stay live across the separate asm statements below, no clobbers are declared):
+ * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10
+ * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11
+ * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12
+ * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13
+ * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14
+ * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15
+ * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16
+ * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17
+ */
+ for (d = 0; d < bytes; d += nsize * 8) { /* NOTE(review): no tail handling - assumes bytes is a multiple of 8 * nsize; confirm with callers */
+ /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v0, (%[wp0])\n"
+ "vmv.v.v v1, v0\n"
+ "vle8.v v4, (%[wp1])\n"
+ "vmv.v.v v5, v4\n"
+ "vle8.v v8, (%[wp2])\n"
+ "vmv.v.v v9, v8\n"
+ "vle8.v v12, (%[wp3])\n"
+ "vmv.v.v v13, v12\n"
+ "vle8.v v16, (%[wp4])\n"
+ "vmv.v.v v17, v16\n"
+ "vle8.v v20, (%[wp5])\n"
+ "vmv.v.v v21, v20\n"
+ "vle8.v v24, (%[wp6])\n"
+ "vmv.v.v v25, v24\n"
+ "vle8.v v28, (%[wp7])\n"
+ "vmv.v.v v29, v28\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&dptr[z0][d + 0 * nsize]),
+ [wp1]"r"(&dptr[z0][d + 1 * nsize]),
+ [wp2]"r"(&dptr[z0][d + 2 * nsize]),
+ [wp3]"r"(&dptr[z0][d + 3 * nsize]),
+ [wp4]"r"(&dptr[z0][d + 4 * nsize]),
+ [wp5]"r"(&dptr[z0][d + 5 * nsize]),
+ [wp6]"r"(&dptr[z0][d + 6 * nsize]),
+ [wp7]"r"(&dptr[z0][d + 7 * nsize])
+ );
+
+ /* P/Q data pages: disks stop-1 down to start are loaded and folded into both wp and wq */
+ for (z = z0 - 1; z >= start; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * w1$$ ^= w2$$;
+ * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
+ * wq$$ = w1$$ ^ wd$$;
+ * wp$$ ^= wd$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v3, v3, v2\n"
+ "vle8.v v2, (%[wd0])\n"
+ "vxor.vv v1, v3, v2\n"
+ "vxor.vv v0, v0, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v7, v7, v6\n"
+ "vle8.v v6, (%[wd1])\n"
+ "vxor.vv v5, v7, v6\n"
+ "vxor.vv v4, v4, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v11, v11, v10\n"
+ "vle8.v v10, (%[wd2])\n"
+ "vxor.vv v9, v11, v10\n"
+ "vxor.vv v8, v8, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v15, v15, v14\n"
+ "vle8.v v14, (%[wd3])\n"
+ "vxor.vv v13, v15, v14\n"
+ "vxor.vv v12, v12, v14\n"
+
+ "vsra.vi v18, v17, 7\n"
+ "vsll.vi v19, v17, 1\n"
+ "vand.vx v18, v18, %[x1d]\n"
+ "vxor.vv v19, v19, v18\n"
+ "vle8.v v18, (%[wd4])\n"
+ "vxor.vv v17, v19, v18\n"
+ "vxor.vv v16, v16, v18\n"
+
+ "vsra.vi v22, v21, 7\n"
+ "vsll.vi v23, v21, 1\n"
+ "vand.vx v22, v22, %[x1d]\n"
+ "vxor.vv v23, v23, v22\n"
+ "vle8.v v22, (%[wd5])\n"
+ "vxor.vv v21, v23, v22\n"
+ "vxor.vv v20, v20, v22\n"
+
+ "vsra.vi v26, v25, 7\n"
+ "vsll.vi v27, v25, 1\n"
+ "vand.vx v26, v26, %[x1d]\n"
+ "vxor.vv v27, v27, v26\n"
+ "vle8.v v26, (%[wd6])\n"
+ "vxor.vv v25, v27, v26\n"
+ "vxor.vv v24, v24, v26\n"
+
+ "vsra.vi v30, v29, 7\n"
+ "vsll.vi v31, v29, 1\n"
+ "vand.vx v30, v30, %[x1d]\n"
+ "vxor.vv v31, v31, v30\n"
+ "vle8.v v30, (%[wd7])\n"
+ "vxor.vv v29, v31, v30\n"
+ "vxor.vv v28, v28, v30\n"
+ ".option pop\n"
+ : :
+ [wd0]"r"(&dptr[z][d + 0 * nsize]),
+ [wd1]"r"(&dptr[z][d + 1 * nsize]),
+ [wd2]"r"(&dptr[z][d + 2 * nsize]),
+ [wd3]"r"(&dptr[z][d + 3 * nsize]),
+ [wd4]"r"(&dptr[z][d + 4 * nsize]),
+ [wd5]"r"(&dptr[z][d + 5 * nsize]),
+ [wd6]"r"(&dptr[z][d + 6 * nsize]),
+ [wd7]"r"(&dptr[z][d + 7 * nsize]),
+ [x1d]"r"(0x1d) /* the NBYTES(0x1d) constant of the pseudo-code above */
+ );
+ }
+
+ /* P/Q left side optimization: nothing is loaded for disks below 'start', wq is only shifted and reduced once per disk */
+ for (z = start - 1; z >= 0; z--) {
+ /*
+ * w2$$ = MASK(wq$$);
+ * w1$$ = SHLBYTE(wq$$);
+ * w2$$ &= NBYTES(0x1d);
+ * wq$$ = w1$$ ^ w2$$;
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vsra.vi v2, v1, 7\n"
+ "vsll.vi v3, v1, 1\n"
+ "vand.vx v2, v2, %[x1d]\n"
+ "vxor.vv v1, v3, v2\n"
+
+ "vsra.vi v6, v5, 7\n"
+ "vsll.vi v7, v5, 1\n"
+ "vand.vx v6, v6, %[x1d]\n"
+ "vxor.vv v5, v7, v6\n"
+
+ "vsra.vi v10, v9, 7\n"
+ "vsll.vi v11, v9, 1\n"
+ "vand.vx v10, v10, %[x1d]\n"
+ "vxor.vv v9, v11, v10\n"
+
+ "vsra.vi v14, v13, 7\n"
+ "vsll.vi v15, v13, 1\n"
+ "vand.vx v14, v14, %[x1d]\n"
+ "vxor.vv v13, v15, v14\n"
+
+ "vsra.vi v18, v17, 7\n"
+ "vsll.vi v19, v17, 1\n"
+ "vand.vx v18, v18, %[x1d]\n"
+ "vxor.vv v17, v19, v18\n"
+
+ "vsra.vi v22, v21, 7\n"
+ "vsll.vi v23, v21, 1\n"
+ "vand.vx v22, v22, %[x1d]\n"
+ "vxor.vv v21, v23, v22\n"
+
+ "vsra.vi v26, v25, 7\n"
+ "vsll.vi v27, v25, 1\n"
+ "vand.vx v26, v26, %[x1d]\n"
+ "vxor.vv v25, v27, v26\n"
+
+ "vsra.vi v30, v29, 7\n"
+ "vsll.vi v31, v29, 1\n"
+ "vand.vx v30, v30, %[x1d]\n"
+ "vxor.vv v29, v31, v30\n"
+ ".option pop\n"
+ : :
+ [x1d]"r"(0x1d)
+ );
+ }
+
+ /*
+ * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
+ * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
+ * v0:wp0, v1:wq0, v2:p0, v3:q0
+ * v4:wp1, v5:wq1, v6:p1, v7:q1
+ * v8:wp2, v9:wq2, v10:p2, v11:q2
+ * v12:wp3, v13:wq3, v14:p3, v15:q3
+ * v16:wp4, v17:wq4, v18:p4, v19:q4
+ * v20:wp5, v21:wq5, v22:p5, v23:q5
+ * v24:wp6, v25:wq6, v26:p6, v27:q6
+ * v28:wp7, v29:wq7, v30:p7, v31:q7
+ */
+ asm volatile (".option push\n"
+ ".option arch,+v\n"
+ "vle8.v v2, (%[wp0])\n"
+ "vle8.v v3, (%[wq0])\n"
+ "vxor.vv v2, v2, v0\n"
+ "vxor.vv v3, v3, v1\n"
+ "vse8.v v2, (%[wp0])\n"
+ "vse8.v v3, (%[wq0])\n"
+
+ "vle8.v v6, (%[wp1])\n"
+ "vle8.v v7, (%[wq1])\n"
+ "vxor.vv v6, v6, v4\n"
+ "vxor.vv v7, v7, v5\n"
+ "vse8.v v6, (%[wp1])\n"
+ "vse8.v v7, (%[wq1])\n"
+
+ "vle8.v v10, (%[wp2])\n"
+ "vle8.v v11, (%[wq2])\n"
+ "vxor.vv v10, v10, v8\n"
+ "vxor.vv v11, v11, v9\n"
+ "vse8.v v10, (%[wp2])\n"
+ "vse8.v v11, (%[wq2])\n"
+
+ "vle8.v v14, (%[wp3])\n"
+ "vle8.v v15, (%[wq3])\n"
+ "vxor.vv v14, v14, v12\n"
+ "vxor.vv v15, v15, v13\n"
+ "vse8.v v14, (%[wp3])\n"
+ "vse8.v v15, (%[wq3])\n"
+
+ "vle8.v v18, (%[wp4])\n"
+ "vle8.v v19, (%[wq4])\n"
+ "vxor.vv v18, v18, v16\n"
+ "vxor.vv v19, v19, v17\n"
+ "vse8.v v18, (%[wp4])\n"
+ "vse8.v v19, (%[wq4])\n"
+
+ "vle8.v v22, (%[wp5])\n"
+ "vle8.v v23, (%[wq5])\n"
+ "vxor.vv v22, v22, v20\n"
+ "vxor.vv v23, v23, v21\n"
+ "vse8.v v22, (%[wp5])\n"
+ "vse8.v v23, (%[wq5])\n"
+
+ "vle8.v v26, (%[wp6])\n"
+ "vle8.v v27, (%[wq6])\n"
+ "vxor.vv v26, v26, v24\n"
+ "vxor.vv v27, v27, v25\n"
+ "vse8.v v26, (%[wp6])\n"
+ "vse8.v v27, (%[wq6])\n"
+
+ "vle8.v v30, (%[wp7])\n"
+ "vle8.v v31, (%[wq7])\n"
+ "vxor.vv v30, v30, v28\n"
+ "vxor.vv v31, v31, v29\n"
+ "vse8.v v30, (%[wp7])\n"
+ "vse8.v v31, (%[wq7])\n"
+ ".option pop\n"
+ : :
+ [wp0]"r"(&p[d + nsize * 0]),
+ [wq0]"r"(&q[d + nsize * 0]),
+ [wp1]"r"(&p[d + nsize * 1]),
+ [wq1]"r"(&q[d + nsize * 1]),
+ [wp2]"r"(&p[d + nsize * 2]),
+ [wq2]"r"(&q[d + nsize * 2]),
+ [wp3]"r"(&p[d + nsize * 3]),
+ [wq3]"r"(&q[d + nsize * 3]),
+ [wp4]"r"(&p[d + nsize * 4]),
+ [wq4]"r"(&q[d + nsize * 4]),
+ [wp5]"r"(&p[d + nsize * 5]),
+ [wq5]"r"(&q[d + nsize * 5]),
+ [wp6]"r"(&p[d + nsize * 6]),
+ [wq6]"r"(&q[d + nsize * 6]),
+ [wp7]"r"(&p[d + nsize * 7]),
+ [wq7]"r"(&q[d + nsize * 7])
+ );
+ }
+}
+
+RAID6_RVV_WRAPPER(1); /* each line expands to gen/xor wrapper functions plus a raid6_rvvx<n> table (macro in rvv.h) */
+RAID6_RVV_WRAPPER(2);
+RAID6_RVV_WRAPPER(4); /* wraps raid6_rvv4_*_syndrome_real() */
+RAID6_RVV_WRAPPER(8); /* wraps raid6_rvv8_*_syndrome_real() */
diff --git a/lib/raid/raid6/riscv/rvv.h b/lib/raid/raid6/riscv/rvv.h
new file mode 100644
index 000000000000..b0a71b375962
--- /dev/null
+++ b/lib/raid/raid6/riscv/rvv.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2024 Institute of Software, CAS.
+ *
+ * raid6/rvv.h
+ *
+ * Definitions for RISC-V RAID-6 code
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/vector.h>
+
+static int rvv_has_vector(void)
+{ /* Thin forwarder to has_vector(); placed third in every table built by RAID6_RVV_WRAPPER (presumably the ->valid hook - struct layout not visible here). */
+ return has_vector();
+}
+
+#define RAID6_RVV_WRAPPER(_n) /* emits raid6_rvv<_n>_{gen,xor}_syndrome() and the raid6_rvvx<_n> table */ \
+ static void raid6_rvv ## _n ## _gen_syndrome(int disks, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_rvv ## _n ## _gen_syndrome_real(int d, \
+ unsigned long b, void **p); /* block-scope prototype of the worker */ \
+ kernel_vector_begin(); /* the worker runs between begin/end */ \
+ raid6_rvv ## _n ## _gen_syndrome_real(disks, \
+ (unsigned long)bytes, ptrs); \
+ kernel_vector_end(); \
+ } \
+ static void raid6_rvv ## _n ## _xor_syndrome(int disks, \
+ int start, int stop, \
+ size_t bytes, void **ptrs) \
+ { \
+ void raid6_rvv ## _n ## _xor_syndrome_real(int d, \
+ int s1, int s2, \
+ unsigned long b, void **p); /* block-scope prototype of the worker */ \
+ kernel_vector_begin(); /* the worker runs between begin/end */ \
+ raid6_rvv ## _n ## _xor_syndrome_real(disks, \
+ start, stop, (unsigned long)bytes, ptrs); \
+ kernel_vector_end(); \
+ } \
+ struct raid6_calls const raid6_rvvx ## _n = { /* positional initializer; no trailing ';' - the invocation supplies it */ \
+ raid6_rvv ## _n ## _gen_syndrome, \
+ raid6_rvv ## _n ## _xor_syndrome, \
+ rvv_has_vector, \
+ "rvvx" #_n, /* e.g. "rvvx4" */ \
+ 0 /* fifth member; presumably a priority - TODO confirm against struct raid6_calls */ \
+ }
diff --git a/lib/raid/raid6/s390/recov_s390xc.c b/lib/raid/raid6/s390/recov_s390xc.c
new file mode 100644
index 000000000000..487018f81192
--- /dev/null
+++ b/lib/raid/raid6/s390/recov_s390xc.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RAID-6 data recovery in dual failure mode based on the XC instruction.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/raid/pq.h>
+
+static inline void xor_block(u8 *p1, u8 *p2)
+{ /* XOR the 256 bytes at p2 into the 256 bytes at p1 with a single XC instruction. */
+ typedef struct { u8 _[256]; } addrtype; /* 256-byte type so the "m" operands below describe the whole block */
+
+ asm volatile(
+ " xc 0(256,%[p1]),0(%[p2])\n"
+ : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2), /* p1 block is read and written, p2 block is read */
+ [p1] "a" (p1), [p2] "a" (p2) : "cc"); /* condition code is declared clobbered */
+}
+
+/* Recover two failed data blocks (faila and failb); results are written through ptrs[faila] and ptrs[failb]. */
+static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ int i;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Compute the syndrome with the zero page in place of the missing
+ data pages, using the dead data pages as temporary storage for
+ delta p and delta q (gen_syndrome writes via ptrs[disks-2/-1]). */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; /* NOTE(review): index failb-faila presumably requires faila < failb - confirm with callers */
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+ /* Now do it... 256 bytes per pass. NOTE(review): bytes is a size_t, so a length that is not a multiple of 256 would wrap instead of ending the loop - confirm callers. */
+ while (bytes) {
+ xor_block(dp, p); /* dp ^= p */
+ xor_block(dq, q); /* dq ^= q */
+ for (i = 0; i < 256; i++)
+ dq[i] = pbmul[dp[i]] ^ qmul[dq[i]]; /* table lookups combine both deltas into dq */
+ xor_block(dp, dq); /* dp ^= the freshly computed dq */
+ p += 256;
+ q += 256;
+ dp += 256;
+ dq += 256;
+ bytes -= 256;
+ }
+}
+
+/* Recover failure of one data block plus the P block; writes through ptrs[faila] and the P block at ptrs[disks-2]. */
+static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ int i;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Compute the syndrome with the zero page in place of the missing data
+ page, using the dead data page as temporary storage for delta q. */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-1] = dq; /* gen_syndrome writes Q here; P is written to the real P block */
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ /* Now do it... 256 bytes per pass. NOTE(review): bytes is a size_t, so a length that is not a multiple of 256 would wrap instead of ending the loop - confirm callers. */
+ while (bytes) {
+ xor_block(dq, q); /* dq ^= q */
+ for (i = 0; i < 256; i++)
+ dq[i] = qmul[dq[i]]; /* in-place table lookup */
+ xor_block(p, dq); /* p ^= dq */
+ p += 256;
+ q += 256;
+ dq += 256;
+ bytes -= 256;
+ }
+}
+
+
+const struct raid6_recov_calls raid6_recov_s390xc = { /* recovery method table for the XC-based routines in this file */
+ .data2 = raid6_2data_recov_s390xc,
+ .datap = raid6_datap_recov_s390xc,
+ .valid = NULL, /* no availability check is supplied for this method */
+ .name = "s390xc",
+ .priority = 1,
+};
diff --git a/lib/raid/raid6/s390/s390vx.uc b/lib/raid/raid6/s390/s390vx.uc
new file mode 100644
index 000000000000..8aa53eb2f395
--- /dev/null
+++ b/lib/raid/raid6/s390/s390vx.uc
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * raid6_vx$#.c
+ *
+ * $#-way unrolled RAID6 gen/xor functions for s390
+ * based on the vector facility
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ *
+ * This file is postprocessed using unroll.awk.
+ */
+
+#include <linux/cpufeature.h>
+#include <linux/raid/pq.h>
+#include <asm/fpu.h>
+
+#define NSIZE 16
+
+static __always_inline void LOAD_CONST(void)
+{ /* Set up the two constant vector registers used by MASK() (24) and by the AND(..., 25) steps. */
+ fpu_vrepib(24, 0x07); /* register 24 <- 0x07 (presumably replicated into every byte lane) */
+ fpu_vrepib(25, 0x1d); /* register 25 <- 0x1d (presumably replicated into every byte lane) */
+}
+
+/*
+ * The SHLBYTE() operation shifts each of the 16 bytes in
+ * vector register y left by 1 bit and stores the result in
+ * vector register x.
+ */
+#define SHLBYTE(x, y) fpu_vab(x, y, y) /* implemented as y + y */
+
+/*
+ * For each of the 16 bytes in the vector register y the MASK()
+ * operation returns 0xFF if the high bit of the byte is 1,
+ * or 0x00 if the high bit is 0. The result is stored in vector
+ * register x.
+ */
+#define MASK(x, y) fpu_vesravb(x, y, 24) /* third operand is register 24, set to 0x07 by LOAD_CONST() */
+
+#define AND(x, y, z) fpu_vn(x, y, z) /* presumably x = y & z */
+#define XOR(x, y, z) fpu_vx(x, y, z) /* presumably x = y ^ z */
+#define LOAD_DATA(x, ptr) fpu_vlm(x, x + $# - 1, ptr) /* register range starting at x, one register per unrolled copy */
+#define STORE_DATA(x, ptr) fpu_vstm(x, x + $# - 1, ptr) /* same register range, written back to ptr */
+#define COPY_VEC(x, y) fpu_vlr(x, y) /* presumably x = y */
+
+static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Compute P and Q over data disks 0..disks-3 and store them to dptr[disks-2] and dptr[disks-1]. */
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+ u8 **dptr, *p, *q;
+ int d, z, z0;
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ LOAD_CONST(); /* constants in registers 24 and 25 */
+
+ dptr = (u8 **) ptrs;
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0 + 1]; /* XOR parity */
+ q = dptr[z0 + 2]; /* RS syndrome */
+
+ for (d = 0; d < bytes; d += $#*NSIZE) { /* NOTE(review): int d is compared against size_t bytes */
+ LOAD_DATA(0,&dptr[z0][d]); /* registers from 0: P accumulators */
+ COPY_VEC(8+$$,0+$$); /* registers from 8: Q accumulators */
+ for (z = z0 - 1; z >= 0; z--) {
+ MASK(16+$$,8+$$); /* registers from 16: scratch */
+ AND(16+$$,16+$$,25);
+ SHLBYTE(8+$$,8+$$);
+ XOR(8+$$,8+$$,16+$$);
+ LOAD_DATA(16,&dptr[z][d]); /* scratch registers now hold the data of disk z */
+ XOR(0+$$,0+$$,16+$$);
+ XOR(8+$$,8+$$,16+$$);
+ }
+ STORE_DATA(0,&p[d]);
+ STORE_DATA(8,&q[d]);
+ }
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+
+static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* Accumulate P/Q over data disks start..stop and XOR the result into the existing P and Q blocks. */
+ DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+ u8 **dptr, *p, *q;
+ int d, z, z0;
+
+ dptr = (u8 **) ptrs;
+ z0 = stop; /* P/Q right side optimization: disks above 'stop' are never read */
+ p = dptr[disks - 2]; /* XOR parity */
+ q = dptr[disks - 1]; /* RS syndrome */
+
+ kernel_fpu_begin(&vxstate, KERNEL_VXR);
+ LOAD_CONST(); /* constants in registers 24 and 25 */
+
+ for (d = 0; d < bytes; d += $#*NSIZE) { /* NOTE(review): int d is compared against size_t bytes */
+ /* P/Q data pages */
+ LOAD_DATA(0,&dptr[z0][d]); /* registers from 0: P accumulators */
+ COPY_VEC(8+$$,0+$$); /* registers from 8: Q accumulators */
+ for (z = z0 - 1; z >= start; z--) {
+ MASK(16+$$,8+$$); /* registers from 16: scratch */
+ AND(16+$$,16+$$,25);
+ SHLBYTE(8+$$,8+$$);
+ XOR(8+$$,8+$$,16+$$);
+ LOAD_DATA(16,&dptr[z][d]); /* scratch registers now hold the data of disk z */
+ XOR(0+$$,0+$$,16+$$);
+ XOR(8+$$,8+$$,16+$$);
+ }
+ /* P/Q left side optimization: nothing is loaded for disks below 'start' */
+ for (z = start - 1; z >= 0; z--) {
+ MASK(16+$$,8+$$);
+ AND(16+$$,16+$$,25);
+ SHLBYTE(8+$$,8+$$);
+ XOR(8+$$,8+$$,16+$$);
+ }
+ LOAD_DATA(16,&p[d]); /* read-modify-write of the existing P */
+ XOR(16+$$,16+$$,0+$$);
+ STORE_DATA(16,&p[d]);
+ LOAD_DATA(16,&q[d]); /* read-modify-write of the existing Q */
+ XOR(16+$$,16+$$,8+$$);
+ STORE_DATA(16,&q[d]);
+ }
+ kernel_fpu_end(&vxstate, KERNEL_VXR);
+}
+
+static int raid6_s390vx$#_valid(void)
+{ /* Availability check for this implementation: forwards cpu_has_vx(). */
+ return cpu_has_vx();
+}
+
+const struct raid6_calls raid6_s390vx$# = { /* positional initializer; one table per unroll factor */
+ raid6_s390vx$#_gen_syndrome,
+ raid6_s390vx$#_xor_syndrome,
+ raid6_s390vx$#_valid,
+ "vx128x$#", /* name; the unroll factor is substituted by unroll.awk */
+ 1 /* fifth member; presumably a priority - TODO confirm against struct raid6_calls */
+};
diff --git a/lib/raid/raid6/tests/Makefile b/lib/raid/raid6/tests/Makefile
new file mode 100644
index 000000000000..87a001b22847
--- /dev/null
+++ b/lib/raid/raid6/tests/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_RAID6_PQ_KUNIT_TEST) += raid6_kunit.o
diff --git a/lib/raid/raid6/tests/raid6_kunit.c b/lib/raid/raid6/tests/raid6_kunit.c
new file mode 100644
index 000000000000..9db287b4a48f
--- /dev/null
+++ b/lib/raid/raid6/tests/raid6_kunit.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
+ *
+ * Test RAID-6 recovery algorithms.
+ */
+
+#include <kunit/test.h>
+#include <linux/prandom.h>
+#include <linux/raid/pq.h>
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+#define RAID6_KUNIT_SEED 42 /* fixed seed passed to prandom_seed_state() in raid6_suite_init() */
+
+#define NDISKS 16 /* Including P and Q */
+
+static struct rnd_state rng; /* generator state used by makedata() */
+static void *dataptrs[NDISKS]; /* pointer table handed to the algorithms; entries are temporarily redirected by test_disks() */
+static char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); /* reference contents of every member; P at NDISKS-2, Q at NDISKS-1 */
+static char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); /* recovery target for faila */
+static char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); /* recovery target for failb */
+
+static void makedata(int start, int stop)
+{ /* Refill data[start..stop] (inclusive) with pseudo-random bytes and point dataptrs[] at those pages. */
+ int i;
+
+ for (i = start; i <= stop; i++) {
+ prandom_bytes_state(&rng, data[i], PAGE_SIZE);
+ dataptrs[i] = data[i];
+ }
+}
+
+static char member_type(int d)
+{ /* Map a member index to the letter used in failure messages: 'P', 'Q' or 'D' (data). */
+ switch (d) {
+ case NDISKS-2:
+ return 'P';
+ case NDISKS-1:
+ return 'Q';
+ default:
+ return 'D';
+ }
+}
+
+static void test_disks(struct kunit *test, const struct raid6_calls *calls,
+ const struct raid6_recov_calls *ra, int faila, int failb)
+{ /* Redirect members faila/failb to scratch pages, run the matching recovery path and compare the result with data[]. */
+ memset(recovi, 0xf0, PAGE_SIZE); /* fixed fill patterns before each recovery */
+ memset(recovj, 0xba, PAGE_SIZE);
+
+ dataptrs[faila] = recovi;
+ dataptrs[failb] = recovj;
+
+ if (failb == NDISKS - 1) {
+ /*
+ * We don't implement the data+Q failure scenario, since it
+ * is equivalent to a RAID-5 failure (XOR, then recompute Q).
+ */
+ if (faila != NDISKS - 2)
+ goto skip; /* no comparison is made for this combination */
+
+ /* P+Q failure. Just rebuild the syndrome. */
+ calls->gen_syndrome(NDISKS, PAGE_SIZE, dataptrs);
+ } else if (failb == NDISKS - 2) {
+ /* data+P failure. */
+ ra->datap(NDISKS, PAGE_SIZE, faila, dataptrs);
+ } else {
+ /* data+data failure. */
+ ra->data2(NDISKS, PAGE_SIZE, faila, failb, dataptrs);
+ }
+
+ KUNIT_EXPECT_MEMEQ_MSG(test, data[faila], recovi, PAGE_SIZE,
+ "algo=%-8s/%-8s faila miscompared: %3d[%c] (failb=%3d[%c])\n",
+ calls->name, ra->name,
+ faila, member_type(faila),
+ failb, member_type(failb));
+ KUNIT_EXPECT_MEMEQ_MSG(test, data[failb], recovj, PAGE_SIZE,
+ "algo=%-8s/%-8s failb miscompared: %3d[%c] (faila=%3d[%c])\n",
+ calls->name, ra->name,
+ failb, member_type(failb),
+ faila, member_type(faila));
+
+skip:
+ dataptrs[faila] = data[faila]; /* undo the redirection on every path */
+ dataptrs[failb] = data[failb];
+}
+
+static void raid6_test(struct kunit *test)
+{ /* For every usable recovery method x syndrome algorithm pair, check all two-member failures, then repeat after each simulated rmw update. */
+ const struct raid6_calls *const *algo;
+ const struct raid6_recov_calls *const *ra;
+ int i, j, p1, p2;
+
+ for (ra = raid6_recov_algos; *ra; ra++) { /* NULL-terminated list */
+ if ((*ra)->valid && !(*ra)->valid())
+ continue; /* a NULL ->valid counts as usable */
+
+ for (algo = raid6_algos; *algo; algo++) { /* NULL-terminated list */
+ const struct raid6_calls *calls = *algo;
+
+ if (calls->valid && !calls->valid())
+ continue;
+
+ /* Nuke syndromes */
+ memset(data[NDISKS - 2], 0xee, PAGE_SIZE);
+ memset(data[NDISKS - 1], 0xee, PAGE_SIZE);
+
+ /* Generate assumed good syndrome */
+ calls->gen_syndrome(NDISKS, PAGE_SIZE,
+ (void **)&dataptrs);
+
+ for (i = 0; i < NDISKS-1; i++) /* every pair i < j of members */
+ for (j = i+1; j < NDISKS; j++)
+ test_disks(test, calls, *ra, i, j);
+
+ if (!calls->xor_syndrome)
+ continue; /* rmw checks only when xor_syndrome is provided */
+
+ for (p1 = 0; p1 < NDISKS-2; p1++) /* every data range p1..p2 */
+ for (p2 = p1; p2 < NDISKS-2; p2++) {
+
+ /* Simulate rmw run: xor with the old data, replace it, xor with the new data */
+ calls->xor_syndrome(NDISKS, p1, p2, PAGE_SIZE,
+ (void **)&dataptrs);
+ makedata(p1, p2);
+ calls->xor_syndrome(NDISKS, p1, p2, PAGE_SIZE,
+ (void **)&dataptrs);
+
+ for (i = 0; i < NDISKS-1; i++)
+ for (j = i+1; j < NDISKS; j++)
+ test_disks(test, calls,
+ *ra, i, j);
+ }
+
+ }
+ }
+}
+
+static struct kunit_case raid6_test_cases[] = { /* the suite has a single case */
+ KUNIT_CASE(raid6_test),
+ {}, /* empty terminating entry */
+};
+
+static int raid6_suite_init(struct kunit_suite *suite)
+{ /* Seed the generator with the fixed seed and fill all NDISKS pages (P and Q slots included) with data. Always returns 0. */
+ prandom_seed_state(&rng, RAID6_KUNIT_SEED);
+ makedata(0, NDISKS - 1);
+ return 0;
+}
+
+static struct kunit_suite raid6_test_suite = { /* suite descriptor registered below */
+ .name = "raid6",
+ .test_cases = raid6_test_cases,
+ .suite_init = raid6_suite_init, /* seeds the generator and fills data[] */
+};
+kunit_test_suite(raid6_test_suite);
+
+MODULE_DESCRIPTION("Unit test for the RAID P/Q library functions");
+MODULE_LICENSE("GPL");
diff --git a/lib/raid/raid6/unroll.awk b/lib/raid/raid6/unroll.awk
new file mode 100644
index 000000000000..0809805a7e23
--- /dev/null
+++ b/lib/raid/raid6/unroll.awk
@@ -0,0 +1,20 @@
+
+# This filter requires one command line option of form -vN=n
+# where n must be a decimal number.
+#
+# Repeat each input line containing $$ n times, replacing $$ with 0...n-1.
+# Replace each $# with n, and each $* with a single $.
+
+BEGIN {
+ n = N + 0 # force a number; an unset or non-numeric N gives 0
+}
+{
+ if (/\$\$/) { rep = n } else { rep = 1 } # lines with $$ are emitted n times (not at all when n is 0)
+ for (i = 0; i < rep; ++i) {
+ tmp = $0
+ gsub(/\$\$/, i, tmp) # $$ -> copy index i
+ gsub(/\$#/, n, tmp) # $# -> n
+ gsub(/\$\*/, "$", tmp) # $* -> a single $
+ print tmp
+ }
+}
diff --git a/lib/raid/raid6/x86/avx2.c b/lib/raid/raid6/x86/avx2.c
new file mode 100644
index 000000000000..a1a5213918af
--- /dev/null
+++ b/lib/raid/raid6/x86/avx2.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
+ *
+ * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * AVX2 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+static const struct raid6_avx2_constants {
+ u64 x1d[4]; /* 4 x 64 bits = 256 bits, the width of one ymm register */
+} raid6_avx2_constants __aligned(32) = { /* 32-byte aligned; the users in this file load it with vmovdqa */
+ { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, /* 0x1d replicated into every byte */
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx2(void)
+{
+	/* Both the AVX2 and the base AVX feature bits must be set. */
+	if (!boot_cpu_has(X86_FEATURE_AVX2))
+		return 0;
+
+	return boot_cpu_has(X86_FEATURE_AVX) ? 1 : 0;
+}
+
+/*
+ * Plain AVX2 implementation
+ */
+static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Build P and Q from data disks 0..disks-3, one 32-byte ymm vector per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); /* ymm0 = 0x1d in every byte */
+ asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */
+
+ for (d = 0; d < bytes; d += 32) { /* NOTE(review): presumably bytes is a multiple of 32 -- confirm with callers */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); /* dptr[z0-1] is read unconditionally, so two data disks are needed */
+ asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
+ asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); /* ymm6 = data of the next lower disk */
+ for (z = z0-2; z >= 0; z--) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); /* ymm5 = 0xff where a Q byte has its top bit set */
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); /* Q += Q, i.e. every byte doubled */
+ asm volatile("vpand %ymm0,%ymm5,%ymm5"); /* keep 0x1d only for the flagged bytes */
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* fold the 0x1d correction into Q */
+ asm volatile("vpxor %ymm6,%ymm2,%ymm2"); /* P ^= data */
+ asm volatile("vpxor %ymm6,%ymm4,%ymm4"); /* Q ^= data */
+ asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); /* load disk z for the next round */
+ }
+ asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); /* last round, for the data still held in ymm6 */
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm6,%ymm2,%ymm2");
+ asm volatile("vpxor %ymm6,%ymm4,%ymm4");
+
+ asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); /* non-temporal store of P */
+ asm volatile("vpxor %ymm2,%ymm2,%ymm2");
+ asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); /* non-temporal store of Q */
+ asm volatile("vpxor %ymm4,%ymm4,%ymm4");
+ }
+
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores; also a compiler memory barrier */
+ kernel_fpu_end();
+}
+
+static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* XOR the contribution of data disks start..stop into the existing P and Q, 32 bytes per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); /* ymm0 = 0x1d in every byte */
+
+ for (d = 0 ; d < bytes ; d += 32) {
+ asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); /* Q accumulator starts from disk 'stop' */
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); /* current P */
+ asm volatile("vpxor %ymm4,%ymm2,%ymm2"); /* P ^= data of disk 'stop' */
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5"); /* zero, used as the compare reference */
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); /* 0xff where a Q byte has its top bit set */
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); /* double every Q byte */
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* apply the 0x1d correction */
+ asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2"); /* P ^= data */
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* Q ^= data */
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) { /* disks below 'start': only the doubling step, no data is loaded */
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ }
+ asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); /* merge with the Q already in memory */
+ /* Don't use movntdq for r/w memory area < cache line */
+ asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx2x1 = {
+ raid6_avx21_gen_syndrome, /* full P/Q generation */
+ raid6_avx21_xor_syndrome, /* P/Q update for a start..stop disk range */
+ raid6_have_avx2, /* CPU feature check */
+ "avx2x1", /* presumably the algorithm's name string -- confirm against struct raid6_calls */
+ .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
+};
+
+/*
+ * Unrolled-by-2 AVX2 implementation
+ */
+static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Build P and Q from data disks 0..disks-3, two 32-byte ymm vectors (64 bytes) per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); /* ymm0 = 0x1d in every byte */
+ asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
+
+ /* We uniformly assume a single prefetch covers at least 32 bytes */
+ for (d = 0; d < bytes; d += 64) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
+ asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
+ asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */
+ asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */
+ for (z = z0-1; z >= 0; z--) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
+ asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); /* 0xff where a Q[0] byte has its top bit set */
+ asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); /* same for Q[1] */
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); /* double every Q byte */
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5"); /* 0x1d only for the flagged bytes */
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* apply the correction */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); /* data of disk z */
+ asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2"); /* P ^= data */
+ asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* Q ^= data */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ }
+ asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); /* non-temporal stores of P and Q */
+ asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+ asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+ }
+
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores */
+ kernel_fpu_end();
+}
+
+static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* XOR the contribution of data disks start..stop into the existing P and Q, 64 bytes per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); /* ymm0 = 0x1d in every byte */
+
+ for (d = 0 ; d < bytes ; d += 64) {
+ asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); /* Q accumulators start from disk 'stop' */
+ asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); /* current P */
+ asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+ asm volatile("vpxor %ymm4,%ymm2,%ymm2"); /* P ^= data of disk 'stop' */
+ asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5"); /* zero the compare references */
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); /* 0xff where a Q byte has its top bit set */
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); /* double every Q byte */
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* apply the 0x1d correction */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+ asm volatile("vmovdqa %0,%%ymm7"
+ :: "m" (dptr[z][d+32]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2"); /* P ^= data */
+ asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* Q ^= data */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) { /* disks below 'start': only the doubling step, no data is loaded */
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ }
+ asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); /* merge with the Q already in memory */
+ asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+ /* Don't use movntdq for r/w memory area < cache line */
+ asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
+ asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+ asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx2x2 = {
+ raid6_avx22_gen_syndrome, /* full P/Q generation */
+ raid6_avx22_xor_syndrome, /* P/Q update for a start..stop disk range */
+ raid6_have_avx2, /* CPU feature check */
+ "avx2x2", /* presumably the algorithm's name string -- confirm against struct raid6_calls */
+ .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX2 implementation
+ */
+static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Build P and Q from data disks 0..disks-3, four 32-byte ymm vectors (128 bytes) per step; uses ymm10-ymm15. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); /* ymm0 = 0x1d in every byte */
+ asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */
+ asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */
+ asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */
+ asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */
+ asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */
+ asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */
+ asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */
+ asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */
+ asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */
+
+ for (d = 0; d < bytes; d += 128) {
+ for (z = z0; z >= 0; z--) { /* starts at z0 with zeroed accumulators, so every disk takes the same path */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
+ asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); /* 0xff where a Q byte has its top bit set */
+ asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
+ asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
+ asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); /* double every Q byte */
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+ asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5"); /* 0x1d only for the flagged bytes */
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpand %ymm0,%ymm13,%ymm13");
+ asm volatile("vpand %ymm0,%ymm15,%ymm15");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* apply the correction */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); /* data of disk z */
+ asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
+ asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
+ asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2"); /* P ^= data */
+ asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm13,%ymm10,%ymm10");
+ asm volatile("vpxor %ymm15,%ymm11,%ymm11");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* Q ^= data */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ }
+ asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); /* store, then re-zero each accumulator for the next block */
+ asm volatile("vpxor %ymm2,%ymm2,%ymm2");
+ asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+ asm volatile("vpxor %ymm3,%ymm3,%ymm3");
+ asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
+ asm volatile("vpxor %ymm10,%ymm10,%ymm10");
+ asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
+ asm volatile("vpxor %ymm11,%ymm11,%ymm11");
+ asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vpxor %ymm4,%ymm4,%ymm4");
+ asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+ asm volatile("vpxor %ymm6,%ymm6,%ymm6");
+ asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
+ asm volatile("vpxor %ymm12,%ymm12,%ymm12");
+ asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
+ asm volatile("vpxor %ymm14,%ymm14,%ymm14");
+ }
+
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores */
+ kernel_fpu_end();
+}
+
+static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* XOR the contribution of data disks start..stop into the existing P and Q, 128 bytes per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); /* ymm0 = 0x1d in every byte */
+
+ for (d = 0 ; d < bytes ; d += 128) {
+ asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); /* Q accumulators start from disk 'stop' */
+ asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+ asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
+ asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
+ asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); /* current P */
+ asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+ asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
+ asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
+ asm volatile("vpxor %ymm4,%ymm2,%ymm2"); /* P ^= data of disk 'stop' */
+ asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm12,%ymm10,%ymm10");
+ asm volatile("vpxor %ymm14,%ymm11,%ymm11");
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); /* only offsets d and d+64 are prefetched here */
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5"); /* zero the compare references */
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); /* 0xff where a Q byte has its top bit set */
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+ asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); /* double every Q byte */
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+ asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpand %ymm0,%ymm13,%ymm13");
+ asm volatile("vpand %ymm0,%ymm15,%ymm15");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* apply the 0x1d correction */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+ asm volatile("vmovdqa %0,%%ymm7"
+ :: "m" (dptr[z][d+32]));
+ asm volatile("vmovdqa %0,%%ymm13"
+ :: "m" (dptr[z][d+64]));
+ asm volatile("vmovdqa %0,%%ymm15"
+ :: "m" (dptr[z][d+96]));
+ asm volatile("vpxor %ymm5,%ymm2,%ymm2"); /* P ^= data */
+ asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+ asm volatile("vpxor %ymm13,%ymm10,%ymm10");
+ asm volatile("vpxor %ymm15,%ymm11,%ymm11");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4"); /* Q ^= data */
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ }
+ asm volatile("prefetchnta %0" :: "m" (q[d])); /* prefetch Q before it is read back further down */
+ asm volatile("prefetchnta %0" :: "m" (q[d+64]));
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) { /* disks below 'start': only the doubling step, no data is loaded */
+ asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+ asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+ asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+ asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+ asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+ asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+ asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+ asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+ asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+ asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+ asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+ asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+ asm volatile("vpand %ymm0,%ymm5,%ymm5");
+ asm volatile("vpand %ymm0,%ymm7,%ymm7");
+ asm volatile("vpand %ymm0,%ymm13,%ymm13");
+ asm volatile("vpand %ymm0,%ymm15,%ymm15");
+ asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+ asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+ asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+ asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+ }
+ asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); /* this variant stores with vmovntdq */
+ asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+ asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
+ asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
+ asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); /* merge with the Q already in memory */
+ asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+ asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
+ asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
+ asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+ asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+ asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
+ asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
+ }
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores */
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx2x4 = {
+ raid6_avx24_gen_syndrome, /* full P/Q generation */
+ raid6_avx24_xor_syndrome, /* P/Q update for a start..stop disk range */
+ raid6_have_avx2, /* CPU feature check */
+ "avx2x4", /* presumably the algorithm's name string -- confirm against struct raid6_calls */
+ .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */
+};
+#endif /* CONFIG_X86_64 */
diff --git a/lib/raid/raid6/x86/avx512.c b/lib/raid/raid6/x86/avx512.c
new file mode 100644
index 000000000000..874998bcd7d7
--- /dev/null
+++ b/lib/raid/raid6/x86/avx512.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- --------------------------------------------------------
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ *
+ * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
+ * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * -----------------------------------------------------------------------
+ */
+
+/*
+ * AVX512 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+static const struct raid6_avx512_constants {
+ u64 x1d[8]; /* 8 x 64 bits = 512 bits, the width of one zmm register */
+} raid6_avx512_constants __aligned(512/8) = { /* 64-byte aligned; the users in this file load it with vmovdqa64 */
+ { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, /* 0x1d replicated into every byte */
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
+ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
+};
+
+static int raid6_have_avx512(void)
+{
+	/* The legacy AVX2 and AVX feature bits are checked first ... */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+		return 0;
+
+	/* ... followed by the F, BW, VL and DQ subsets of AVX-512. */
+	return boot_cpu_has(X86_FEATURE_AVX512F) &&
+		boot_cpu_has(X86_FEATURE_AVX512BW) &&
+		boot_cpu_has(X86_FEATURE_AVX512VL) &&
+		boot_cpu_has(X86_FEATURE_AVX512DQ);
+}
+
+static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Build P and Q from data disks 0..disks-3, one 64-byte zmm vector per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0\n\t" /* zmm0 = 0x1d in every byte */
+ "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+ :
+ : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0; d < bytes; d += 64) { /* NOTE(review): presumably bytes is a multiple of 64 -- confirm with callers */
+ asm volatile("prefetchnta %0\n\t"
+ "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
+ "prefetchnta %1\n\t"
+ "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+ "vmovdqa64 %1,%%zmm6" /* data of the next lower disk */
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); /* dptr[z0-1] is read unconditionally, so two data disks are needed */
+ for (z = z0-2; z >= 0; z--) {
+ asm volatile("prefetchnta %0\n\t"
+ "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* k1 bit set where a Q byte has its top bit set */
+ "vpmovm2b %%k1,%%zmm5\n\t" /* expand the mask bits to 0x00/0xff bytes */
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* double every Q byte */
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" /* 0x1d only for the flagged bytes */
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* apply the correction */
+ "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" /* P ^= data */
+ "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+ "vmovdqa64 %0,%%zmm6" /* load disk z for the next round */
+ :
+ : "m" (dptr[z][d]));
+ }
+ asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* last round, for the data still held in zmm6 */
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
+ "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
+ "vmovntdq %%zmm2,%0\n\t" /* non-temporal store of P */
+ "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+ "vmovntdq %%zmm4,%1\n\t" /* non-temporal store of Q */
+ "vpxorq %%zmm4,%%zmm4,%%zmm4"
+ :
+ : "m" (p[d]), "m" (q[d])); /* NOTE(review): p[d]/q[d] are written but listed as input operands -- confirm this is intended */
+ }
+
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores; also a compiler memory barrier */
+ kernel_fpu_end();
+}
+
+static void raid6_avx5121_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* XOR the contribution of data disks start..stop into the existing P and Q, 64 bytes per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0" /* zmm0 = 0x1d in every byte */
+ : : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 64) {
+ asm volatile("vmovdqa64 %0,%%zmm4\n\t" /* Q accumulator starts from disk 'stop' */
+ "vmovdqa64 %1,%%zmm2\n\t" /* current P */
+ "vpxorq %%zmm4,%%zmm2,%%zmm2" /* P ^= data of disk 'stop' */
+ :
+ : "m" (dptr[z0][d]), "m" (p[d]));
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" /* zero, used as the compare reference */
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" /* k1 bit set where a Q byte has its top bit set */
+ "vpmovm2b %%k1,%%zmm5\n\t" /* expand the mask bits to 0x00/0xff bytes */
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* double every Q byte */
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* apply the 0x1d correction */
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+ "vpxorq %%zmm5,%%zmm4,%%zmm4" /* Q ^= data */
+ :
+ : "m" (dptr[z][d]));
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) { /* disks below 'start': only the doubling step, no data is loaded */
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4"
+ :
+ : );
+ }
+ asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" /* merge with the Q already in memory */
+ /* Don't use movntdq for r/w memory area < cache line */
+ "vmovdqa64 %%zmm4,%0\n\t"
+ "vmovdqa64 %%zmm2,%1"
+ :
+ : "m" (q[d]), "m" (p[d])); /* NOTE(review): q[d]/p[d] are written but listed as input operands -- confirm this is intended */
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x1 = {
+ raid6_avx5121_gen_syndrome, /* full P/Q generation */
+ raid6_avx5121_xor_syndrome, /* P/Q update for a start..stop disk range */
+ raid6_have_avx512, /* CPU feature check */
+ "avx512x1", /* presumably the algorithm's name string -- confirm against struct raid6_calls */
+ .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
+};
+
+/*
+ * Unrolled-by-2 AVX512 implementation
+ */
+static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Build P and Q from data disks 0..disks-3, two 64-byte zmm vectors (128 bytes) per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0\n\t" /* zmm0 = 0x1d in every byte */
+ "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
+ :
+ : "m" (raid6_avx512_constants.x1d[0]));
+
+ /* We uniformly assume a single prefetch covers at least 64 bytes */
+ for (d = 0; d < bytes; d += 128) {
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */
+ "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */
+ "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
+ "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
+ for (z = z0-1; z >= 0; z--) {
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* mask bits set where a Q byte has its top bit set */
+ "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t" /* expand the mask bits to 0x00/0xff bytes */
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* double every Q byte */
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" /* 0x1d only for the flagged bytes */
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* apply the correction */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t" /* data of disk z */
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+ }
+ asm volatile("vmovntdq %%zmm2,%0\n\t" /* non-temporal stores of P and Q */
+ "vmovntdq %%zmm3,%1\n\t"
+ "vmovntdq %%zmm4,%2\n\t"
+ "vmovntdq %%zmm6,%3"
+ :
+ : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
+ "m" (q[d+64])); /* NOTE(review): written locations are listed as input operands -- confirm this is intended */
+ }
+
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores */
+ kernel_fpu_end();
+}
+
+static void raid6_avx5122_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* XOR the contribution of data disks start..stop into the existing P and Q, 128 bytes per step. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0" /* zmm0 = 0x1d in every byte */
+ : : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 128) {
+ asm volatile("vmovdqa64 %0,%%zmm4\n\t" /* Q accumulators start from disk 'stop' */
+ "vmovdqa64 %1,%%zmm6\n\t"
+ "vmovdqa64 %2,%%zmm2\n\t" /* current P */
+ "vmovdqa64 %3,%%zmm3\n\t"
+ "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" /* P ^= data of disk 'stop' */
+ "vpxorq %%zmm6,%%zmm3,%%zmm3"
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+ "m" (p[d]), "m" (p[d+64]));
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" /* zero the compare references */
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" /* mask bits set where a Q byte has its top bit set */
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t" /* expand the mask bits to 0x00/0xff bytes */
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* double every Q byte */
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* apply the 0x1d correction */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
+ }
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) { /* disks below 'start': only the doubling step, no data is loaded */
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6"
+ :
+ : );
+ }
+ asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" /* merge with the Q already in memory */
+ "vpxorq %1,%%zmm6,%%zmm6\n\t"
+ /* Don't use movntdq for r/w
+ * memory area < cache line
+ */
+ "vmovdqa64 %%zmm4,%0\n\t"
+ "vmovdqa64 %%zmm6,%1\n\t"
+ "vmovdqa64 %%zmm2,%2\n\t"
+ "vmovdqa64 %%zmm3,%3"
+ :
+ : "m" (q[d]), "m" (q[d+64]), "m" (p[d]),
+ "m" (p[d+64])); /* NOTE(review): written locations are listed as input operands -- confirm this is intended */
+ }
+
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_avx512x2 = {
+ raid6_avx5122_gen_syndrome, /* full P/Q generation */
+ raid6_avx5122_xor_syndrome, /* P/Q update for a start..stop disk range */
+ raid6_have_avx512, /* CPU feature check */
+ "avx512x2", /* presumably the algorithm's name string -- confirm against struct raid6_calls */
+ .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 AVX512 implementation
+ */
+static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{ /* Build P and Q from data disks 0..disks-3, four 64-byte zmm vectors (256 bytes) per step; uses zmm10-zmm15. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0\n\t" /* zmm0 = 0x1d in every byte */
+ "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */
+ "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */
+ "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */
+ "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */
+ "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */
+ "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */
+ "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */
+ "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */
+ "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */
+ :
+ : "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0; d < bytes; d += 256) {
+ for (z = z0; z >= 0; z--) { /* starts at z0 with zeroed accumulators, so every disk takes the same path */
+ asm volatile("prefetchnta %0\n\t"
+ "prefetchnta %1\n\t"
+ "prefetchnta %2\n\t"
+ "prefetchnta %3\n\t"
+ "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" /* mask bits set where a Q byte has its top bit set */
+ "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
+ "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
+ "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t" /* expand the mask bits to 0x00/0xff bytes */
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpmovm2b %%k3,%%zmm13\n\t"
+ "vpmovm2b %%k4,%%zmm15\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* double every Q byte */
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" /* 0x1d only for the flagged bytes */
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+ "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* apply the correction */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t" /* data of disk z */
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vmovdqa64 %2,%%zmm13\n\t"
+ "vmovdqa64 %3,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+ "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+ "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
+ }
+ asm volatile("vmovntdq %%zmm2,%0\n\t" /* store, then re-zero each accumulator for the next block */
+ "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
+ "vmovntdq %%zmm3,%1\n\t"
+ "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
+ "vmovntdq %%zmm10,%2\n\t"
+ "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
+ "vmovntdq %%zmm11,%3\n\t"
+ "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
+ "vmovntdq %%zmm4,%4\n\t"
+ "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vmovntdq %%zmm6,%5\n\t"
+ "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vmovntdq %%zmm12,%6\n\t"
+ "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vmovntdq %%zmm14,%7\n\t"
+ "vpxorq %%zmm14,%%zmm14,%%zmm14"
+ :
+ : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+ "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+ "m" (q[d+128]), "m" (q[d+192])); /* NOTE(review): written locations are listed as input operands -- confirm this is intended */
+ }
+
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores */
+ kernel_fpu_end();
+}
+
+static void raid6_avx5124_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{ /* XOR the contribution of data disks start..stop into the existing P and Q, 256 bytes per step; uses zmm10-zmm15. */
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("vmovdqa64 %0,%%zmm0" /* zmm0 = 0x1d in every byte */
+ :: "m" (raid6_avx512_constants.x1d[0]));
+
+ for (d = 0 ; d < bytes ; d += 256) {
+ asm volatile("vmovdqa64 %0,%%zmm4\n\t" /* Q accumulators start from disk 'stop' */
+ "vmovdqa64 %1,%%zmm6\n\t"
+ "vmovdqa64 %2,%%zmm12\n\t"
+ "vmovdqa64 %3,%%zmm14\n\t"
+ "vmovdqa64 %4,%%zmm2\n\t" /* current P */
+ "vmovdqa64 %5,%%zmm3\n\t"
+ "vmovdqa64 %6,%%zmm10\n\t"
+ "vmovdqa64 %7,%%zmm11\n\t"
+ "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" /* P ^= data of disk 'stop' */
+ "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t"
+ "vpxorq %%zmm14,%%zmm11,%%zmm11"
+ :
+ : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]),
+ "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]),
+ "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+ "m" (p[d+192]));
+ /* P/Q data pages */
+ for (z = z0-1 ; z >= start ; z--) {
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" /* zero the compare references */
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+ "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+ "prefetchnta %0\n\t" /* only offsets d and d+128 are prefetched here */
+ "prefetchnta %2\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" /* mask bits set where a Q byte has its top bit set */
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+ "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t" /* expand the mask bits to 0x00/0xff bytes */
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpmovm2b %%k3,%%zmm13\n\t"
+ "vpmovm2b %%k4,%%zmm15\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" /* double every Q byte */
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+ "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* apply the 0x1d correction */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
+ "vmovdqa64 %0,%%zmm5\n\t"
+ "vmovdqa64 %1,%%zmm7\n\t"
+ "vmovdqa64 %2,%%zmm13\n\t"
+ "vmovdqa64 %3,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" /* P ^= data */
+ "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
+ "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
+ "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" /* Q ^= data */
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14"
+ :
+ : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
+ "m" (dptr[z][d+128]),
+ "m" (dptr[z][d+192]));
+ }
+ asm volatile("prefetchnta %0\n\t" /* prefetch Q before it is read back further down */
+ "prefetchnta %1\n\t"
+ :
+ : "m" (q[d]), "m" (q[d+128]));
+ /* P/Q left side optimization */
+ for (z = start-1 ; z >= 0 ; z--) { /* disks below 'start': only the doubling step, no data is loaded */
+ asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t"
+ "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t"
+ "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t"
+ "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t"
+ "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t"
+ "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t"
+ "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t"
+ "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t"
+ "vpmovm2b %%k1,%%zmm5\n\t"
+ "vpmovm2b %%k2,%%zmm7\n\t"
+ "vpmovm2b %%k3,%%zmm13\n\t"
+ "vpmovm2b %%k4,%%zmm15\n\t"
+ "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
+ "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
+ "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
+ "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
+ "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
+ "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
+ "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
+ "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
+ "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
+ "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
+ "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
+ "vpxorq %%zmm15,%%zmm14,%%zmm14"
+ :
+ : );
+ }
+ asm volatile("vmovntdq %%zmm2,%0\n\t" /* this variant stores with vmovntdq */
+ "vmovntdq %%zmm3,%1\n\t"
+ "vmovntdq %%zmm10,%2\n\t"
+ "vmovntdq %%zmm11,%3\n\t"
+ "vpxorq %4,%%zmm4,%%zmm4\n\t" /* merge with the Q already in memory */
+ "vpxorq %5,%%zmm6,%%zmm6\n\t"
+ "vpxorq %6,%%zmm12,%%zmm12\n\t"
+ "vpxorq %7,%%zmm14,%%zmm14\n\t"
+ "vmovntdq %%zmm4,%4\n\t"
+ "vmovntdq %%zmm6,%5\n\t"
+ "vmovntdq %%zmm12,%6\n\t"
+ "vmovntdq %%zmm14,%7"
+ :
+ : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
+ "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
+ "m" (q[d+128]), "m" (q[d+192])); /* NOTE(review): written locations are listed as input operands -- confirm this is intended */
+ }
+ asm volatile("sfence" : : : "memory"); /* store fence after the vmovntdq stores */
+ kernel_fpu_end();
+}
+const struct raid6_calls raid6_avx512x4 = {
+ raid6_avx5124_gen_syndrome, /* full P/Q generation */
+ raid6_avx5124_xor_syndrome, /* P/Q update for a start..stop disk range */
+ raid6_have_avx512, /* CPU feature check */
+ "avx512x4", /* presumably the algorithm's name string -- confirm against struct raid6_calls */
+ .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */
+};
+#endif
diff --git a/lib/raid/raid6/x86/mmx.c b/lib/raid/raid6/x86/mmx.c
new file mode 100644
index 000000000000..7e9810669347
--- /dev/null
+++ b/lib/raid/raid6/x86/mmx.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * lib/raid/raid6/x86/mmx.c
+ *
+ * MMX implementation of RAID-6 syndrome functions
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+/* Shared with raid6/sse1.c */
+const struct raid6_mmx_constants {
+ u64 x1d;
+} raid6_mmx_constants = {
+ 0x1d1d1d1d1d1d1d1dULL,
+};
+
+static int raid6_have_mmx(void)
+{
+ /* boot_cpu_has() is used here in the sense of "all CPUs", not only the boot CPU */
+ const int mmx_present = boot_cpu_has(X86_FEATURE_MMX);
+
+ return mmx_present;
+}
+
+/*
+ * Plain MMX implementation
+ *
+ * Builds the P (XOR parity) and Q (RS syndrome) blocks from the data
+ * blocks, eight bytes per pass of the outer loop.
+ *
+ * disks: entries in ptrs; data blocks are 0..disks-3, then P, then Q
+ * bytes: length of each block, walked in steps of 8
+ * ptrs: one buffer pointer per block
+ */
+static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); /* mm0 = 0x1d in every byte */
+ asm volatile("pxor %mm5,%mm5"); /* Zero temp */
+
+ for ( d = 0 ; d < bytes ; d += 8 ) {
+ asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+ asm volatile("movq %mm2,%mm4"); /* Q[0] */
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); /* mm6 = data of disk z */
+ asm volatile("pcmpgtb %mm4,%mm5"); /* mm5 = 0xff where a Q byte has its top bit set */
+ asm volatile("paddb %mm4,%mm4"); /* double every Q byte */
+ asm volatile("pand %mm0,%mm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pxor %mm5,%mm4"); /* fold that 0x1d into Q */
+ asm volatile("pxor %mm5,%mm5"); /* temp back to zero for the next pass */
+ asm volatile("pxor %mm6,%mm2"); /* P ^= data */
+ asm volatile("pxor %mm6,%mm4"); /* Q ^= data */
+ }
+ asm volatile("movq %%mm2,%0" : "=m" (p[d])); /* store 8 bytes of P */
+ asm volatile("pxor %mm2,%mm2");
+ asm volatile("movq %%mm4,%0" : "=m" (q[d])); /* store 8 bytes of Q */
+ asm volatile("pxor %mm4,%mm4");
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_mmxx1 = { /* descriptor for the plain (8 bytes per pass) MMX routine */
+ raid6_mmx1_gen_syndrome, /* P/Q syndrome generation */
+ NULL, /* XOR not yet implemented */
+ raid6_have_mmx, /* usable only when the CPU reports MMX */
+ "mmxx1", /* name string */
+ 0 /* NOTE(review): presumably the priority slot -- confirm against struct raid6_calls */
+};
+
+/*
+ * Unrolled-by-2 MMX implementation
+ *
+ * Same P/Q computation as the plain MMX routine, but two 8-byte lanes
+ * (offsets d and d+8) are carried through each pass, so the outer loop
+ * advances 16 bytes at a time.
+ *
+ * disks: entries in ptrs; data blocks are 0..disks-3, then P, then Q
+ * bytes: length of each block, walked in steps of 16
+ * ptrs: one buffer pointer per block
+ */
+static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+
+ asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); /* mm0 = 0x1d in every byte */
+ asm volatile("pxor %mm5,%mm5"); /* Zero temp */
+ asm volatile("pxor %mm7,%mm7"); /* Zero temp */
+
+ for ( d = 0 ; d < bytes ; d += 16 ) {
+ asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+ asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */
+ asm volatile("movq %mm2,%mm4"); /* Q[0] */
+ asm volatile("movq %mm3,%mm6"); /* Q[1] */
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ asm volatile("pcmpgtb %mm4,%mm5"); /* mask of Q[0] bytes with the top bit set */
+ asm volatile("pcmpgtb %mm6,%mm7"); /* mask of Q[1] bytes with the top bit set */
+ asm volatile("paddb %mm4,%mm4"); /* double Q[0] */
+ asm volatile("paddb %mm6,%mm6"); /* double Q[1] */
+ asm volatile("pand %mm0,%mm5"); /* 0x1d where the top bit was set */
+ asm volatile("pand %mm0,%mm7");
+ asm volatile("pxor %mm5,%mm4"); /* fold 0x1d into Q[0] */
+ asm volatile("pxor %mm7,%mm6"); /* fold 0x1d into Q[1] */
+ asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); /* temps are reused to hold disk z data */
+ asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
+ asm volatile("pxor %mm5,%mm2"); /* P[0] ^= data */
+ asm volatile("pxor %mm7,%mm3"); /* P[1] ^= data */
+ asm volatile("pxor %mm5,%mm4"); /* Q[0] ^= data */
+ asm volatile("pxor %mm7,%mm6"); /* Q[1] ^= data */
+ asm volatile("pxor %mm5,%mm5"); /* temps back to zero for the next pass */
+ asm volatile("pxor %mm7,%mm7");
+ }
+ asm volatile("movq %%mm2,%0" : "=m" (p[d])); /* store 16 bytes of P ... */
+ asm volatile("movq %%mm3,%0" : "=m" (p[d+8]));
+ asm volatile("movq %%mm4,%0" : "=m" (q[d])); /* ... and 16 bytes of Q */
+ asm volatile("movq %%mm6,%0" : "=m" (q[d+8]));
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_mmxx2 = { /* descriptor for the unrolled-by-2 MMX routine */
+ raid6_mmx2_gen_syndrome, /* P/Q syndrome generation */
+ NULL, /* XOR not yet implemented */
+ raid6_have_mmx, /* usable only when the CPU reports MMX */
+ "mmxx2", /* name string */
+ 0 /* NOTE(review): presumably the priority slot -- confirm against struct raid6_calls */
+};
diff --git a/lib/raid/raid6/x86/recov_avx2.c b/lib/raid/raid6/x86/recov_avx2.c
new file mode 100644
index 000000000000..19fbd9c4dce6
--- /dev/null
+++ b/lib/raid/raid6/x86/recov_avx2.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+static int raid6_has_avx2(void)
+{
+ /* Both feature bits are required; AVX2 is tested first. */
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return 0;
+ return 1;
+}
+
+static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Rebuild the two failed data blocks ptrs[faila] and ptrs[failb]
+ from P (ptrs[disks-2]) and Q (ptrs[disks-1]).
+ bytes is consumed 64 at a time on x86-64, 32 otherwise.
+
+ Compute syndrome with zero for the missing data pages
+ Use the dead data pages as temporary storage for
+ delta p and delta q */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /* ymm7 = 0x0f broadcast to every byte (nibble mask) */
+ asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0]));
+ asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32]));
+ asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0]));
+ asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32]));
+ asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0]));
+ asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32]));
+ asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0]));
+ asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32]));
+
+ /*
+ * 1 = dq[0] ^ q[0]
+ * 9 = dq[32] ^ q[32]
+ * 0 = dp[0] ^ p[0]
+ * 8 = dp[32] ^ p[32]
+ */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); /* table indexed by low nibbles */
+ asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); /* table indexed by high nibbles */
+
+ asm volatile("vpsraw $4, %ymm1, %ymm3");
+ asm volatile("vpsraw $4, %ymm9, %ymm12");
+ asm volatile("vpand %ymm7, %ymm1, %ymm1"); /* low nibbles */
+ asm volatile("vpand %ymm7, %ymm9, %ymm9");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3"); /* high nibbles */
+ asm volatile("vpand %ymm7, %ymm12, %ymm12");
+ asm volatile("vpshufb %ymm9, %ymm4, %ymm14");
+ asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm12, %ymm5, %ymm15");
+ asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+ asm volatile("vpxor %ymm14, %ymm15, %ymm15");
+ asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+ /*
+ * 5 = qx[0]
+ * 15 = qx[32]
+ */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+ asm volatile("vpsraw $4, %ymm0, %ymm2");
+ asm volatile("vpsraw $4, %ymm8, %ymm6");
+ asm volatile("vpand %ymm7, %ymm0, %ymm3");
+ asm volatile("vpand %ymm7, %ymm8, %ymm14");
+ asm volatile("vpand %ymm7, %ymm2, %ymm2");
+ asm volatile("vpand %ymm7, %ymm6, %ymm6");
+ asm volatile("vpshufb %ymm14, %ymm4, %ymm12");
+ asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm6, %ymm1, %ymm13");
+ asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm12, %ymm13, %ymm13");
+
+ /*
+ * 1 = pbmul[px[0]]
+ * 13 = pbmul[px[32]]
+ */
+ asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm15, %ymm13, %ymm13");
+
+ /*
+ * 1 = db = DQ
+ * 13 = db[32] = DQ[32]
+ */
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+ asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32]));
+ asm volatile("vpxor %ymm1, %ymm0, %ymm0"); /* 0 = px ^ db */
+ asm volatile("vpxor %ymm13, %ymm8, %ymm8");
+
+ asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+ asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+#else
+ asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q));
+ asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p));
+ asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq));
+ asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp));
+
+ /* 1 = dq ^ q; 0 = dp ^ p */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16]));
+
+ /*
+ * 1 = dq ^ q
+ * 3 = (dq ^ q) >> 4
+ */
+ asm volatile("vpsraw $4, %ymm1, %ymm3");
+ asm volatile("vpand %ymm7, %ymm1, %ymm1");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3");
+ asm volatile("vpshufb %ymm1, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm3, %ymm5, %ymm5");
+ asm volatile("vpxor %ymm4, %ymm5, %ymm5");
+
+ /* 5 = qx */
+
+ asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16]));
+
+ asm volatile("vpsraw $4, %ymm0, %ymm2");
+ asm volatile("vpand %ymm7, %ymm0, %ymm3");
+ asm volatile("vpand %ymm7, %ymm2, %ymm2");
+ asm volatile("vpshufb %ymm3, %ymm4, %ymm4");
+ asm volatile("vpshufb %ymm2, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm4, %ymm1, %ymm1");
+
+ /* 1 = pbmul[px] */
+ asm volatile("vpxor %ymm5, %ymm1, %ymm1");
+ /* 1 = db = DQ */
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0]));
+
+ asm volatile("vpxor %ymm1, %ymm0, %ymm0"); /* 0 = px ^ db */
+ asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0]));
+
+ bytes -= 32;
+ p += 32;
+ q += 32;
+ dp += 32;
+ dq += 32;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Rebuild the failed data block ptrs[faila] and update P
+ (ptrs[disks-2]) using Q (ptrs[disks-1]).
+ bytes is consumed 64 at a time on x86-64, 32 otherwise.
+
+ Compute syndrome with zero for the missing data page
+ Use the dead data page as temporary storage for delta q */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); /* ymm7 = 0x0f in every byte (nibble mask) */
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+ asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32]));
+ asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+ asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32]));
+
+ /*
+ * 3 = q[0] ^ dq[0]
+ * 8 = q[32] ^ dq[32]
+ */
+ asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+ asm volatile("vmovapd %ymm0, %ymm13"); /* second copy of the table for the upper 32 bytes */
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+ asm volatile("vmovapd %ymm1, %ymm14");
+
+ asm volatile("vpsraw $4, %ymm3, %ymm6");
+ asm volatile("vpsraw $4, %ymm8, %ymm12");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3"); /* low nibbles */
+ asm volatile("vpand %ymm7, %ymm8, %ymm8");
+ asm volatile("vpand %ymm7, %ymm6, %ymm6"); /* high nibbles */
+ asm volatile("vpand %ymm7, %ymm12, %ymm12");
+ asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+ asm volatile("vpshufb %ymm8, %ymm13, %ymm13");
+ asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+ asm volatile("vpshufb %ymm12, %ymm14, %ymm14");
+ asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm13, %ymm14, %ymm14");
+
+ /*
+ * 1 = qmul[q[0] ^ dq[0]]
+ * 14 = qmul[q[32] ^ dq[32]]
+ */
+ asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+ asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32]));
+ asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+ asm volatile("vpxor %ymm14, %ymm12, %ymm12");
+
+ /*
+ * 2 = p[0] ^ qmul[q[0] ^ dq[0]]
+ * 12 = p[32] ^ qmul[q[32] ^ dq[32]]
+ */
+
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); /* recovered data */
+ asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32]));
+ asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); /* updated P */
+ asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32]));
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+#else
+ asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0]));
+ asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0]));
+
+ /* 3 = q ^ dq */
+
+ asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0]));
+ asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %ymm3, %ymm6");
+ asm volatile("vpand %ymm7, %ymm3, %ymm3");
+ asm volatile("vpand %ymm7, %ymm6, %ymm6");
+ asm volatile("vpshufb %ymm3, %ymm0, %ymm0");
+ asm volatile("vpshufb %ymm6, %ymm1, %ymm1");
+ asm volatile("vpxor %ymm0, %ymm1, %ymm1");
+
+ /* 1 = qmul[q ^ dq] */
+
+ asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0]));
+ asm volatile("vpxor %ymm1, %ymm2, %ymm2");
+
+ /* 2 = p ^ qmul[q ^ dq] */
+
+ asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); /* recovered data */
+ asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); /* updated P */
+
+ bytes -= 32;
+ p += 32;
+ q += 32;
+ dq += 32;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx2 = {
+ /* x86-64 builds process two 32-byte vectors per pass, other builds one */
+#ifdef CONFIG_X86_64
+ .name = "avx2x2",
+#else
+ .name = "avx2x1",
+#endif
+ .priority = 2,
+ .valid = raid6_has_avx2,
+ .data2 = raid6_2data_recov_avx2,
+ .datap = raid6_datap_recov_avx2,
+};
diff --git a/lib/raid/raid6/x86/recov_avx512.c b/lib/raid/raid6/x86/recov_avx512.c
new file mode 100644
index 000000000000..143f4976b2ad
--- /dev/null
+++ b/lib/raid/raid6/x86/recov_avx512.c
@@ -0,0 +1,377 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Gayatri Kammela <gayatri.kammela@intel.com>
+ * Author: Megha Dey <megha.dey@linux.intel.com>
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+static int raid6_has_avx512(void)
+{
+ /* Every listed feature bit is required; checking stops at the first miss. */
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_AVX))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_AVX512F))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_AVX512BW))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_AVX512VL))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_AVX512DQ))
+ return 0;
+ return 1;
+}
+
+/*
+ * Rebuild two failed data blocks from P and Q using AVX-512.
+ *
+ * @disks: entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
+ * @bytes: block length; consumed 128 bytes per pass on x86-64, 64 otherwise
+ * @faila: index of the first failed data block
+ * @failb: index of the second failed data block
+ * @ptrs: block pointers; rewritten temporarily for the syndrome call and
+ * restored before the recovery loop
+ *
+ * The recovered data is written to the buffers at ptrs[faila] and
+ * ptrs[failb].
+ *
+ * All asm statements that store to memory name the destination as an "=m"
+ * output operand (as the AVX2 and SSSE3 variants do); an asm statement must
+ * not modify an operand it declares as an input.
+ */
+static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /*
+ * Compute syndrome with zero for the missing data pages
+ * Use the dead data pages as temporary storage for
+ * delta p and delta q
+ */
+
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ /* zmm7 = 0x0f broadcast to every byte (nibble mask) */
+ asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+ "vmovdqa64 %1, %%zmm9\n\t"
+ "vmovdqa64 %2, %%zmm0\n\t"
+ "vmovdqa64 %3, %%zmm8\n\t"
+ "vpxorq %4, %%zmm1, %%zmm1\n\t"
+ "vpxorq %5, %%zmm9, %%zmm9\n\t"
+ "vpxorq %6, %%zmm0, %%zmm0\n\t"
+ "vpxorq %7, %%zmm8, %%zmm8"
+ :
+ : "m" (q[0]), "m" (q[64]), "m" (p[0]),
+ "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
+ "m" (dp[0]), "m" (dp[64]));
+
+ /*
+ * 1 = dq[0] ^ q[0]
+ * 9 = dq[64] ^ q[64]
+ * 0 = dp[0] ^ p[0]
+ * 8 = dp[64] ^ p[64]
+ */
+
+ /* 4 = table indexed by low nibbles, 5 = table indexed by high nibbles */
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm5"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+ "vpsraw $4, %%zmm9, %%zmm12\n\t"
+ "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+ "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+ "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
+ "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
+ "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+ "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
+ "vpxorq %%zmm4, %%zmm5, %%zmm5"
+ :
+ : );
+
+ /*
+ * 5 = qx[0]
+ * 15 = qx[64]
+ */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm1\n\t"
+ "vpsraw $4, %%zmm0, %%zmm2\n\t"
+ "vpsraw $4, %%zmm8, %%zmm6\n\t"
+ "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
+ "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+ "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+ "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
+ "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
+ "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm12, %%zmm13, %%zmm13"
+ :
+ : "m" (pbmul[0]), "m" (pbmul[16]));
+
+ /*
+ * 1 = pbmul[px[0]]
+ * 13 = pbmul[px[64]]
+ */
+ asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm15, %%zmm13, %%zmm13"
+ :
+ : );
+
+ /*
+ * 1 = db = DQ
+ * 13 = db[64] = DQ[64]
+ */
+ asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+ "vmovdqa64 %%zmm13,%1\n\t"
+ "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+ "vpxorq %%zmm13, %%zmm8, %%zmm8"
+ : "=m" (dq[0]), "=m" (dq[64])
+ : );
+
+ asm volatile("vmovdqa64 %%zmm0, %0\n\t"
+ "vmovdqa64 %%zmm8, %1"
+ : "=m" (dp[0]), "=m" (dp[64])
+ : );
+
+ bytes -= 128;
+ p += 128;
+ q += 128;
+ dp += 128;
+ dq += 128;
+#else
+ asm volatile("vmovdqa64 %0, %%zmm1\n\t"
+ "vmovdqa64 %1, %%zmm0\n\t"
+ "vpxorq %2, %%zmm1, %%zmm1\n\t"
+ "vpxorq %3, %%zmm0, %%zmm0"
+ :
+ : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));
+
+ /* 1 = dq ^ q; 0 = dp ^ p */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm5"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ /*
+ * 1 = dq ^ q
+ * 3 = (dq ^ q) >> 4
+ */
+ asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
+ "vpxorq %%zmm4, %%zmm5, %%zmm5"
+ :
+ : );
+
+ /* 5 = qx */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
+ "vbroadcasti64x2 %1, %%zmm1"
+ :
+ : "m" (pbmul[0]), "m" (pbmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
+ "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
+ "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
+ "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm4, %%zmm1, %%zmm1"
+ :
+ : );
+
+ /* 1 = pbmul[px] */
+ asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
+ /* 1 = db = DQ */
+ "vmovdqa64 %%zmm1, %0"
+ : "=m" (dq[0])
+ : );
+
+ asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
+ "vmovdqa64 %%zmm0, %0"
+ : "=m" (dp[0])
+ : );
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dp += 64;
+ dq += 64;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+/*
+ * Rebuild one failed data block and P from Q using AVX-512.
+ *
+ * @disks: entries in @ptrs; P is ptrs[disks-2], Q is ptrs[disks-1]
+ * @bytes: block length; consumed 128 bytes per pass on x86-64, 64 otherwise
+ * @faila: index of the failed data block
+ * @ptrs: block pointers; rewritten temporarily for the syndrome call and
+ * restored before the recovery loop
+ *
+ * The recovered data is written to the buffer at ptrs[faila] and the P
+ * buffer is updated in place.
+ *
+ * All asm statements that store to memory name the destination as an "=m"
+ * output operand (as the AVX2 and SSSE3 variants do); an asm statement must
+ * not modify an operand it declares as an input.
+ */
+static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ const u8 x0f = 0x0f;
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /*
+ * Compute syndrome with zero for the missing data page
+ * Use the dead data page as temporary storage for delta q
+ */
+
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ /* zmm7 = 0x0f broadcast to every byte (nibble mask) */
+ asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+ "vmovdqa64 %1, %%zmm8\n\t"
+ "vpxorq %2, %%zmm3, %%zmm3\n\t"
+ "vpxorq %3, %%zmm8, %%zmm8"
+ :
+ : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
+ "m" (q[64]));
+
+ /*
+ * 3 = q[0] ^ dq[0]
+ * 8 = q[64] ^ dq[64]
+ */
+ asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+ "vmovapd %%zmm0, %%zmm13\n\t"
+ "vbroadcasti64x2 %1, %%zmm1\n\t"
+ "vmovapd %%zmm1, %%zmm14"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+ "vpsraw $4, %%zmm8, %%zmm12\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
+ "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+ "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
+ "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+ "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
+ "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+ "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
+ "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm13, %%zmm14, %%zmm14"
+ :
+ : );
+
+ /*
+ * 1 = qmul[q[0] ^ dq[0]]
+ * 14 = qmul[q[64] ^ dq[64]]
+ */
+ asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+ "vmovdqa64 %1, %%zmm12\n\t"
+ "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
+ "vpxorq %%zmm14, %%zmm12, %%zmm12"
+ :
+ : "m" (p[0]), "m" (p[64]));
+
+ /*
+ * 2 = p[0] ^ qmul[q[0] ^ dq[0]]
+ * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
+ */
+
+ asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+ "vmovdqa64 %%zmm14, %1\n\t"
+ "vmovdqa64 %%zmm2, %2\n\t"
+ "vmovdqa64 %%zmm12,%3"
+ : "=m" (dq[0]), "=m" (dq[64]), "=m" (p[0]),
+ "=m" (p[64])
+ : );
+
+ bytes -= 128;
+ p += 128;
+ q += 128;
+ dq += 128;
+#else
+ asm volatile("vmovdqa64 %0, %%zmm3\n\t"
+ "vpxorq %1, %%zmm3, %%zmm3"
+ :
+ : "m" (dq[0]), "m" (q[0]));
+
+ /* 3 = q ^ dq */
+
+ asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
+ "vbroadcasti64x2 %1, %%zmm1"
+ :
+ : "m" (qmul[0]), "m" (qmul[16]));
+
+ asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
+ "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
+ "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
+ "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
+ "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
+ "vpxorq %%zmm0, %%zmm1, %%zmm1"
+ :
+ : );
+
+ /* 1 = qmul[q ^ dq] */
+
+ asm volatile("vmovdqa64 %0, %%zmm2\n\t"
+ "vpxorq %%zmm1, %%zmm2, %%zmm2"
+ :
+ : "m" (p[0]));
+
+ /* 2 = p ^ qmul[q ^ dq] */
+
+ asm volatile("vmovdqa64 %%zmm1, %0\n\t"
+ "vmovdqa64 %%zmm2, %1"
+ : "=m" (dq[0]), "=m" (p[0])
+ : );
+
+ bytes -= 64;
+ p += 64;
+ q += 64;
+ dq += 64;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_avx512 = {
+ /* x86-64 builds process two 64-byte vectors per pass, other builds one */
+#ifdef CONFIG_X86_64
+ .name = "avx512x2",
+#else
+ .name = "avx512x1",
+#endif
+ .priority = 3,
+ .valid = raid6_has_avx512,
+ .data2 = raid6_2data_recov_avx512,
+ .datap = raid6_datap_recov_avx512,
+};
diff --git a/lib/raid/raid6/x86/recov_ssse3.c b/lib/raid/raid6/x86/recov_ssse3.c
new file mode 100644
index 000000000000..146cdbf465bd
--- /dev/null
+++ b/lib/raid/raid6/x86/recov_ssse3.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2012 Intel Corporation
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+static int raid6_has_ssse3(void)
+{
+ /* XMM, XMM2 and SSSE3 are all required; checking stops at the first miss. */
+ if (!boot_cpu_has(X86_FEATURE_XMM))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return 0;
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return 0;
+ return 1;
+}
+
+static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila,
+ int failb, void **ptrs)
+{
+ u8 *p, *q, *dp, *dq;
+ const u8 *pbmul; /* P multiplier table for B data */
+ const u8 *qmul; /* Q multiplier table (for both) */
+ static const u8 __aligned(16) x0f[16] = {
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Rebuild the two failed data blocks ptrs[faila] and ptrs[failb]
+ from P (ptrs[disks-2]) and Q (ptrs[disks-1]).
+ bytes is consumed 32 at a time on x86-64, 16 otherwise.
+
+ Compute syndrome with zero for the missing data pages
+ Use the dead data pages as temporary storage for
+ delta p and delta q */
+ dp = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-2] = dp;
+ dq = (u8 *)ptrs[failb];
+ ptrs[failb] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+ ptrs[disks-2] = p;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
+ raid6_gfexp[failb]]];
+
+ kernel_fpu_begin();
+
+ asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); /* xmm7 = nibble mask */
+
+#ifdef CONFIG_X86_64
+ asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); /* tables kept in registers across the loop */
+ asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0]));
+ asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16]));
+#endif
+
+ /* Now do it... */
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ /* xmm6, xmm14, xmm15 hold qmul[0], pbmul[0], pbmul[16] (loaded above) */
+
+ asm volatile("movdqa %0,%%xmm1" : : "m" (q[0]));
+ asm volatile("movdqa %0,%%xmm9" : : "m" (q[16]));
+ asm volatile("movdqa %0,%%xmm0" : : "m" (p[0]));
+ asm volatile("movdqa %0,%%xmm8" : : "m" (p[16]));
+ asm volatile("pxor %0,%%xmm1" : : "m" (dq[0]));
+ asm volatile("pxor %0,%%xmm9" : : "m" (dq[16]));
+ asm volatile("pxor %0,%%xmm0" : : "m" (dp[0]));
+ asm volatile("pxor %0,%%xmm8" : : "m" (dp[16]));
+
+ /* xmm0/8 = px */
+
+ asm volatile("movdqa %xmm6,%xmm4");
+ asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+ asm volatile("movdqa %xmm6,%xmm12");
+ asm volatile("movdqa %xmm5,%xmm13");
+ asm volatile("movdqa %xmm1,%xmm3");
+ asm volatile("movdqa %xmm9,%xmm11");
+ asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */
+ asm volatile("movdqa %xmm8,%xmm10");
+ asm volatile("psraw $4,%xmm1");
+ asm volatile("psraw $4,%xmm9");
+ asm volatile("pand %xmm7,%xmm3"); /* low nibbles */
+ asm volatile("pand %xmm7,%xmm11");
+ asm volatile("pand %xmm7,%xmm1"); /* high nibbles */
+ asm volatile("pand %xmm7,%xmm9");
+ asm volatile("pshufb %xmm3,%xmm4");
+ asm volatile("pshufb %xmm11,%xmm12");
+ asm volatile("pshufb %xmm1,%xmm5");
+ asm volatile("pshufb %xmm9,%xmm13");
+ asm volatile("pxor %xmm4,%xmm5");
+ asm volatile("pxor %xmm12,%xmm13");
+
+ /* xmm5/13 = qx */
+
+ asm volatile("movdqa %xmm14,%xmm4");
+ asm volatile("movdqa %xmm15,%xmm1");
+ asm volatile("movdqa %xmm14,%xmm12");
+ asm volatile("movdqa %xmm15,%xmm9");
+ asm volatile("movdqa %xmm2,%xmm3");
+ asm volatile("movdqa %xmm10,%xmm11");
+ asm volatile("psraw $4,%xmm2");
+ asm volatile("psraw $4,%xmm10");
+ asm volatile("pand %xmm7,%xmm3");
+ asm volatile("pand %xmm7,%xmm11");
+ asm volatile("pand %xmm7,%xmm2");
+ asm volatile("pand %xmm7,%xmm10");
+ asm volatile("pshufb %xmm3,%xmm4");
+ asm volatile("pshufb %xmm11,%xmm12");
+ asm volatile("pshufb %xmm2,%xmm1");
+ asm volatile("pshufb %xmm10,%xmm9");
+ asm volatile("pxor %xmm4,%xmm1");
+ asm volatile("pxor %xmm12,%xmm9");
+
+ /* xmm1/9 = pbmul[px] */
+ asm volatile("pxor %xmm5,%xmm1");
+ asm volatile("pxor %xmm13,%xmm9");
+ /* xmm1/9 = db = DQ */
+ asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0]));
+ asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16]));
+
+ asm volatile("pxor %xmm1,%xmm0"); /* xmm0/8 = px ^ db */
+ asm volatile("pxor %xmm9,%xmm8");
+ asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0]));
+ asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16]));
+
+ bytes -= 32;
+ p += 32;
+ q += 32;
+ dp += 32;
+ dq += 32;
+#else
+ asm volatile("movdqa %0,%%xmm1" : : "m" (*q));
+ asm volatile("movdqa %0,%%xmm0" : : "m" (*p));
+ asm volatile("pxor %0,%%xmm1" : : "m" (*dq));
+ asm volatile("pxor %0,%%xmm0" : : "m" (*dp));
+
+ /* 1 = dq ^ q
+ * 0 = dp ^ p
+ */
+ asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0]));
+ asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16]));
+
+ asm volatile("movdqa %xmm1,%xmm3");
+ asm volatile("psraw $4,%xmm1");
+ asm volatile("pand %xmm7,%xmm3"); /* low nibbles */
+ asm volatile("pand %xmm7,%xmm1"); /* high nibbles */
+ asm volatile("pshufb %xmm3,%xmm4");
+ asm volatile("pshufb %xmm1,%xmm5");
+ asm volatile("pxor %xmm4,%xmm5");
+
+ asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */
+
+ /* xmm5 = qx */
+
+ asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0]));
+ asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16]));
+ asm volatile("movdqa %xmm2,%xmm3");
+ asm volatile("psraw $4,%xmm2");
+ asm volatile("pand %xmm7,%xmm3");
+ asm volatile("pand %xmm7,%xmm2");
+ asm volatile("pshufb %xmm3,%xmm4");
+ asm volatile("pshufb %xmm2,%xmm1");
+ asm volatile("pxor %xmm4,%xmm1");
+
+ /* xmm1 = pbmul[px] */
+ asm volatile("pxor %xmm5,%xmm1");
+ /* xmm1 = db = DQ */
+ asm volatile("movdqa %%xmm1,%0" : "=m" (*dq));
+
+ asm volatile("pxor %xmm1,%xmm0"); /* xmm0 = px ^ db */
+ asm volatile("movdqa %%xmm0,%0" : "=m" (*dp));
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dp += 16;
+ dq += 16;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+
+static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila,
+ void **ptrs)
+{
+ u8 *p, *q, *dq;
+ const u8 *qmul; /* Q multiplier table */
+ static const u8 __aligned(16) x0f[16] = {
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
+ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f};
+
+ p = (u8 *)ptrs[disks-2];
+ q = (u8 *)ptrs[disks-1];
+
+ /* Rebuild the failed data block ptrs[faila] and update P
+ (ptrs[disks-2]) using Q (ptrs[disks-1]).
+ bytes is consumed 32 at a time on x86-64, 16 otherwise.
+
+ Compute syndrome with zero for the missing data page
+ Use the dead data page as temporary storage for delta q */
+ dq = (u8 *)ptrs[faila];
+ ptrs[faila] = raid6_get_zero_page();
+ ptrs[disks-1] = dq;
+
+ raid6_call.gen_syndrome(disks, bytes, ptrs);
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+ ptrs[disks-1] = q;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ kernel_fpu_begin();
+
+ asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); /* xmm7 = nibble mask */
+
+ while (bytes) {
+#ifdef CONFIG_X86_64
+ asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+ asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16]));
+ asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+ asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+
+ /* xmm3 = q[0] ^ dq[0] */
+
+ asm volatile("pxor %0, %%xmm4" : : "m" (q[16]));
+ asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+ /* xmm4 = q[16] ^ dq[16] */
+
+ asm volatile("movdqa %xmm3, %xmm6");
+ asm volatile("movdqa %xmm4, %xmm8");
+
+ /* xmm4 = xmm8 = q[16] ^ dq[16] */
+
+ asm volatile("psraw $4, %xmm3");
+ asm volatile("pand %xmm7, %xmm6"); /* low nibbles */
+ asm volatile("pand %xmm7, %xmm3"); /* high nibbles */
+ asm volatile("pshufb %xmm6, %xmm0");
+ asm volatile("pshufb %xmm3, %xmm1");
+ asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); /* fresh table copies for the second 16 bytes */
+ asm volatile("pxor %xmm0, %xmm1");
+ asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16]));
+
+ /* xmm1 = qmul[q[0] ^ dq[0]] */
+
+ asm volatile("psraw $4, %xmm4");
+ asm volatile("pand %xmm7, %xmm8");
+ asm volatile("pand %xmm7, %xmm4");
+ asm volatile("pshufb %xmm8, %xmm10");
+ asm volatile("pshufb %xmm4, %xmm11");
+ asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+ asm volatile("pxor %xmm10, %xmm11");
+ asm volatile("movdqa %0, %%xmm12" : : "m" (p[16]));
+
+ /* xmm11 = qmul[q[16] ^ dq[16]] */
+
+ asm volatile("pxor %xmm1, %xmm2");
+
+ /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */
+
+ asm volatile("pxor %xmm11, %xmm12");
+
+ /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */
+
+ asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); /* recovered data */
+ asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16]));
+
+ asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); /* updated P */
+ asm volatile("movdqa %%xmm12, %0" : "=m" (p[16]));
+
+ bytes -= 32;
+ p += 32;
+ q += 32;
+ dq += 32;
+
+#else
+ asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0]));
+ asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0]));
+ asm volatile("pxor %0, %%xmm3" : : "m" (q[0]));
+ asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16]));
+
+ /* xmm3 = *q ^ *dq */
+
+ asm volatile("movdqa %xmm3, %xmm6");
+ asm volatile("movdqa %0, %%xmm2" : : "m" (p[0]));
+ asm volatile("psraw $4, %xmm3");
+ asm volatile("pand %xmm7, %xmm6"); /* low nibbles */
+ asm volatile("pand %xmm7, %xmm3"); /* high nibbles */
+ asm volatile("pshufb %xmm6, %xmm0");
+ asm volatile("pshufb %xmm3, %xmm1");
+ asm volatile("pxor %xmm0, %xmm1");
+
+ /* xmm1 = qmul[*q ^ *dq] */
+
+ asm volatile("pxor %xmm1, %xmm2");
+
+ /* xmm2 = *p ^ qmul[*q ^ *dq] */
+
+ asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); /* recovered data */
+ asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); /* updated P */
+
+ bytes -= 16;
+ p += 16;
+ q += 16;
+ dq += 16;
+#endif
+ }
+
+ kernel_fpu_end();
+}
+
+const struct raid6_recov_calls raid6_recov_ssse3 = {
+ /* x86-64 builds process two 16-byte vectors per pass, other builds one */
+#ifdef CONFIG_X86_64
+ .name = "ssse3x2",
+#else
+ .name = "ssse3x1",
+#endif
+ .priority = 1,
+ .valid = raid6_has_ssse3,
+ .data2 = raid6_2data_recov_ssse3,
+ .datap = raid6_datap_recov_ssse3,
+};
diff --git a/lib/raid/raid6/x86/sse1.c b/lib/raid/raid6/x86/sse1.c
new file mode 100644
index 000000000000..deecdd72ceec
--- /dev/null
+++ b/lib/raid/raid6/x86/sse1.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6/sse1.c
+ *
+ * SSE-1/MMXEXT implementation of RAID-6 syndrome functions
+ *
+ * This is really an MMX implementation, but it requires SSE-1 or
+ * AMD MMXEXT for prefetch support and a few other features. The
+ * support for nontemporal memory accesses is enough to make this
+ * worthwhile as a separate implementation.
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+/* Defined in raid6/mmx.c; this file only declares it */
+extern const struct raid6_mmx_constants {
+ u64 x1d; /* 8-byte mask loaded into mm0 by the gen_syndrome loops below */
+} raid6_mmx_constants;
+
+static int raid6_have_sse1_or_mmxext(void)
+{
+ /* Nonzero when MMX plus either SSE or MMXEXT is reported. Not really boot_cpu but "all_cpus" */
+ return boot_cpu_has(X86_FEATURE_MMX) && /* the loops below work in MMX registers */
+ (boot_cpu_has(X86_FEATURE_XMM) || /* either feature supplies prefetch and movntq */
+ boot_cpu_has(X86_FEATURE_MMXEXT));
+}
+
+/*
+ * Plain SSE1 implementation: builds P and Q 8 bytes per pass in MMX registers
+ */
+static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* ptrs[] is indexed as data blocks 0..z0, then P at z0+1 and Q at z0+2 */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+ /* All MMX register use below sits between kernel_fpu_begin() and kernel_fpu_end() */
+ kernel_fpu_begin();
+ /* mm0 = x1d mask (never written again), mm5 = zero scratch */
+ asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
+ asm volatile("pxor %mm5,%mm5"); /* Zero temp */
+
+ for ( d = 0 ; d < bytes ; d += 8 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+ asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); /* NOTE(review): assumes z0 >= 1 - confirm */
+ asm volatile("movq %mm2,%mm4"); /* Q[0] */
+ asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); /* mm6 = block folded in one step later */
+ for ( z = z0-2 ; z >= 0 ; z-- ) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("pcmpgtb %mm4,%mm5"); /* mm5 = 0xff for each Q byte with its top bit set */
+ asm volatile("paddb %mm4,%mm4"); /* Q <<= 1 within each byte */
+ asm volatile("pand %mm0,%mm5"); /* keep the x1d byte only where the top bit was set */
+ asm volatile("pxor %mm5,%mm4"); /* fold it back in: Q is now doubled */
+ asm volatile("pxor %mm5,%mm5"); /* re-zero the scratch register */
+ asm volatile("pxor %mm6,%mm2"); /* P ^= pending data */
+ asm volatile("pxor %mm6,%mm4"); /* Q ^= pending data */
+ asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); /* load block z for the next step */
+ }
+ asm volatile("pcmpgtb %mm4,%mm5"); /* final step: same sequence for the last block held in mm6 */
+ asm volatile("paddb %mm4,%mm4");
+ asm volatile("pand %mm0,%mm5");
+ asm volatile("pxor %mm5,%mm4");
+ asm volatile("pxor %mm5,%mm5");
+ asm volatile("pxor %mm6,%mm2");
+ asm volatile("pxor %mm6,%mm4");
+ /* write the finished 8 bytes of P and Q with non-temporal stores */
+ asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
+ asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
+ }
+ /* fence the weakly-ordered movntq stores before leaving the FPU section */
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse1x1 = {
+ raid6_sse11_gen_syndrome, /* P/Q generation routine */
+ NULL, /* XOR not yet implemented */
+ raid6_have_sse1_or_mmxext, /* CPU feature check */
+ "sse1x1", /* name: SSE1, not unrolled */
+ 1 /* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 SSE1 implementation: two independent 8-byte lanes, 16 bytes per pass
+ */
+static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* ptrs[] is indexed as data blocks 0..z0, then P at z0+1 and Q at z0+2 */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* mm0 = x1d mask; mm5 and mm7 = zero scratch for lane 0 and lane 1 */
+ asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
+ asm volatile("pxor %mm5,%mm5"); /* Zero temp */
+ asm volatile("pxor %mm7,%mm7"); /* Zero temp */
+
+ /* We uniformly assume a single prefetch covers at least 16 bytes */
+ for ( d = 0 ; d < bytes ; d += 16 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+ asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
+ asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */
+ asm volatile("movq %mm2,%mm4"); /* Q[0] */
+ asm volatile("movq %mm3,%mm6"); /* Q[1] */
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("pcmpgtb %mm4,%mm5"); /* scratch = 0xff for each Q byte with its top bit set */
+ asm volatile("pcmpgtb %mm6,%mm7");
+ asm volatile("paddb %mm4,%mm4"); /* Q <<= 1 within each byte */
+ asm volatile("paddb %mm6,%mm6");
+ asm volatile("pand %mm0,%mm5"); /* keep the x1d byte only where the top bit was set */
+ asm volatile("pand %mm0,%mm7");
+ asm volatile("pxor %mm5,%mm4"); /* fold it back in: Q is now doubled */
+ asm volatile("pxor %mm7,%mm6");
+ asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); /* scratch registers now carry block z */
+ asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
+ asm volatile("pxor %mm5,%mm2"); /* P ^= data */
+ asm volatile("pxor %mm7,%mm3");
+ asm volatile("pxor %mm5,%mm4"); /* Q ^= data */
+ asm volatile("pxor %mm7,%mm6");
+ asm volatile("pxor %mm5,%mm5"); /* back to zero for the next compare */
+ asm volatile("pxor %mm7,%mm7");
+ }
+ asm volatile("movntq %%mm2,%0" : "=m" (p[d])); /* non-temporal stores of both lanes */
+ asm volatile("movntq %%mm3,%0" : "=m" (p[d+8]));
+ asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
+ asm volatile("movntq %%mm6,%0" : "=m" (q[d+8]));
+ }
+ /* fence the weakly-ordered movntq stores before leaving the FPU section */
+ asm volatile("sfence" : :: "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse1x2 = {
+ raid6_sse12_gen_syndrome, /* P/Q generation routine */
+ NULL, /* XOR not yet implemented */
+ raid6_have_sse1_or_mmxext, /* CPU feature check */
+ "sse1x2", /* name: SSE1, unrolled by 2 */
+ 1 /* Has cache hints */
+};
diff --git a/lib/raid/raid6/x86/sse2.c b/lib/raid/raid6/x86/sse2.c
new file mode 100644
index 000000000000..f9edf8a8d1c4
--- /dev/null
+++ b/lib/raid/raid6/x86/sse2.c
@@ -0,0 +1,480 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6/sse2.c
+ *
+ * SSE-2 implementation of RAID-6 syndrome functions
+ *
+ */
+
+#include <linux/raid/pq.h>
+#include <asm/fpu/api.h>
+
+static const struct raid6_sse_constants {
+ u64 x1d[2]; /* 16 bytes of 0x1d, loaded into xmm0 by every routine below */
+} raid6_sse_constants __attribute__((aligned(16))) = { /* movdqa needs a 16-byte aligned operand */
+ { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
+};
+
+static int raid6_have_sse2(void)
+{
+ /* Nonzero only when all four features are reported. Not really boot_cpu but "all_cpus" */
+ return boot_cpu_has(X86_FEATURE_MMX) &&
+ boot_cpu_has(X86_FEATURE_FXSR) &&
+ boot_cpu_has(X86_FEATURE_XMM) && /* SSE */
+ boot_cpu_has(X86_FEATURE_XMM2); /* SSE2 */
+}
+
+/*
+ * Plain SSE2 implementation: builds P and Q 16 bytes per pass in XMM registers
+ */
+static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* ptrs[] is indexed as data blocks 0..z0, then P at z0+1 and Q at z0+2 */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* xmm0 = 0x1d in every byte (never written again), xmm5 = zero scratch */
+ asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+ asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
+
+ for ( d = 0 ; d < bytes ; d += 16 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+ asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); /* NOTE(review): assumes z0 >= 1 - confirm */
+ asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
+ asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); /* xmm6 = block folded in one step later */
+ for ( z = z0-2 ; z >= 0 ; z-- ) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* xmm5 = 0xff for each Q byte with its top bit set */
+ asm volatile("paddb %xmm4,%xmm4"); /* Q <<= 1 within each byte */
+ asm volatile("pand %xmm0,%xmm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pxor %xmm5,%xmm4"); /* fold it back in: Q is now doubled */
+ asm volatile("pxor %xmm5,%xmm5"); /* re-zero the scratch register */
+ asm volatile("pxor %xmm6,%xmm2"); /* P ^= pending data */
+ asm volatile("pxor %xmm6,%xmm4"); /* Q ^= pending data */
+ asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); /* load block z for the next step */
+ }
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* final step: same sequence for the last block held in xmm6 */
+ asm volatile("paddb %xmm4,%xmm4");
+ asm volatile("pand %xmm0,%xmm5");
+ asm volatile("pxor %xmm5,%xmm4");
+ asm volatile("pxor %xmm5,%xmm5");
+ asm volatile("pxor %xmm6,%xmm2");
+ asm volatile("pxor %xmm6,%xmm4");
+ /* non-temporal stores; xmm2/xmm4 are cleared although the next pass reloads them */
+ asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
+ asm volatile("pxor %xmm2,%xmm2");
+ asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+ asm volatile("pxor %xmm4,%xmm4");
+ }
+ /* fence the weakly-ordered movntdq stores before leaving the FPU section */
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+/* XOR-update variant: folds data blocks start..stop into the existing P and Q, 16 bytes per pass */
+static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* blocks above stop are never read; P and Q are the last two entries of ptrs[] */
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* xmm0 = 0x1d in every byte, used as the mask when doubling Q */
+ asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+
+ for ( d = 0 ; d < bytes ; d += 16 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); /* Q delta starts as block z0 */
+ asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); /* current P */
+ asm volatile("pxor %xmm4,%xmm2"); /* P ^= block z0 */
+ /* P/Q data pages */
+ for ( z = z0-1 ; z >= start ; z-- ) {
+ asm volatile("pxor %xmm5,%xmm5"); /* zero the scratch before the compare */
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* xmm5 = 0xff for each byte with its top bit set */
+ asm volatile("paddb %xmm4,%xmm4"); /* shift each byte left by one */
+ asm volatile("pand %xmm0,%xmm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pxor %xmm5,%xmm4"); /* Q delta is now doubled */
+ asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); /* xmm5 reused to hold block z */
+ asm volatile("pxor %xmm5,%xmm2"); /* P ^= data */
+ asm volatile("pxor %xmm5,%xmm4"); /* Q delta ^= data */
+ }
+ /* P/Q left side optimization */
+ for ( z = start-1 ; z >= 0 ; z-- ) { /* below start: only keep doubling, no data is read */
+ asm volatile("pxor %xmm5,%xmm5");
+ asm volatile("pcmpgtb %xmm4,%xmm5");
+ asm volatile("paddb %xmm4,%xmm4");
+ asm volatile("pand %xmm0,%xmm5");
+ asm volatile("pxor %xmm5,%xmm4");
+ }
+ asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); /* merge the delta into the existing Q */
+ /* Don't use movntdq for r/w memory area < cache line */
+ asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
+ asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
+ }
+ /* NOTE(review): sfence is issued although this routine uses only movdqa stores */
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse2x1 = {
+ raid6_sse21_gen_syndrome, /* P/Q generation routine */
+ raid6_sse21_xor_syndrome, /* P/Q XOR-update routine */
+ raid6_have_sse2, /* CPU feature check */
+ "sse2x1", /* name: SSE2, not unrolled */
+ 1 /* Has cache hints */
+};
+
+/*
+ * Unrolled-by-2 SSE2 implementation: two independent 16-byte lanes, 32 bytes per pass
+ */
+static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* ptrs[] is indexed as data blocks 0..z0, then P at z0+1 and Q at z0+2 */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* xmm0 = 0x1d in every byte; xmm5 and xmm7 = zero scratch for lane 0 and lane 1 */
+ asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+ asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
+ asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
+
+ /* We uniformly assume a single prefetch covers at least 32 bytes */
+ for ( d = 0 ; d < bytes ; d += 32 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
+ asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
+ asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
+ asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
+ asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* scratch = 0xff for each Q byte with its top bit set */
+ asm volatile("pcmpgtb %xmm6,%xmm7");
+ asm volatile("paddb %xmm4,%xmm4"); /* Q <<= 1 within each byte */
+ asm volatile("paddb %xmm6,%xmm6");
+ asm volatile("pand %xmm0,%xmm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pand %xmm0,%xmm7");
+ asm volatile("pxor %xmm5,%xmm4"); /* fold it back in: Q is now doubled */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); /* scratch registers now carry block z */
+ asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
+ asm volatile("pxor %xmm5,%xmm2"); /* P ^= data */
+ asm volatile("pxor %xmm7,%xmm3");
+ asm volatile("pxor %xmm5,%xmm4"); /* Q ^= data */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("pxor %xmm5,%xmm5"); /* back to zero for the next compare */
+ asm volatile("pxor %xmm7,%xmm7");
+ }
+ asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); /* non-temporal stores of both lanes */
+ asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
+ asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+ asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
+ }
+ /* fence the weakly-ordered movntdq stores before leaving the FPU section */
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+/* XOR-update variant, unrolled by 2: folds data blocks start..stop into P and Q, 32 bytes per pass */
+static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* blocks above stop are never read; P and Q are the last two entries of ptrs[] */
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* xmm0 = 0x1d in every byte, used as the mask when doubling Q */
+ asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
+
+ for ( d = 0 ; d < bytes ; d += 32 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); /* Q deltas start as block z0 */
+ asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
+ asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); /* current P, both lanes */
+ asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
+ asm volatile("pxor %xmm4,%xmm2"); /* P ^= block z0 */
+ asm volatile("pxor %xmm6,%xmm3");
+ /* P/Q data pages */
+ for ( z = z0-1 ; z >= start ; z-- ) {
+ asm volatile("pxor %xmm5,%xmm5"); /* zero the scratch registers before the compare */
+ asm volatile("pxor %xmm7,%xmm7");
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* scratch = 0xff for each byte with its top bit set */
+ asm volatile("pcmpgtb %xmm6,%xmm7");
+ asm volatile("paddb %xmm4,%xmm4"); /* shift each byte left by one */
+ asm volatile("paddb %xmm6,%xmm6");
+ asm volatile("pand %xmm0,%xmm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pand %xmm0,%xmm7");
+ asm volatile("pxor %xmm5,%xmm4"); /* Q deltas are now doubled */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); /* scratch reused to hold block z */
+ asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
+ asm volatile("pxor %xmm5,%xmm2"); /* P ^= data */
+ asm volatile("pxor %xmm7,%xmm3");
+ asm volatile("pxor %xmm5,%xmm4"); /* Q delta ^= data */
+ asm volatile("pxor %xmm7,%xmm6");
+ }
+ /* P/Q left side optimization */
+ for ( z = start-1 ; z >= 0 ; z-- ) { /* below start: only keep doubling, no data is read */
+ asm volatile("pxor %xmm5,%xmm5");
+ asm volatile("pxor %xmm7,%xmm7");
+ asm volatile("pcmpgtb %xmm4,%xmm5");
+ asm volatile("pcmpgtb %xmm6,%xmm7");
+ asm volatile("paddb %xmm4,%xmm4");
+ asm volatile("paddb %xmm6,%xmm6");
+ asm volatile("pand %xmm0,%xmm5");
+ asm volatile("pand %xmm0,%xmm7");
+ asm volatile("pxor %xmm5,%xmm4");
+ asm volatile("pxor %xmm7,%xmm6");
+ }
+ asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); /* merge the deltas into the existing Q */
+ asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
+ /* Don't use movntdq for r/w memory area < cache line */
+ asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
+ asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
+ asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
+ asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
+ }
+ /* NOTE(review): sfence is issued although this routine uses only movdqa stores */
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+
+const struct raid6_calls raid6_sse2x2 = {
+ raid6_sse22_gen_syndrome, /* P/Q generation routine */
+ raid6_sse22_xor_syndrome, /* P/Q XOR-update routine */
+ raid6_have_sse2, /* CPU feature check */
+ "sse2x2", /* name: SSE2, unrolled by 2 */
+ 1 /* Has cache hints */
+};
+
+#ifdef CONFIG_X86_64
+
+/*
+ * Unrolled-by-4 SSE2 implementation: four 16-byte lanes, 64 bytes per pass (uses xmm10-xmm15)
+ */
+static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* ptrs[] is indexed as data blocks 0..z0, then P at z0+1 and Q at z0+2 */
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* xmm0 = 0x1d in every byte; every accumulator and scratch register starts at zero */
+ asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
+ asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
+ asm volatile("pxor %xmm3,%xmm3"); /* P[1] */
+ asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
+ asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
+ asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */
+ asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
+ asm volatile("pxor %xmm10,%xmm10"); /* P[2] */
+ asm volatile("pxor %xmm11,%xmm11"); /* P[3] */
+ asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */
+ asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */
+ asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */
+ asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */
+ /* accumulators are zero on entry to each pass, so the z loop can start at z0 itself */
+ for ( d = 0 ; d < bytes ; d += 64 ) { /* NOTE(review): d is int, bytes is size_t */
+ for ( z = z0 ; z >= 0 ; z-- ) {
+ /* The second prefetch seems to improve performance... */
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* scratch = 0xff for each Q byte with its top bit set */
+ asm volatile("pcmpgtb %xmm6,%xmm7");
+ asm volatile("pcmpgtb %xmm12,%xmm13");
+ asm volatile("pcmpgtb %xmm14,%xmm15");
+ asm volatile("paddb %xmm4,%xmm4"); /* Q <<= 1 within each byte */
+ asm volatile("paddb %xmm6,%xmm6");
+ asm volatile("paddb %xmm12,%xmm12");
+ asm volatile("paddb %xmm14,%xmm14");
+ asm volatile("pand %xmm0,%xmm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pand %xmm0,%xmm7");
+ asm volatile("pand %xmm0,%xmm13");
+ asm volatile("pand %xmm0,%xmm15");
+ asm volatile("pxor %xmm5,%xmm4"); /* fold it back in: Q is now doubled */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("pxor %xmm13,%xmm12");
+ asm volatile("pxor %xmm15,%xmm14");
+ asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); /* scratch registers now carry block z */
+ asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
+ asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
+ asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
+ asm volatile("pxor %xmm5,%xmm2"); /* P ^= data */
+ asm volatile("pxor %xmm7,%xmm3");
+ asm volatile("pxor %xmm13,%xmm10");
+ asm volatile("pxor %xmm15,%xmm11");
+ asm volatile("pxor %xmm5,%xmm4"); /* Q ^= data */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("pxor %xmm13,%xmm12");
+ asm volatile("pxor %xmm15,%xmm14");
+ asm volatile("pxor %xmm5,%xmm5"); /* back to zero for the next compare */
+ asm volatile("pxor %xmm7,%xmm7");
+ asm volatile("pxor %xmm13,%xmm13");
+ asm volatile("pxor %xmm15,%xmm15");
+ }
+ asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); /* store, then clear each accumulator for the next pass */
+ asm volatile("pxor %xmm2,%xmm2");
+ asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
+ asm volatile("pxor %xmm3,%xmm3");
+ asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
+ asm volatile("pxor %xmm10,%xmm10");
+ asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
+ asm volatile("pxor %xmm11,%xmm11");
+ asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+ asm volatile("pxor %xmm4,%xmm4");
+ asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
+ asm volatile("pxor %xmm6,%xmm6");
+ asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
+ asm volatile("pxor %xmm12,%xmm12");
+ asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
+ asm volatile("pxor %xmm14,%xmm14");
+ }
+ /* fence the weakly-ordered movntdq stores before leaving the FPU section */
+ asm volatile("sfence" : : : "memory");
+ kernel_fpu_end();
+}
+/* XOR-update variant, unrolled by 4: folds data blocks start..stop into P and Q, 64 bytes per pass */
+static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
+ size_t bytes, void **ptrs)
+{
+ u8 **dptr = (u8 **)ptrs;
+ u8 *p, *q;
+ int d, z, z0;
+ /* blocks above stop are never read; P and Q are the last two entries of ptrs[] */
+ z0 = stop; /* P/Q right side optimization */
+ p = dptr[disks-2]; /* XOR parity */
+ q = dptr[disks-1]; /* RS syndrome */
+
+ kernel_fpu_begin();
+ /* xmm0 = 0x1d in every byte, used as the mask when doubling Q */
+ asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
+
+ for ( d = 0 ; d < bytes ; d += 64 ) { /* NOTE(review): d is int, bytes is size_t */
+ asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); /* Q deltas start as block z0 */
+ asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
+ asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
+ asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
+ asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); /* current P, four lanes */
+ asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
+ asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
+ asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
+ asm volatile("pxor %xmm4,%xmm2"); /* P ^= block z0 */
+ asm volatile("pxor %xmm6,%xmm3");
+ asm volatile("pxor %xmm12,%xmm10");
+ asm volatile("pxor %xmm14,%xmm11");
+ /* P/Q data pages */
+ for ( z = z0-1 ; z >= start ; z-- ) {
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+ asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
+ asm volatile("pxor %xmm5,%xmm5"); /* zero the scratch registers before the compare */
+ asm volatile("pxor %xmm7,%xmm7");
+ asm volatile("pxor %xmm13,%xmm13");
+ asm volatile("pxor %xmm15,%xmm15");
+ asm volatile("pcmpgtb %xmm4,%xmm5"); /* scratch = 0xff for each byte with its top bit set */
+ asm volatile("pcmpgtb %xmm6,%xmm7");
+ asm volatile("pcmpgtb %xmm12,%xmm13");
+ asm volatile("pcmpgtb %xmm14,%xmm15");
+ asm volatile("paddb %xmm4,%xmm4"); /* shift each byte left by one */
+ asm volatile("paddb %xmm6,%xmm6");
+ asm volatile("paddb %xmm12,%xmm12");
+ asm volatile("paddb %xmm14,%xmm14");
+ asm volatile("pand %xmm0,%xmm5"); /* keep 0x1d only where the top bit was set */
+ asm volatile("pand %xmm0,%xmm7");
+ asm volatile("pand %xmm0,%xmm13");
+ asm volatile("pand %xmm0,%xmm15");
+ asm volatile("pxor %xmm5,%xmm4"); /* Q deltas are now doubled */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("pxor %xmm13,%xmm12");
+ asm volatile("pxor %xmm15,%xmm14");
+ asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); /* scratch reused to hold block z */
+ asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
+ asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
+ asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
+ asm volatile("pxor %xmm5,%xmm2"); /* P ^= data */
+ asm volatile("pxor %xmm7,%xmm3");
+ asm volatile("pxor %xmm13,%xmm10");
+ asm volatile("pxor %xmm15,%xmm11");
+ asm volatile("pxor %xmm5,%xmm4"); /* Q delta ^= data */
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("pxor %xmm13,%xmm12");
+ asm volatile("pxor %xmm15,%xmm14");
+ }
+ asm volatile("prefetchnta %0" :: "m" (q[d])); /* start fetching Q; it is read after the loop below */
+ asm volatile("prefetchnta %0" :: "m" (q[d+32]));
+ /* P/Q left side optimization */
+ for ( z = start-1 ; z >= 0 ; z-- ) { /* below start: only keep doubling, no data is read */
+ asm volatile("pxor %xmm5,%xmm5");
+ asm volatile("pxor %xmm7,%xmm7");
+ asm volatile("pxor %xmm13,%xmm13");
+ asm volatile("pxor %xmm15,%xmm15");
+ asm volatile("pcmpgtb %xmm4,%xmm5");
+ asm volatile("pcmpgtb %xmm6,%xmm7");
+ asm volatile("pcmpgtb %xmm12,%xmm13");
+ asm volatile("pcmpgtb %xmm14,%xmm15");
+ asm volatile("paddb %xmm4,%xmm4");
+ asm volatile("paddb %xmm6,%xmm6");
+ asm volatile("paddb %xmm12,%xmm12");
+ asm volatile("paddb %xmm14,%xmm14");
+ asm volatile("pand %xmm0,%xmm5");
+ asm volatile("pand %xmm0,%xmm7");
+ asm volatile("pand %xmm0,%xmm13");
+ asm volatile("pand %xmm0,%xmm15");
+ asm volatile("pxor %xmm5,%xmm4");
+ asm volatile("pxor %xmm7,%xmm6");
+ asm volatile("pxor %xmm13,%xmm12");
+ asm volatile("pxor %xmm15,%xmm14");
+ }
+ asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); /* NOTE(review): non-temporal stores to blocks read above, unlike the x1/x2 variants */
+ asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
+ asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
+ asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
+ asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); /* merge the deltas into the existing Q */
+ asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
+ asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
+ asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
+ asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
+ asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
+ asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
+ asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
+ }
+ asm volatile("sfence" : : : "memory"); /* fence the weakly-ordered movntdq stores */
+ kernel_fpu_end();
+}
+
+
+const struct raid6_calls raid6_sse2x4 = {
+ raid6_sse24_gen_syndrome, /* P/Q generation routine */
+ raid6_sse24_xor_syndrome, /* P/Q XOR-update routine */
+ raid6_have_sse2, /* CPU feature check */
+ "sse2x4", /* name: SSE2, unrolled by 4 */
+ 1 /* Has cache hints */
+};
+
+#endif /* CONFIG_X86_64 */