diff options
| author | Christoph Hellwig <hch@lst.de> | 2026-05-18 07:17:46 +0200 |
|---|---|---|
| committer | Andrew Morton <akpm@linux-foundation.org> | 2026-05-28 21:24:53 -0700 |
| commit | 3626738bc7147d52cb49f3994a9846aa2d34810a (patch) | |
| tree | 683cf2b4f8028ae1d915dcd5a549047d0971e311 /lib/raid/raid6 | |
| parent | 3d6beb659ddf0664612bafacb0bd9030ba6ec7e6 (diff) | |
| download | lwn-3626738bc7147d52cb49f3994a9846aa2d34810a.tar.gz lwn-3626738bc7147d52cb49f3994a9846aa2d34810a.zip | |
raid6: move to lib/raid/
Move the raid6 code to live in lib/raid/ with the XOR code, and change the
internal organization so that each architecture has a subdirectory similar
to the CRC, crypto and XOR libraries, and fix up the Makefile to only
build files actually needed.
Also move the kunit test case from the history test/ subdirectory to
tests/ and use the normal naming scheme for it.
Link: https://lore.kernel.org/20260518051804.462141-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Ard Biesheuvel <ardb@kernel.org> # kunit only on arm64
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Alexandre Ghiti <alex@ghiti.fr>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Borislav Petkov (AMD)" <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Mason <clm@fb.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Nan <linan122@huawei.com>
Cc: Madhavan Srinivasan <maddy@linux.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Song Liu <song@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'lib/raid/raid6')
31 files changed, 7161 insertions, 0 deletions
diff --git a/lib/raid/raid6/.gitignore b/lib/raid/raid6/.gitignore new file mode 100644 index 000000000000..6be57745afd1 --- /dev/null +++ b/lib/raid/raid6/.gitignore @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-only +mktables +altivec*.c +int*.c +tables.c +neon?.c +s390vx?.c +vpermxor*.c diff --git a/lib/raid/raid6/Makefile b/lib/raid/raid6/Makefile new file mode 100644 index 000000000000..7cb31b8a5c17 --- /dev/null +++ b/lib/raid/raid6/Makefile @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I $(src) + +obj-$(CONFIG_RAID6_PQ) += raid6_pq.o tests/ + +raid6_pq-y += algos.o tables.o + +# generic integer generation and recovery implementation +raid6_pq-y += int1.o int2.o int4.o int8.o +raid6_pq-y += recov.o + +# architecture-specific generation and recovery implementations: +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += arm/neon.o \ + arm/neon1.o \ + arm/neon2.o \ + arm/neon4.o \ + arm/neon8.o \ + arm/recov_neon.o \ + arm/recov_neon_inner.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch/loongarch_simd.o \ + loongarch/recov_loongarch_simd.o +raid6_pq-$(CONFIG_ALTIVEC) += powerpc/altivec1.o \ + powerpc/altivec2.o \ + powerpc/altivec4.o \ + powerpc/altivec8.o \ + powerpc/vpermxor1.o \ + powerpc/vpermxor2.o \ + powerpc/vpermxor4.o \ + powerpc/vpermxor8.o +raid6_pq-$(CONFIG_RISCV_ISA_V) += riscv/rvv.o \ + riscv/recov_rvv.o +raid6_pq-$(CONFIG_S390) += s390/s390vx8.o \ + s390/recov_s390xc.o +ifeq ($(CONFIG_X86),y) +raid6_pq-$(CONFIG_X86_32) += x86/mmx.o \ + x86/sse1.o +endif +raid6_pq-$(CONFIG_X86) += x86/sse2.o \ + x86/avx2.o \ + x86/avx512.o \ + x86/recov_ssse3.o \ + x86/recov_avx2.o \ + x86/recov_avx512.o + +hostprogs += mktables + +CFLAGS_arm/neon1.o += $(CC_FLAGS_FPU) +CFLAGS_arm/neon2.o += $(CC_FLAGS_FPU) +CFLAGS_arm/neon4.o += $(CC_FLAGS_FPU) +CFLAGS_arm/neon8.o += $(CC_FLAGS_FPU) +CFLAGS_arm/recov_neon_inner.o += $(CC_FLAGS_FPU) +CFLAGS_REMOVE_arm/neon1.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_arm/neon2.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_arm/neon4.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_arm/neon8.o += $(CC_FLAGS_NO_FPU) +CFLAGS_REMOVE_arm/recov_neon_inner.o += $(CC_FLAGS_NO_FPU) + +ifeq ($(CONFIG_ALTIVEC),y) +altivec_flags := -maltivec $(call cc-option,-mabi=altivec) +# Enable <altivec.h> +altivec_flags += -isystem $(shell $(CC) -print-file-name=include) + +CFLAGS_powerpc/altivec1.o += $(altivec_flags) +CFLAGS_powerpc/altivec2.o += $(altivec_flags) +CFLAGS_powerpc/altivec4.o += $(altivec_flags) +CFLAGS_powerpc/altivec8.o += $(altivec_flags) +CFLAGS_powerpc/vpermxor1.o += $(altivec_flags) +CFLAGS_powerpc/vpermxor2.o += $(altivec_flags) +CFLAGS_powerpc/vpermxor4.o += $(altivec_flags) +CFLAGS_powerpc/vpermxor8.o += $(altivec_flags) + +ifdef CONFIG_CC_IS_CLANG +# clang ppc port does not yet support -maltivec when -msoft-float is +# enabled. A future release of clang will resolve this +# https://llvm.org/pr31177 +CFLAGS_REMOVE_powerpc/altivec1.o += -msoft-float +CFLAGS_REMOVE_powerpc/altivec2.o += -msoft-float +CFLAGS_REMOVE_powerpc/altivec4.o += -msoft-float +CFLAGS_REMOVE_powerpc/altivec8.o += -msoft-float +CFLAGS_REMOVE_powerpc/vpermxor1.o += -msoft-float +CFLAGS_REMOVE_powerpc/vpermxor2.o += -msoft-float +CFLAGS_REMOVE_powerpc/vpermxor4.o += -msoft-float +CFLAGS_REMOVE_powerpc/vpermxor8.o += -msoft-float +endif # CONFIG_CC_IS_CLANG +endif # CONFIG_ALTIVEC + +quiet_cmd_mktable = TABLE $@ + cmd_mktable = $(obj)/mktables > $@ + +targets += tables.c +$(obj)/tables.c: $(obj)/mktables FORCE + $(call if_changed,mktable) + +quiet_cmd_unroll = UNROLL $@ + cmd_unroll = $(AWK) -v N=$* -f $(src)/unroll.awk < $< > $@ + +targets += int1.c int2.c int4.c int8.c +$(obj)/int%.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += arm/neon1.c arm/neon2.c arm/neon4.c arm/neon8.c +$(obj)/arm/neon%.c: $(src)/arm/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += powerpc/altivec1.c \ + powerpc/altivec2.c \ + powerpc/altivec4.c \ + powerpc/altivec8.c +$(obj)/powerpc/altivec%.c: $(src)/powerpc/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += powerpc/vpermxor1.c \ + powerpc/vpermxor2.c \ + powerpc/vpermxor4.c \ + powerpc/vpermxor8.c +$(obj)/powerpc/vpermxor%.c: $(src)/powerpc/vpermxor.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += s390/s390vx8.c +$(obj)/s390/s390vx%.c: $(src)/s390/s390vx.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) diff --git a/lib/raid/raid6/algos.c b/lib/raid/raid6/algos.c new file mode 100644 index 000000000000..985c60bb00a4 --- /dev/null +++ b/lib/raid/raid6/algos.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/algos.c + * + * Algorithm list and algorithm selection for RAID-6 + */ + +#include <linux/raid/pq.h> +#include <linux/module.h> +#include <linux/gfp.h> +#include <kunit/visibility.h> + +struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); + +const struct raid6_calls * const raid6_algos[] = { +#if defined(__i386__) && !defined(__arch_um__) + &raid6_avx512x2, + &raid6_avx512x1, + &raid6_avx2x2, + &raid6_avx2x1, + &raid6_sse2x2, + &raid6_sse2x1, + &raid6_sse1x2, + &raid6_sse1x1, + &raid6_mmxx2, + &raid6_mmxx1, +#endif +#if defined(__x86_64__) && !defined(__arch_um__) + &raid6_avx512x4, + &raid6_avx512x2, + &raid6_avx512x1, + &raid6_avx2x4, + &raid6_avx2x2, + &raid6_avx2x1, + &raid6_sse2x4, + &raid6_sse2x2, + &raid6_sse2x1, +#endif +#ifdef CONFIG_ALTIVEC + &raid6_vpermxor8, + &raid6_vpermxor4, + &raid6_vpermxor2, + &raid6_vpermxor1, + &raid6_altivec8, + &raid6_altivec4, + &raid6_altivec2, + &raid6_altivec1, +#endif +#if defined(CONFIG_S390) + &raid6_s390vx8, +#endif +#ifdef CONFIG_KERNEL_MODE_NEON + &raid6_neonx8, + &raid6_neonx4, + &raid6_neonx2, + &raid6_neonx1, +#endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_lsx, +#endif +#endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_rvvx1, + &raid6_rvvx2, + &raid6_rvvx4, + &raid6_rvvx8, +#endif + &raid6_intx8, + &raid6_intx4, + &raid6_intx2, + &raid6_intx1, + NULL +}; +EXPORT_SYMBOL_IF_KUNIT(raid6_algos); + +void (*raid6_2data_recov)(int, size_t, int, int, void **); +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +void (*raid6_datap_recov)(int, size_t, int, void **); +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +const struct raid6_recov_calls *const raid6_recov_algos[] = { +#ifdef CONFIG_X86 + &raid6_recov_avx512, + &raid6_recov_avx2, + &raid6_recov_ssse3, +#endif +#ifdef CONFIG_S390 + &raid6_recov_s390xc, +#endif +#if defined(CONFIG_KERNEL_MODE_NEON) + &raid6_recov_neon, +#endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_recov_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_recov_lsx, +#endif +#endif +#ifdef CONFIG_RISCV_ISA_V + &raid6_recov_rvv, +#endif + &raid6_recov_intx1, + NULL +}; +EXPORT_SYMBOL_IF_KUNIT(raid6_recov_algos); + +#define RAID6_TIME_JIFFIES_LG2 4 +#define RAID6_TEST_DISKS 8 +#define RAID6_TEST_DISKS_ORDER 3 + +static inline const struct raid6_recov_calls *raid6_choose_recov(void) +{ + const struct raid6_recov_calls *const *algo; + const struct raid6_recov_calls *best; + + for (best = NULL, algo = raid6_recov_algos; *algo; algo++) + if (!best || (*algo)->priority > best->priority) + if (!(*algo)->valid || (*algo)->valid()) + best = *algo; + + if (best) { + raid6_2data_recov = best->data2; + raid6_datap_recov = best->datap; + + pr_info("raid6: using %s recovery algorithm\n", best->name); + } else + pr_err("raid6: Yikes! No recovery algorithm found!\n"); + + return best; +} + +static inline const struct raid6_calls *raid6_choose_gen( + void *(*const dptrs)[RAID6_TEST_DISKS], const int disks) +{ + unsigned long perf, bestgenperf, j0, j1; + int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ + const struct raid6_calls *const *algo; + const struct raid6_calls *best; + + for (bestgenperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + if (!best || (*algo)->priority >= best->priority) { + if ((*algo)->valid && !(*algo)->valid()) + continue; + + if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + best = *algo; + break; + } + + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { + (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + if (perf > bestgenperf) { + bestgenperf = perf; + best = *algo; + } + pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, + (perf * HZ * (disks-2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); + } + } + + if (!best) { + pr_err("raid6: Yikes! No algorithm found!\n"); + goto out; + } + + raid6_call = *best; + + if (!IS_ENABLED(CONFIG_RAID6_PQ_BENCHMARK)) { + pr_info("raid6: skipped pq benchmark and selected %s\n", + best->name); + goto out; + } + + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", + best->name, + (bestgenperf * HZ * (disks - 2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2)); + + if (best->xor_syndrome) { + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1 << RAID6_TIME_JIFFIES_LG2))) { + best->xor_syndrome(disks, start, stop, + PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", + (perf * HZ * (disks - 2)) >> + (20 - PAGE_SHIFT + RAID6_TIME_JIFFIES_LG2 + 1)); + } + +out: + return best; +} + + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +static int __init raid6_select_algo(void) +{ + const int disks = RAID6_TEST_DISKS; + + const struct raid6_calls *gen_best; + const struct raid6_recov_calls *rec_best; + char *disk_ptr, *p; + void *dptrs[RAID6_TEST_DISKS]; + int i, cycle; + + /* prepare the buffer and fill it circularly with gfmul table */ + disk_ptr = (char *)__get_free_pages(GFP_KERNEL, RAID6_TEST_DISKS_ORDER); + if (!disk_ptr) { + pr_err("raid6: Yikes! No memory available.\n"); + return -ENOMEM; + } + + p = disk_ptr; + for (i = 0; i < disks; i++) + dptrs[i] = p + PAGE_SIZE * i; + + cycle = ((disks - 2) * PAGE_SIZE) / 65536; + for (i = 0; i < cycle; i++) { + memcpy(p, raid6_gfmul, 65536); + p += 65536; + } + + if ((disks - 2) * PAGE_SIZE % 65536) + memcpy(p, raid6_gfmul, (disks - 2) * PAGE_SIZE % 65536); + + /* select raid gen_syndrome function */ + gen_best = raid6_choose_gen(&dptrs, disks); + + /* select raid recover functions */ + rec_best = raid6_choose_recov(); + + free_pages((unsigned long)disk_ptr, RAID6_TEST_DISKS_ORDER); + + return gen_best && rec_best ? 0 : -EINVAL; +} + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); diff --git a/lib/raid/raid6/arm/neon.c b/lib/raid/raid6/arm/neon.c new file mode 100644 index 000000000000..47b8bb0afc65 --- /dev/null +++ b/lib/raid/raid6/arm/neon.c @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + */ + +#include <linux/raid/pq.h> +#include <asm/simd.h> + +/* + * There are 2 reasons these wrappers are kept in a separate compilation unit + * from the actual implementations in neonN.c (generated from neon.uc by + * unroll.awk): + * - the actual implementations use NEON intrinsics, and the GCC support header + * (arm_neon.h) is not fully compatible (type wise) with the kernel; + * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, + * and we have to make sure that we never use *any* NEON/VFP instructions + * outside a kernel_neon_begin()/kernel_neon_end() pair. + */ + +#define RAID6_NEON_WRAPPER(_n) \ + static void raid6_neon ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_neon ## _n ## _gen_syndrome_real(int, \ + unsigned long, void**); \ + scoped_ksimd() \ + raid6_neon ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + } \ + static void raid6_neon ## _n ## _xor_syndrome(int disks, \ + int start, int stop, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_neon ## _n ## _xor_syndrome_real(int, \ + int, int, unsigned long, void**); \ + scoped_ksimd() \ + raid6_neon ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs);\ + } \ + struct raid6_calls const raid6_neonx ## _n = { \ + raid6_neon ## _n ## _gen_syndrome, \ + raid6_neon ## _n ## _xor_syndrome, \ + raid6_have_neon, \ + "neonx" #_n, \ + 0 \ + } + +static int raid6_have_neon(void) +{ + return cpu_has_neon(); +} + +RAID6_NEON_WRAPPER(1); +RAID6_NEON_WRAPPER(2); +RAID6_NEON_WRAPPER(4); +RAID6_NEON_WRAPPER(8); diff --git a/lib/raid/raid6/arm/neon.h b/lib/raid/raid6/arm/neon.h new file mode 100644 index 000000000000..2ca41ee9b499 --- /dev/null +++ b/lib/raid/raid6/arm/neon.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + + diff --git a/lib/raid/raid6/arm/neon.uc b/lib/raid/raid6/arm/neon.uc new file mode 100644 index 000000000000..14a9fc2c60fa --- /dev/null +++ b/lib/raid/raid6/arm/neon.uc @@ -0,0 +1,153 @@ +/* ----------------------------------------------------------------------- + * + * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions + * + * Copyright (C) 2012 Rob Herring + * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * Based on altivec.uc: + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * neon$#.c + * + * $#-way unrolled NEON intrinsics math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <arm_neon.h> +#include "arm/neon.h" + +typedef uint8x16_t unative_t; + +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline unative_t SHLBYTE(unative_t v) +{ + return vshlq_n_u8(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline unative_t MASK(unative_t v) +{ + return (unative_t)vshrq_n_s8((int8x16_t)v, 7); +} + +static inline unative_t PMUL(unative_t v, unative_t u) +{ + return (unative_t)vmulq_p8((poly8x16_t)v, (poly8x16_t)u); +} + +void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + const unative_t x1d = vdupq_n_u8(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); + wp$$ = veorq_u8(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + w1$$ = veorq_u8(w1$$, w2$$); + wq$$ = veorq_u8(w1$$, wd$$); + } + vst1q_u8(&p[d+NSIZE*$$], wp$$); + vst1q_u8(&q[d+NSIZE*$$], wq$$); + } +} + +void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + const unative_t x1d = vdupq_n_u8(0x1d); + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); + wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$); + + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); + wp$$ = veorq_u8(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + w1$$ = veorq_u8(w1$$, w2$$); + wq$$ = veorq_u8(w1$$, wd$$); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 3 ; z -= 4 ) { + w2$$ = vshrq_n_u8(wq$$, 4); + w1$$ = vshlq_n_u8(wq$$, 4); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + } + + switch (z) { + case 2: + w2$$ = vshrq_n_u8(wq$$, 5); + w1$$ = vshlq_n_u8(wq$$, 3); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + break; + case 1: + w2$$ = vshrq_n_u8(wq$$, 6); + w1$$ = vshlq_n_u8(wq$$, 2); + + w2$$ = PMUL(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + break; + case 0: + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + wq$$ = veorq_u8(w1$$, w2$$); + } + w1$$ = vld1q_u8(&q[d+NSIZE*$$]); + wq$$ = veorq_u8(wq$$, w1$$); + + vst1q_u8(&p[d+NSIZE*$$], wp$$); + vst1q_u8(&q[d+NSIZE*$$], wq$$); + } +} diff --git a/lib/raid/raid6/arm/recov_neon.c b/lib/raid/raid6/arm/recov_neon.c new file mode 100644 index 000000000000..5a48fcc762e8 --- /dev/null +++ b/lib/raid/raid6/arm/recov_neon.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <linux/raid/pq.h> +#include <asm/simd.h> +#include "arm/neon.h" + +static int raid6_has_neon(void) +{ + return cpu_has_neon(); +} + +static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + scoped_ksimd() + __raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul); +} + +static void raid6_datap_recov_neon(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + scoped_ksimd() + __raid6_datap_recov_neon(bytes, p, q, dq, qmul); +} + +const struct raid6_recov_calls raid6_recov_neon = { + .data2 = raid6_2data_recov_neon, + .datap = raid6_datap_recov_neon, + .valid = raid6_has_neon, + .name = "neon", + .priority = 10, +}; diff --git a/lib/raid/raid6/arm/recov_neon_inner.c b/lib/raid/raid6/arm/recov_neon_inner.c new file mode 100644 index 000000000000..53c355efa7ff --- /dev/null +++ b/lib/raid/raid6/arm/recov_neon_inner.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Intel Corporation + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + */ + +#include <arm_neon.h> +#include "arm/neon.h" + +#ifdef CONFIG_ARM +/* + * AArch32 does not provide this intrinsic natively because it does not + * implement the underlying instruction. AArch32 only provides a 64-bit + * wide vtbl.8 instruction, so use that instead. + */ +static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b) +{ + union { + uint8x16_t val; + uint8x8x2_t pair; + } __a = { a }; + + return vcombine_u8(vtbl2_u8(__a.pair, vget_low_u8(b)), + vtbl2_u8(__a.pair, vget_high_u8(b))); +} +#endif + +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul) +{ + uint8x16_t pm0 = vld1q_u8(pbmul); + uint8x16_t pm1 = vld1q_u8(pbmul + 16); + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + uint8x16_t x0f = vdupq_n_u8(0x0f); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy, px, qx, db; + + px = veorq_u8(vld1q_u8(p), vld1q_u8(dp)); + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = vshrq_n_u8(vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vy); + qx = veorq_u8(vx, vy); + + vy = vshrq_n_u8(px, 4); + vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f)); + vy = vqtbl1q_u8(pm1, vy); + vx = veorq_u8(vx, vy); + db = veorq_u8(vx, qx); + + vst1q_u8(dq, db); + vst1q_u8(dp, veorq_u8(db, px)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul) +{ + uint8x16_t qm0 = vld1q_u8(qmul); + uint8x16_t qm1 = vld1q_u8(qmul + 16); + uint8x16_t x0f = vdupq_n_u8(0x0f); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + + while (bytes) { + uint8x16_t vx, vy; + + vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq)); + + vy = vshrq_n_u8(vx, 4); + vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f)); + vy = vqtbl1q_u8(qm1, vy); + vx = veorq_u8(vx, vy); + vy = veorq_u8(vx, vld1q_u8(p)); + + vst1q_u8(dq, vx); + vst1q_u8(p, vy); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} diff --git a/lib/raid/raid6/int.uc b/lib/raid/raid6/int.uc new file mode 100644 index 000000000000..1ba56c3fa482 --- /dev/null +++ b/lib/raid/raid6/int.uc @@ -0,0 +1,147 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * int$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <linux/raid/pq.h> + +/* + * This is the C data type to use + */ + +/* Change this from BITS_PER_LONG if there is something better... */ +#if BITS_PER_LONG == 64 +# define NBYTES(x) ((x) * 0x0101010101010101UL) +# define NSIZE 8 +# define NSHIFT 3 +# define NSTRING "64" +typedef u64 unative_t; +#else +# define NBYTES(x) ((x) * 0x01010101U) +# define NSIZE 4 +# define NSHIFT 2 +# define NSTRING "32" +typedef u32 unative_t; +#endif + + + +/* + * These sub-operations are separate inlines since they can sometimes be + * specially optimized using architecture-specific hacks. + */ + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + unative_t vv; + + vv = (v << 1) & NBYTES(0xfe); + return vv; +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t vv; + + vv = v & NBYTES(0x80); + vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ + return vv; +} + + +static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_int$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + /* P/Q data pages */ + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= start ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + wq$$ = w1$$ ^ w2$$; + } + *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + } + +} + +const struct raid6_calls raid6_intx$# = { + raid6_int$#_gen_syndrome, + raid6_int$#_xor_syndrome, + NULL, /* always valid */ + "int" NSTRING "x$#", + 0 +}; diff --git a/lib/raid/raid6/loongarch/loongarch_simd.c b/lib/raid/raid6/loongarch/loongarch_simd.c new file mode 100644 index 000000000000..72f4d92d4876 --- /dev/null +++ b/lib/raid/raid6/loongarch/loongarch_simd.c @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX) + * + * Copyright 2023 WANG Xuerui <git@xen0n.name> + * + * Based on the generic RAID-6 code (int.uc): + * + * Copyright 2002-2004 H. Peter Anvin + */ + +#include <linux/raid/pq.h> +#include <asm/cpu-features.h> +#include <asm/fpu.h> + +/* + * The vector algorithms are currently priority 0, which means the generic + * scalar algorithms are not being disabled if vector support is present. + * This is like the similar LoongArch RAID5 XOR code, with the main reason + * repeated here: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. + */ + +#ifdef CONFIG_CPU_HAS_LSX +#define NSIZE 16 + +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1])); + asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2])); + asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1])); + asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2])); + asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3])); + } + + kernel_fpu_end(); +} + +static void raid6_lsx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr12"); + asm volatile("vxor.v $vr5, $vr17, $vr13"); + asm volatile("vxor.v $vr6, $vr18, $vr14"); + asm volatile("vxor.v $vr7, $vr19, $vr15"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "vld $vr20, %0\n\t" + "vld $vr21, %1\n\t" + "vld $vr22, %2\n\t" + "vld $vr23, %3\n\t" + "vld $vr24, %4\n\t" + "vld $vr25, %5\n\t" + "vld $vr26, %6\n\t" + "vld $vr27, %7\n\t" + "vxor.v $vr20, $vr20, $vr0\n\t" + "vxor.v $vr21, $vr21, $vr1\n\t" + "vxor.v $vr22, $vr22, $vr2\n\t" + "vxor.v $vr23, $vr23, $vr3\n\t" + "vxor.v $vr24, $vr24, $vr4\n\t" + "vxor.v $vr25, $vr25, $vr5\n\t" + "vxor.v $vr26, $vr26, $vr6\n\t" + "vxor.v $vr27, $vr27, $vr7\n\t" + "vst $vr20, %0\n\t" + "vst $vr21, %1\n\t" + "vst $vr22, %2\n\t" + "vst $vr23, %3\n\t" + "vst $vr24, %4\n\t" + "vst $vr25, %5\n\t" + "vst $vr26, %6\n\t" + "vst $vr27, %7\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]), + "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lsx = { + raid6_lsx_gen_syndrome, + raid6_lsx_xor_syndrome, + raid6_has_lsx, + "lsx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; + +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +#define NSIZE 32 + +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1])); + } + + kernel_fpu_end(); +} + +static void raid6_lasx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr6"); + asm volatile("xvxor.v $xr3, $xr9, $xr7"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "xvld $xr10, %0\n\t" + "xvld $xr11, %1\n\t" + "xvld $xr12, %2\n\t" + "xvld $xr13, %3\n\t" + "xvxor.v $xr10, $xr10, $xr0\n\t" + "xvxor.v $xr11, $xr11, $xr1\n\t" + "xvxor.v $xr12, $xr12, $xr2\n\t" + "xvxor.v $xr13, $xr13, $xr3\n\t" + "xvst $xr10, %0\n\t" + "xvst $xr11, %1\n\t" + "xvst $xr12, %2\n\t" + "xvst $xr13, %3\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lasx = { + raid6_lasx_gen_syndrome, + raid6_lasx_xor_syndrome, + raid6_has_lasx, + "lasx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid/raid6/loongarch/recov_loongarch_simd.c b/lib/raid/raid6/loongarch/recov_loongarch_simd.c new file mode 100644 index 000000000000..eb3a1e79f01f --- /dev/null +++ b/lib/raid/raid6/loongarch/recov_loongarch_simd.c @@ -0,0 +1,514 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX) + * + * Copyright (C) 2023 WANG Xuerui <git@xen0n.name> + * + * Originally based on recov_avx2.c and recov_ssse3.c: + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + */ + +#include <linux/raid/pq.h> +#include <asm/cpu-features.h> +#include <asm/fpu.h> + +/* + * Unlike with the syndrome calculation algorithms, there's no boot-time + * selection of recovery algorithms by benchmarking, so we have to specify + * the priorities and hope the future cores will all have decent vector + * support (i.e. no LASX slower than LSX, or even scalar code). + */ + +#ifdef CONFIG_CPU_HAS_LSX +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * vr20, vr21: qmul + * vr22, vr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + + while (bytes) { + /* vr4 - vr7: Q */ + asm volatile("vld $vr4, %0" : : "m" (q[0])); + asm volatile("vld $vr5, %0" : : "m" (q[16])); + asm volatile("vld $vr6, %0" : : "m" (q[32])); + asm volatile("vld $vr7, %0" : : "m" (q[48])); + /* vr4 - vr7: Q + Qxy */ + asm volatile("vld $vr8, %0" : : "m" (dq[0])); + asm volatile("vld $vr9, %0" : : "m" (dq[16])); + asm volatile("vld $vr10, %0" : : "m" (dq[32])); + asm volatile("vld $vr11, %0" : : "m" (dq[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + /* vr0 - vr3: P */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr0 - vr3: P + Pxy */ + asm volatile("vld $vr8, %0" : : "m" (dp[0])); + asm volatile("vld $vr9, %0" : : "m" (dp[16])); + asm volatile("vld $vr10, %0" : : "m" (dp[32])); + asm volatile("vld $vr11, %0" : : "m" (dp[48])); + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4"); + asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5"); + asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6"); + asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8"); + asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9"); + asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10"); + asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11"); + /* vr16 - vr19: B(Q + Qxy) */ + asm volatile("vxor.v $vr16, $vr8, $vr4"); + asm volatile("vxor.v $vr17, $vr9, $vr5"); + asm volatile("vxor.v $vr18, $vr10, $vr6"); + asm volatile("vxor.v $vr19, $vr11, $vr7"); + + /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("vsrli.b $vr4, $vr0, 4"); + asm volatile("vsrli.b $vr5, $vr1, 4"); + asm volatile("vsrli.b $vr6, $vr2, 4"); + asm volatile("vsrli.b $vr7, $vr3, 4"); + /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("vandi.b $vr12, $vr0, 0x0f"); + asm volatile("vandi.b $vr13, $vr1, 0x0f"); + asm volatile("vandi.b $vr14, $vr2, 0x0f"); + asm volatile("vandi.b $vr15, $vr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12"); + asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13"); + asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14"); + asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15"); + /* lookup from pbmul[16] */ + asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4"); + asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5"); + asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6"); + asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7"); + /* vr4 - vr7: A(P + Pxy) */ + asm volatile("vxor.v $vr4, $vr4, $vr12"); + asm volatile("vxor.v $vr5, $vr5, $vr13"); + asm volatile("vxor.v $vr6, $vr6, $vr14"); + asm volatile("vxor.v $vr7, $vr7, $vr15"); + + /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr16"); + asm volatile("vxor.v $vr5, $vr5, $vr17"); + asm volatile("vxor.v $vr6, $vr6, $vr18"); + asm volatile("vxor.v $vr7, $vr7, $vr19"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Pxy + Dx = Dy */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (dp[0])); + asm volatile("vst $vr1, %0" : "=m" (dp[16])); + asm volatile("vst $vr2, %0" : "=m" (dp[32])); + asm volatile("vst $vr3, %0" : "=m" (dp[48])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* vr22, vr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + + while (bytes) { + /* vr0 - vr3: P + Dx */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr4 - vr7: Qx */ + asm volatile("vld $vr4, %0" : : "m" (dq[0])); + asm volatile("vld $vr5, %0" : : "m" (dq[16])); + asm volatile("vld $vr6, %0" : : "m" (dq[32])); + asm volatile("vld $vr7, %0" : : "m" (dq[48])); + /* vr4 - vr7: Q + Qx */ + asm volatile("vld $vr8, %0" : : "m" (q[0])); + asm volatile("vld $vr9, %0" : : "m" (q[16])); + asm volatile("vld $vr10, %0" : : "m" (q[32])); + asm volatile("vld $vr11, %0" : : "m" (q[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4"); + asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5"); + asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6"); + asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8"); + asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9"); + asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10"); + asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11"); + /* vr4 - vr7: qmul(Q + Qx) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Dx + Dx = P */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (p[0])); + asm volatile("vst $vr1, %0" : "=m" (p[16])); + asm volatile("vst $vr2, %0" : "=m" (p[32])); + asm volatile("vst $vr3, %0" : "=m" (p[48])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lsx = { + .data2 = raid6_2data_recov_lsx, + .datap = raid6_datap_recov_lsx, + .valid = raid6_has_lsx, + .name = "lsx", + .priority = 1, +}; +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * xr20, xr21: qmul + * xr22, xr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + asm volatile("xvreplve0.q $xr20, $xr20"); + asm volatile("xvreplve0.q $xr21, $xr21"); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: Q */ + asm volatile("xvld $xr0, %0" : : "m" (q[0])); + asm volatile("xvld $xr1, %0" : : "m" (q[32])); + /* xr0, xr1: Q + Qxy */ + asm volatile("xvld $xr4, %0" : : "m" (dq[0])); + asm volatile("xvld $xr5, %0" : : "m" (dq[32])); + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* xr2, xr3: P */ + asm volatile("xvld $xr2, %0" : : "m" (p[0])); + asm volatile("xvld $xr3, %0" : : "m" (p[32])); + /* xr2, xr3: P + Pxy */ + asm volatile("xvld $xr4, %0" : : "m" (dp[0])); + asm volatile("xvld $xr5, %0" : : "m" (dp[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvsrli.b $xr4, $xr0, 4"); + asm volatile("xvsrli.b $xr5, $xr1, 4"); + /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvandi.b $xr0, $xr0, 0x0f"); + asm volatile("xvandi.b $xr1, $xr1, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0"); + asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4"); + asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5"); + /* xr6, xr7: B(Q + Qxy) */ + asm volatile("xvxor.v $xr6, $xr4, $xr0"); + asm volatile("xvxor.v $xr7, $xr5, $xr1"); + + /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("xvandi.b $xr0, $xr2, 0x0f"); + asm volatile("xvandi.b $xr1, $xr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0"); + asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1"); + /* lookup from pbmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr0, xr1: A(P + Pxy) */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + + /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("xvxor.v $xr0, $xr0, $xr6"); + asm volatile("xvxor.v $xr1, $xr1, $xr7"); + + /* xr2, xr3: P + Pxy + Dx = Dy */ + asm volatile("xvxor.v $xr2, $xr2, $xr0"); + asm volatile("xvxor.v $xr3, $xr3, $xr1"); + + asm volatile("xvst $xr0, %0" : "=m" (dq[0])); + asm volatile("xvst $xr1, %0" : "=m" (dq[32])); + asm volatile("xvst $xr2, %0" : "=m" (dp[0])); + asm volatile("xvst $xr3, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* xr22, xr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: P + Dx */ + asm volatile("xvld $xr0, %0" : : "m" (p[0])); + asm volatile("xvld $xr1, %0" : : "m" (p[32])); + /* xr2, xr3: Qx */ + asm volatile("xvld $xr2, %0" : : "m" (dq[0])); + asm volatile("xvld $xr3, %0" : : "m" (dq[32])); + /* xr2, xr3: Q + Qx */ + asm volatile("xvld $xr4, %0" : : "m" (q[0])); + asm volatile("xvld $xr5, %0" : : "m" (q[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("xvandi.b $xr2, $xr2, 0x0f"); + asm volatile("xvandi.b $xr3, $xr3, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2"); + asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr2, xr3: qmul(Q + Qx) = Dx */ + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr0, xr1: P + Dx + Dx = P */ + asm volatile("xvxor.v $xr0, $xr0, $xr2"); + asm volatile("xvxor.v $xr1, $xr1, $xr3"); + + asm volatile("xvst $xr2, %0" : "=m" (dq[0])); + asm volatile("xvst $xr3, %0" : "=m" (dq[32])); + asm volatile("xvst $xr0, %0" : "=m" (p[0])); + asm volatile("xvst $xr1, %0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lasx = { + .data2 = raid6_2data_recov_lasx, + .datap = raid6_datap_recov_lasx, + .valid = raid6_has_lasx, + .name = "lasx", + .priority = 2, +}; +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid/raid6/mktables.c b/lib/raid/raid6/mktables.c new file mode 100644 index 000000000000..3de1dbf6846c --- /dev/null +++ b/lib/raid/raid6/mktables.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * mktables.c + * + * Make RAID-6 tables. This is a host user space program to be run at + * compile time. + */ + +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <stdlib.h> +#include <time.h> + +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int main(int argc, char *argv[]) +{ + int i, j, k; + uint8_t v; + uint8_t exptbl[256], invtbl[256]; + + printf("#include <linux/export.h>\n"); + printf("#include <linux/raid/pq.h>\n"); + + /* Compute multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfmul[256][256] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 256; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + + /* Compute vector multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_vgfmul[256][32] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, (j + k) << 4), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); + + /* Compute power-of-2 table (exponent) */ + v = 1; + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexp[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + exptbl[i + j] = v; + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + } + printf("};\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + + /* Compute log-of-2 table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gflog[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + v = 255; + for (k = 0; k < 256; k++) + if (exptbl[k] == (i + j)) { + v = k; + break; + } + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + printf("EXPORT_SYMBOL(raid6_gflog);\n"); + + /* Compute inverse table x^-1 == x^254 */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfinv[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + invtbl[i + j] = v = gfpow(i + j, 254); + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexi[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) + printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); + } + printf("};\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + + return 0; +} diff --git a/lib/raid/raid6/powerpc/altivec.uc b/lib/raid/raid6/powerpc/altivec.uc new file mode 100644 index 000000000000..130d3d3dd42c --- /dev/null +++ b/lib/raid/raid6/powerpc/altivec.uc @@ -0,0 +1,122 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6altivec$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + * + * <benh> hpa: in process, + * you can just "steal" the vec unit with enable_kernel_altivec() (but + * bracked this with preempt_disable/enable or in a lock) + */ + +#include <linux/raid/pq.h> + +#include <altivec.h> +#include <asm/cputable.h> +#include <asm/switch_to.h> + +/* + * This is the C data type to use. We use a vector of + * signed char so vec_cmpgt() will generate the right + * instruction. + */ + +typedef vector signed char unative_t; + +#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + return vec_add(v,v); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t zv = NBYTES(0); + + /* vec_cmpgt returns a vector bool char; thus the need for the cast */ + return (unative_t)vec_cmpgt(zv, v); +} + + +/* This is noinline to make damned sure that gcc doesn't move any of the + Altivec code around the enable/disable code */ +static void noinline +raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + unative_t x1d = NBYTES(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ = vec_xor(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ = vec_and(w2$$, x1d); + w1$$ = vec_xor(w1$$, w2$$); + wq$$ = vec_xor(w1$$, wd$$); + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + preempt_disable(); + enable_kernel_altivec(); + + raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); + + disable_kernel_altivec(); + preempt_enable(); +} + +int raid6_have_altivec(void); +#if $# == 1 +int raid6_have_altivec(void) +{ + /* This assumes either all CPUs have Altivec or none does */ + return cpu_has_feature(CPU_FTR_ALTIVEC); +} +#endif + +const struct raid6_calls raid6_altivec$# = { + raid6_altivec$#_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_altivec, + "altivecx$#", + 0 +}; diff --git a/lib/raid/raid6/powerpc/vpermxor.uc b/lib/raid/raid6/powerpc/vpermxor.uc new file mode 100644 index 000000000000..595f20aaf4cf --- /dev/null +++ b/lib/raid/raid6/powerpc/vpermxor.uc @@ -0,0 +1,95 @@ +/* + * Copyright 2017, Matt Brown, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * vpermxor$#.c + * + * Based on H. Peter Anvin's paper - The mathematics of RAID-6 + * + * $#-way unrolled portable integer math RAID-6 instruction set + * This file is postprocessed using unroll.awk + * + * vpermxor$#.c makes use of the vpermxor instruction to optimise the RAID6 Q + * syndrome calculations. + * This can be run on systems which have both Altivec and vpermxor instruction. + * + * This instruction was introduced in POWER8 - ISA v2.07. + */ + +#include <linux/raid/pq.h> +#include <altivec.h> +#include <asm/ppc-opcode.h> +#include <asm/cputable.h> +#include <asm/switch_to.h> + +typedef vector unsigned char unative_t; +#define NSIZE sizeof(unative_t) + +static const vector unsigned char gf_low = {0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, + 0x12, 0x10, 0x0e, 0x0c, 0x0a, 0x08, + 0x06, 0x04, 0x02,0x00}; +static const vector unsigned char gf_high = {0xfd, 0xdd, 0xbd, 0x9d, 0x7d, 0x5d, + 0x3d, 0x1d, 0xe0, 0xc0, 0xa0, 0x80, + 0x60, 0x40, 0x20, 0x00}; + +static void noinline raid6_vpermxor$#_gen_syndrome_real(int disks, size_t bytes, + void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + unative_t wp$$, wq$$, wd$$; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += NSIZE*$#) { + wp$$ = wq$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + + for (z = z0-1; z>=0; z--) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + /* P syndrome */ + wp$$ = vec_xor(wp$$, wd$$); + + /* Q syndrome */ + asm(VPERMXOR(%0,%1,%2,%3):"=v"(wq$$):"v"(gf_high), "v"(gf_low), "v"(wq$$)); + wq$$ = vec_xor(wq$$, wd$$); + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_vpermxor$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + preempt_disable(); + enable_kernel_altivec(); + + raid6_vpermxor$#_gen_syndrome_real(disks, bytes, ptrs); + + disable_kernel_altivec(); + preempt_enable(); +} + +int raid6_have_altivec_vpermxor(void); +#if $# == 1 +int raid6_have_altivec_vpermxor(void) +{ + /* Check if arch has both altivec and the vpermxor instructions */ + return (cpu_has_feature(CPU_FTR_ALTIVEC_COMP) && + cpu_has_feature(CPU_FTR_ARCH_207S)); +} +#endif + +const struct raid6_calls raid6_vpermxor$# = { + raid6_vpermxor$#_gen_syndrome, + NULL, + raid6_have_altivec_vpermxor, + "vpermxor$#", + 0 +}; diff --git a/lib/raid/raid6/recov.c b/lib/raid/raid6/recov.c new file mode 100644 index 000000000000..8d113196632e --- /dev/null +++ b/lib/raid/raid6/recov.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/recov.c + * + * RAID-6 data recovery in dual failure mode. In single failure mode, + * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct + * the syndrome.) + */ + +#include <linux/raid/pq.h> + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + u8 px, qx, db; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} + + +const struct raid6_recov_calls raid6_recov_intx1 = { + .data2 = raid6_2data_recov_intx1, + .datap = raid6_datap_recov_intx1, + .valid = NULL, + .name = "intx1", + .priority = 0, +}; diff --git a/lib/raid/raid6/riscv/recov_rvv.c b/lib/raid/raid6/riscv/recov_rvv.c new file mode 100644 index 000000000000..40c393206b6a --- /dev/null +++ b/lib/raid/raid6/riscv/recov_rvv.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + */ + +#include <linux/raid/pq.h> +#include "rvv.h" + +static void __raid6_2data_recov_rvv(int bytes, u8 *p, u8 *q, u8 *dp, + u8 *dq, const u8 *pbmul, + const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while ( bytes-- ) { + * uint8_t px, qx, db; + * + * px = *p ^ *dp; + * qx = qmul[*q ^ *dq]; + * *dq++ = db = pbmul[px] ^ qx; + * *dp++ = db ^ px; + * p++; q++; + * } + */ + while (bytes) { + /* + * v0:px, v1:dp, + * v2:qx, v3:dq, + * v4:vx, v5:vy, + * v6:qm0, v7:qm1, + * v8:pm0, v9:pm1, + * v14:p/qm[vx], v15:p/qm[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[px])\n" + "vle8.v v1, (%[dp])\n" + "vxor.vv v0, v0, v1\n" + "vle8.v v2, (%[qx])\n" + "vle8.v v3, (%[dq])\n" + "vxor.vv v4, v2, v3\n" + "vsrl.vi v5, v4, 4\n" + "vand.vi v4, v4, 0xf\n" + "vle8.v v6, (%[qm0])\n" + "vle8.v v7, (%[qm1])\n" + "vrgather.vv v14, v6, v4\n" /* v14 = qm[vx] */ + "vrgather.vv v15, v7, v5\n" /* v15 = qm[vy] */ + "vxor.vv v2, v14, v15\n" /* v2 = qmul[*q ^ *dq] */ + + "vsrl.vi v5, v0, 4\n" + "vand.vi v4, v0, 0xf\n" + "vle8.v v8, (%[pm0])\n" + "vle8.v v9, (%[pm1])\n" + "vrgather.vv v14, v8, v4\n" /* v14 = pm[vx] */ + "vrgather.vv v15, v9, v5\n" /* v15 = pm[vy] */ + "vxor.vv v4, v14, v15\n" /* v4 = pbmul[px] */ + "vxor.vv v3, v4, v2\n" /* v3 = db = pbmul[px] ^ qx */ + "vxor.vv v1, v3, v0\n" /* v1 = db ^ px; */ + "vse8.v v3, (%[dq])\n" + "vse8.v v1, (%[dp])\n" + ".option pop\n" + : : + [px]"r"(p), + [dp]"r"(dp), + [qx]"r"(q), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16), + [pm0]"r"(pbmul), + [pm1]"r"(pbmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; + } +} + +static void __raid6_datap_recov_rvv(int bytes, u8 *p, u8 *q, + u8 *dq, const u8 *qmul) +{ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli x0, %[avl], e8, m1, ta, ma\n" + ".option pop\n" + : : + [avl]"r"(16) + ); + + /* + * while (bytes--) { + * *p++ ^= *dq = qmul[*q ^ *dq]; + * q++; dq++; + * } + */ + while (bytes) { + /* + * v0:vx, v1:vy, + * v2:dq, v3:p, + * v4:qm0, v5:qm1, + * v10:m[vx], v11:m[vy] + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[vx])\n" + "vle8.v v2, (%[dq])\n" + "vxor.vv v0, v0, v2\n" + "vsrl.vi v1, v0, 4\n" + "vand.vi v0, v0, 0xf\n" + "vle8.v v4, (%[qm0])\n" + "vle8.v v5, (%[qm1])\n" + "vrgather.vv v10, v4, v0\n" + "vrgather.vv v11, v5, v1\n" + "vxor.vv v0, v10, v11\n" + "vle8.v v1, (%[vy])\n" + "vxor.vv v1, v0, v1\n" + "vse8.v v0, (%[dq])\n" + "vse8.v v1, (%[vy])\n" + ".option pop\n" + : : + [vx]"r"(q), + [vy]"r"(p), + [dq]"r"(dq), + [qm0]"r"(qmul), + [qm1]"r"(qmul + 16) + :); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; + } +} + +static void raid6_2data_recov_rvv(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_vector_begin(); + __raid6_2data_recov_rvv(bytes, p, q, dp, dq, pbmul, qmul); + kernel_vector_end(); +} + +static void raid6_datap_recov_rvv(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_vector_begin(); + __raid6_datap_recov_rvv(bytes, p, q, dq, qmul); + kernel_vector_end(); +} + +const struct raid6_recov_calls raid6_recov_rvv = { + .data2 = raid6_2data_recov_rvv, + .datap = raid6_datap_recov_rvv, + .valid = rvv_has_vector, + .name = "rvv", + .priority = 1, +}; diff --git a/lib/raid/raid6/riscv/rvv.c b/lib/raid/raid6/riscv/rvv.c new file mode 100644 index 000000000000..75c9dafedb28 --- /dev/null +++ b/lib/raid/raid6/riscv/rvv.c @@ -0,0 +1,1228 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID-6 syndrome calculation using RISC-V vector instructions + * + * Copyright 2024 Institute of Software, CAS. + * Author: Chunyan Zhang <zhangchunyan@iscas.ac.cn> + * + * Based on neon.uc: + * Copyright 2002-2004 H. Peter Anvin + */ + +#include "rvv.h" + +#ifdef __riscv_vector +#error "This code must be built without compiler support for vector" +#endif + +static void raid6_rvv1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0; d < bytes; d += nsize * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]) + ); + + for (z = z0 - 1 ; z >= 0 ; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]) + ); + } +} + +static void raid6_rvv1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 */ + for (d = 0 ; d < bytes ; d += nsize * 1) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]) + ); + } +} + +static void raid6_rvv2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += nsize * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]) + ); + } +} + +static void raid6_rvv2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + */ + for (d = 0; d < bytes; d += nsize * 2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]) + ); + } +} + +static void raid6_rvv4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += nsize * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]) + ); + } +} + +static void raid6_rvv4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + */ + for (d = 0; d < bytes; d += nsize * 4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]) + ); + } +} + +static void raid6_rvv8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += nsize * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + "vle8.v v16, (%[wp4])\n" + "vmv.v.v v17, v16\n" + "vle8.v v20, (%[wp5])\n" + "vmv.v.v v21, v20\n" + "vle8.v v24, (%[wp6])\n" + "vmv.v.v v25, v24\n" + "vle8.v v28, (%[wp7])\n" + "vmv.v.v v29, v28\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]), + [wp4]"r"(&dptr[z0][d + 4 * nsize]), + [wp5]"r"(&dptr[z0][d + 5 * nsize]), + [wp6]"r"(&dptr[z0][d + 6 * nsize]), + [wp7]"r"(&dptr[z0][d + 7 * nsize]) + ); + + for (z = z0 - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [wd4]"r"(&dptr[z][d + 4 * nsize]), + [wd5]"r"(&dptr[z][d + 5 * nsize]), + [wd6]"r"(&dptr[z][d + 6 * nsize]), + [wd7]"r"(&dptr[z][d + 7 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] = wp$$; + * *(unative_t *)&q[d+NSIZE*$$] = wq$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vse8.v v0, (%[wp0])\n" + "vse8.v v1, (%[wq0])\n" + "vse8.v v4, (%[wp1])\n" + "vse8.v v5, (%[wq1])\n" + "vse8.v v8, (%[wp2])\n" + "vse8.v v9, (%[wq2])\n" + "vse8.v v12, (%[wp3])\n" + "vse8.v v13, (%[wq3])\n" + "vse8.v v16, (%[wp4])\n" + "vse8.v v17, (%[wq4])\n" + "vse8.v v20, (%[wp5])\n" + "vse8.v v21, (%[wq5])\n" + "vse8.v v24, (%[wp6])\n" + "vse8.v v25, (%[wq6])\n" + "vse8.v v28, (%[wp7])\n" + "vse8.v v29, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]), + [wp4]"r"(&p[d + nsize * 4]), + [wq4]"r"(&q[d + nsize * 4]), + [wp5]"r"(&p[d + nsize * 5]), + [wq5]"r"(&q[d + nsize * 5]), + [wp6]"r"(&p[d + nsize * 6]), + [wq6]"r"(&q[d + nsize * 6]), + [wp7]"r"(&p[d + nsize * 7]), + [wq7]"r"(&q[d + nsize * 7]) + ); + } +} + +static void raid6_rvv8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + unsigned long vl, d, nsize; + int z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + asm volatile (".option push\n" + ".option arch,+v\n" + "vsetvli %0, x0, e8, m1, ta, ma\n" + ".option pop\n" + : "=&r" (vl) + ); + + nsize = vl; + + /* + * v0:wp0, v1:wq0, v2:wd0/w20, v3:w10 + * v4:wp1, v5:wq1, v6:wd1/w21, v7:w11 + * v8:wp2, v9:wq2, v10:wd2/w22, v11:w12 + * v12:wp3, v13:wq3, v14:wd3/w23, v15:w13 + * v16:wp4, v17:wq4, v18:wd4/w24, v19:w14 + * v20:wp5, v21:wq5, v22:wd5/w25, v23:w15 + * v24:wp6, v25:wq6, v26:wd6/w26, v27:w16 + * v28:wp7, v29:wq7, v30:wd7/w27, v31:w17 + */ + for (d = 0; d < bytes; d += nsize * 8) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v0, (%[wp0])\n" + "vmv.v.v v1, v0\n" + "vle8.v v4, (%[wp1])\n" + "vmv.v.v v5, v4\n" + "vle8.v v8, (%[wp2])\n" + "vmv.v.v v9, v8\n" + "vle8.v v12, (%[wp3])\n" + "vmv.v.v v13, v12\n" + "vle8.v v16, (%[wp4])\n" + "vmv.v.v v17, v16\n" + "vle8.v v20, (%[wp5])\n" + "vmv.v.v v21, v20\n" + "vle8.v v24, (%[wp6])\n" + "vmv.v.v v25, v24\n" + "vle8.v v28, (%[wp7])\n" + "vmv.v.v v29, v28\n" + ".option pop\n" + : : + [wp0]"r"(&dptr[z0][d + 0 * nsize]), + [wp1]"r"(&dptr[z0][d + 1 * nsize]), + [wp2]"r"(&dptr[z0][d + 2 * nsize]), + [wp3]"r"(&dptr[z0][d + 3 * nsize]), + [wp4]"r"(&dptr[z0][d + 4 * nsize]), + [wp5]"r"(&dptr[z0][d + 5 * nsize]), + [wp6]"r"(&dptr[z0][d + 6 * nsize]), + [wp7]"r"(&dptr[z0][d + 7 * nsize]) + ); + + /* P/Q data pages */ + for (z = z0 - 1; z >= start; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * w1$$ ^= w2$$; + * wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + * wq$$ = w1$$ ^ wd$$; + * wp$$ ^= wd$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v3, v3, v2\n" + "vle8.v v2, (%[wd0])\n" + "vxor.vv v1, v3, v2\n" + "vxor.vv v0, v0, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v7, v7, v6\n" + "vle8.v v6, (%[wd1])\n" + "vxor.vv v5, v7, v6\n" + "vxor.vv v4, v4, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v11, v11, v10\n" + "vle8.v v10, (%[wd2])\n" + "vxor.vv v9, v11, v10\n" + "vxor.vv v8, v8, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v15, v15, v14\n" + "vle8.v v14, (%[wd3])\n" + "vxor.vv v13, v15, v14\n" + "vxor.vv v12, v12, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v19, v19, v18\n" + "vle8.v v18, (%[wd4])\n" + "vxor.vv v17, v19, v18\n" + "vxor.vv v16, v16, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v23, v23, v22\n" + "vle8.v v22, (%[wd5])\n" + "vxor.vv v21, v23, v22\n" + "vxor.vv v20, v20, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v27, v27, v26\n" + "vle8.v v26, (%[wd6])\n" + "vxor.vv v25, v27, v26\n" + "vxor.vv v24, v24, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v31, v31, v30\n" + "vle8.v v30, (%[wd7])\n" + "vxor.vv v29, v31, v30\n" + "vxor.vv v28, v28, v30\n" + ".option pop\n" + : : + [wd0]"r"(&dptr[z][d + 0 * nsize]), + [wd1]"r"(&dptr[z][d + 1 * nsize]), + [wd2]"r"(&dptr[z][d + 2 * nsize]), + [wd3]"r"(&dptr[z][d + 3 * nsize]), + [wd4]"r"(&dptr[z][d + 4 * nsize]), + [wd5]"r"(&dptr[z][d + 5 * nsize]), + [wd6]"r"(&dptr[z][d + 6 * nsize]), + [wd7]"r"(&dptr[z][d + 7 * nsize]), + [x1d]"r"(0x1d) + ); + } + + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + /* + * w2$$ = MASK(wq$$); + * w1$$ = SHLBYTE(wq$$); + * w2$$ &= NBYTES(0x1d); + * wq$$ = w1$$ ^ w2$$; + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vsra.vi v2, v1, 7\n" + "vsll.vi v3, v1, 1\n" + "vand.vx v2, v2, %[x1d]\n" + "vxor.vv v1, v3, v2\n" + + "vsra.vi v6, v5, 7\n" + "vsll.vi v7, v5, 1\n" + "vand.vx v6, v6, %[x1d]\n" + "vxor.vv v5, v7, v6\n" + + "vsra.vi v10, v9, 7\n" + "vsll.vi v11, v9, 1\n" + "vand.vx v10, v10, %[x1d]\n" + "vxor.vv v9, v11, v10\n" + + "vsra.vi v14, v13, 7\n" + "vsll.vi v15, v13, 1\n" + "vand.vx v14, v14, %[x1d]\n" + "vxor.vv v13, v15, v14\n" + + "vsra.vi v18, v17, 7\n" + "vsll.vi v19, v17, 1\n" + "vand.vx v18, v18, %[x1d]\n" + "vxor.vv v17, v19, v18\n" + + "vsra.vi v22, v21, 7\n" + "vsll.vi v23, v21, 1\n" + "vand.vx v22, v22, %[x1d]\n" + "vxor.vv v21, v23, v22\n" + + "vsra.vi v26, v25, 7\n" + "vsll.vi v27, v25, 1\n" + "vand.vx v26, v26, %[x1d]\n" + "vxor.vv v25, v27, v26\n" + + "vsra.vi v30, v29, 7\n" + "vsll.vi v31, v29, 1\n" + "vand.vx v30, v30, %[x1d]\n" + "vxor.vv v29, v31, v30\n" + ".option pop\n" + : : + [x1d]"r"(0x1d) + ); + } + + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + * v0:wp0, v1:wq0, v2:p0, v3:q0 + * v4:wp1, v5:wq1, v6:p1, v7:q1 + * v8:wp2, v9:wq2, v10:p2, v11:q2 + * v12:wp3, v13:wq3, v14:p3, v15:q3 + * v16:wp4, v17:wq4, v18:p4, v19:q4 + * v20:wp5, v21:wq5, v22:p5, v23:q5 + * v24:wp6, v25:wq6, v26:p6, v27:q6 + * v28:wp7, v29:wq7, v30:p7, v31:q7 + */ + asm volatile (".option push\n" + ".option arch,+v\n" + "vle8.v v2, (%[wp0])\n" + "vle8.v v3, (%[wq0])\n" + "vxor.vv v2, v2, v0\n" + "vxor.vv v3, v3, v1\n" + "vse8.v v2, (%[wp0])\n" + "vse8.v v3, (%[wq0])\n" + + "vle8.v v6, (%[wp1])\n" + "vle8.v v7, (%[wq1])\n" + "vxor.vv v6, v6, v4\n" + "vxor.vv v7, v7, v5\n" + "vse8.v v6, (%[wp1])\n" + "vse8.v v7, (%[wq1])\n" + + "vle8.v v10, (%[wp2])\n" + "vle8.v v11, (%[wq2])\n" + "vxor.vv v10, v10, v8\n" + "vxor.vv v11, v11, v9\n" + "vse8.v v10, (%[wp2])\n" + "vse8.v v11, (%[wq2])\n" + + "vle8.v v14, (%[wp3])\n" + "vle8.v v15, (%[wq3])\n" + "vxor.vv v14, v14, v12\n" + "vxor.vv v15, v15, v13\n" + "vse8.v v14, (%[wp3])\n" + "vse8.v v15, (%[wq3])\n" + + "vle8.v v18, (%[wp4])\n" + "vle8.v v19, (%[wq4])\n" + "vxor.vv v18, v18, v16\n" + "vxor.vv v19, v19, v17\n" + "vse8.v v18, (%[wp4])\n" + "vse8.v v19, (%[wq4])\n" + + "vle8.v v22, (%[wp5])\n" + "vle8.v v23, (%[wq5])\n" + "vxor.vv v22, v22, v20\n" + "vxor.vv v23, v23, v21\n" + "vse8.v v22, (%[wp5])\n" + "vse8.v v23, (%[wq5])\n" + + "vle8.v v26, (%[wp6])\n" + "vle8.v v27, (%[wq6])\n" + "vxor.vv v26, v26, v24\n" + "vxor.vv v27, v27, v25\n" + "vse8.v v26, (%[wp6])\n" + "vse8.v v27, (%[wq6])\n" + + "vle8.v v30, (%[wp7])\n" + "vle8.v v31, (%[wq7])\n" + "vxor.vv v30, v30, v28\n" + "vxor.vv v31, v31, v29\n" + "vse8.v v30, (%[wp7])\n" + "vse8.v v31, (%[wq7])\n" + ".option pop\n" + : : + [wp0]"r"(&p[d + nsize * 0]), + [wq0]"r"(&q[d + nsize * 0]), + [wp1]"r"(&p[d + nsize * 1]), + [wq1]"r"(&q[d + nsize * 1]), + [wp2]"r"(&p[d + nsize * 2]), + [wq2]"r"(&q[d + nsize * 2]), + [wp3]"r"(&p[d + nsize * 3]), + [wq3]"r"(&q[d + nsize * 3]), + [wp4]"r"(&p[d + nsize * 4]), + [wq4]"r"(&q[d + nsize * 4]), + [wp5]"r"(&p[d + nsize * 5]), + [wq5]"r"(&q[d + nsize * 5]), + [wp6]"r"(&p[d + nsize * 6]), + [wq6]"r"(&q[d + nsize * 6]), + [wp7]"r"(&p[d + nsize * 7]), + [wq7]"r"(&q[d + nsize * 7]) + ); + } +} + +RAID6_RVV_WRAPPER(1); +RAID6_RVV_WRAPPER(2); +RAID6_RVV_WRAPPER(4); +RAID6_RVV_WRAPPER(8); diff --git a/lib/raid/raid6/riscv/rvv.h b/lib/raid/raid6/riscv/rvv.h new file mode 100644 index 000000000000..b0a71b375962 --- /dev/null +++ b/lib/raid/raid6/riscv/rvv.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright 2024 Institute of Software, CAS. + * + * raid6/rvv.h + * + * Definitions for RISC-V RAID-6 code + */ + +#include <linux/raid/pq.h> +#include <asm/vector.h> + +static int rvv_has_vector(void) +{ + return has_vector(); +} + +#define RAID6_RVV_WRAPPER(_n) \ + static void raid6_rvv ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _gen_syndrome_real(int d, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + static void raid6_rvv ## _n ## _xor_syndrome(int disks, \ + int start, int stop, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_rvv ## _n ## _xor_syndrome_real(int d, \ + int s1, int s2, \ + unsigned long b, void **p); \ + kernel_vector_begin(); \ + raid6_rvv ## _n ## _xor_syndrome_real(disks, \ + start, stop, (unsigned long)bytes, ptrs); \ + kernel_vector_end(); \ + } \ + struct raid6_calls const raid6_rvvx ## _n = { \ + raid6_rvv ## _n ## _gen_syndrome, \ + raid6_rvv ## _n ## _xor_syndrome, \ + rvv_has_vector, \ + "rvvx" #_n, \ + 0 \ + } diff --git a/lib/raid/raid6/s390/recov_s390xc.c b/lib/raid/raid6/s390/recov_s390xc.c new file mode 100644 index 000000000000..487018f81192 --- /dev/null +++ b/lib/raid/raid6/s390/recov_s390xc.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RAID-6 data recovery in dual failure mode based on the XC instruction. + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/raid/pq.h> + +static inline void xor_block(u8 *p1, u8 *p2) +{ + typedef struct { u8 _[256]; } addrtype; + + asm volatile( + " xc 0(256,%[p1]),0(%[p2])\n" + : "+m" (*(addrtype *) p1) : "m" (*(addrtype *) p2), + [p1] "a" (p1), [p2] "a" (p2) : "cc"); +} + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_s390xc(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dp, p); + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = pbmul[dp[i]] ^ qmul[dq[i]]; + xor_block(dp, dq); + p += 256; + q += 256; + dp += 256; + dq += 256; + bytes -= 256; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_s390xc(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + int i; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while (bytes) { + xor_block(dq, q); + for (i = 0; i < 256; i++) + dq[i] = qmul[dq[i]]; + xor_block(p, dq); + p += 256; + q += 256; + dq += 256; + bytes -= 256; + } +} + + +const struct raid6_recov_calls raid6_recov_s390xc = { + .data2 = raid6_2data_recov_s390xc, + .datap = raid6_datap_recov_s390xc, + .valid = NULL, + .name = "s390xc", + .priority = 1, +}; diff --git a/lib/raid/raid6/s390/s390vx.uc b/lib/raid/raid6/s390/s390vx.uc new file mode 100644 index 000000000000..8aa53eb2f395 --- /dev/null +++ b/lib/raid/raid6/s390/s390vx.uc @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * raid6_vx$#.c + * + * $#-way unrolled RAID6 gen/xor functions for s390 + * based on the vector facility + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + * + * This file is postprocessed using unroll.awk. + */ + +#include <linux/cpufeature.h> +#include <linux/raid/pq.h> +#include <asm/fpu.h> + +#define NSIZE 16 + +static __always_inline void LOAD_CONST(void) +{ + fpu_vrepib(24, 0x07); + fpu_vrepib(25, 0x1d); +} + +/* + * The SHLBYTE() operation shifts each of the 16 bytes in + * vector register y left by 1 bit and stores the result in + * vector register x. + */ +#define SHLBYTE(x, y) fpu_vab(x, y, y) + +/* + * For each of the 16 bytes in the vector register y the MASK() + * operation returns 0xFF if the high bit of the byte is 1, + * or 0x00 if the high bit is 0. The result is stored in vector + * register x. + */ +#define MASK(x, y) fpu_vesravb(x, y, 24) + +#define AND(x, y, z) fpu_vn(x, y, z) +#define XOR(x, y, z) fpu_vx(x, y, z) +#define LOAD_DATA(x, ptr) fpu_vlm(x, x + $# - 1, ptr) +#define STORE_DATA(x, ptr) fpu_vstm(x, x + $# - 1, ptr) +#define COPY_VEC(x, y) fpu_vlr(x, y) + +static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + u8 **dptr, *p, *q; + int d, z, z0; + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + dptr = (u8 **) ptrs; + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0 + 1]; /* XOR parity */ + q = dptr[z0 + 2]; /* RS syndrome */ + + for (d = 0; d < bytes; d += $#*NSIZE) { + LOAD_DATA(0,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + STORE_DATA(0,&p[d]); + STORE_DATA(8,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + DECLARE_KERNEL_FPU_ONSTACK32(vxstate); + u8 **dptr, *p, *q; + int d, z, z0; + + dptr = (u8 **) ptrs; + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks - 2]; /* XOR parity */ + q = dptr[disks - 1]; /* RS syndrome */ + + kernel_fpu_begin(&vxstate, KERNEL_VXR); + LOAD_CONST(); + + for (d = 0; d < bytes; d += $#*NSIZE) { + /* P/Q data pages */ + LOAD_DATA(0,&dptr[z0][d]); + COPY_VEC(8+$$,0+$$); + for (z = z0 - 1; z >= start; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + LOAD_DATA(16,&dptr[z][d]); + XOR(0+$$,0+$$,16+$$); + XOR(8+$$,8+$$,16+$$); + } + /* P/Q left side optimization */ + for (z = start - 1; z >= 0; z--) { + MASK(16+$$,8+$$); + AND(16+$$,16+$$,25); + SHLBYTE(8+$$,8+$$); + XOR(8+$$,8+$$,16+$$); + } + LOAD_DATA(16,&p[d]); + XOR(16+$$,16+$$,0+$$); + STORE_DATA(16,&p[d]); + LOAD_DATA(16,&q[d]); + XOR(16+$$,16+$$,8+$$); + STORE_DATA(16,&q[d]); + } + kernel_fpu_end(&vxstate, KERNEL_VXR); +} + +static int raid6_s390vx$#_valid(void) +{ + return cpu_has_vx(); +} + +const struct raid6_calls raid6_s390vx$# = { + raid6_s390vx$#_gen_syndrome, + raid6_s390vx$#_xor_syndrome, + raid6_s390vx$#_valid, + "vx128x$#", + 1 +}; diff --git a/lib/raid/raid6/tests/Makefile b/lib/raid/raid6/tests/Makefile new file mode 100644 index 000000000000..87a001b22847 --- /dev/null +++ b/lib/raid/raid6/tests/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_RAID6_PQ_KUNIT_TEST) += raid6_kunit.o diff --git a/lib/raid/raid6/tests/raid6_kunit.c b/lib/raid/raid6/tests/raid6_kunit.c new file mode 100644 index 000000000000..9db287b4a48f --- /dev/null +++ b/lib/raid/raid6/tests/raid6_kunit.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * Test RAID-6 recovery algorithms. + */ + +#include <kunit/test.h> +#include <linux/prandom.h> +#include <linux/raid/pq.h> + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + +#define RAID6_KUNIT_SEED 42 + +#define NDISKS 16 /* Including P and Q */ + +static struct rnd_state rng; +static void *dataptrs[NDISKS]; +static char data[NDISKS][PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +static char recovi[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); +static char recovj[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + +static void makedata(int start, int stop) +{ + int i; + + for (i = start; i <= stop; i++) { + prandom_bytes_state(&rng, data[i], PAGE_SIZE); + dataptrs[i] = data[i]; + } +} + +static char member_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static void test_disks(struct kunit *test, const struct raid6_calls *calls, + const struct raid6_recov_calls *ra, int faila, int failb) +{ + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[faila] = recovi; + dataptrs[failb] = recovj; + + if (failb == NDISKS - 1) { + /* + * We don't implement the data+Q failure scenario, since it + * is equivalent to a RAID-5 failure (XOR, then recompute Q). + */ + if (faila != NDISKS - 2) + goto skip; + + /* P+Q failure. Just rebuild the syndrome. */ + calls->gen_syndrome(NDISKS, PAGE_SIZE, dataptrs); + } else if (failb == NDISKS - 2) { + /* data+P failure. */ + ra->datap(NDISKS, PAGE_SIZE, faila, dataptrs); + } else { + /* data+data failure. */ + ra->data2(NDISKS, PAGE_SIZE, faila, failb, dataptrs); + } + + KUNIT_EXPECT_MEMEQ_MSG(test, data[faila], recovi, PAGE_SIZE, + "algo=%-8s/%-8s faila miscompared: %3d[%c] (failb=%3d[%c])\n", + calls->name, ra->name, + faila, member_type(faila), + failb, member_type(failb)); + KUNIT_EXPECT_MEMEQ_MSG(test, data[failb], recovj, PAGE_SIZE, + "algo=%-8s/%-8s failb miscompared: %3d[%c] (faila=%3d[%c])\n", + calls->name, ra->name, + failb, member_type(failb), + faila, member_type(faila)); + +skip: + dataptrs[faila] = data[faila]; + dataptrs[failb] = data[failb]; +} + +static void raid6_test(struct kunit *test) +{ + const struct raid6_calls *const *algo; + const struct raid6_recov_calls *const *ra; + int i, j, p1, p2; + + for (ra = raid6_recov_algos; *ra; ra++) { + if ((*ra)->valid && !(*ra)->valid()) + continue; + + for (algo = raid6_algos; *algo; algo++) { + const struct raid6_calls *calls = *algo; + + if (calls->valid && !calls->valid()) + continue; + + /* Nuke syndromes */ + memset(data[NDISKS - 2], 0xee, PAGE_SIZE); + memset(data[NDISKS - 1], 0xee, PAGE_SIZE); + + /* Generate assumed good syndrome */ + calls->gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + test_disks(test, calls, *ra, i, j); + + if (!calls->xor_syndrome) + continue; + + for (p1 = 0; p1 < NDISKS-2; p1++) + for (p2 = p1; p2 < NDISKS-2; p2++) { + + /* Simulate rmw run */ + calls->xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); + makedata(p1, p2); + calls->xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + test_disks(test, calls, + *ra, i, j); + } + + } + } +} + +static struct kunit_case raid6_test_cases[] = { + KUNIT_CASE(raid6_test), + {}, +}; + +static int raid6_suite_init(struct kunit_suite *suite) +{ + prandom_seed_state(&rng, RAID6_KUNIT_SEED); + makedata(0, NDISKS - 1); + return 0; +} + +static struct kunit_suite raid6_test_suite = { + .name = "raid6", + .test_cases = raid6_test_cases, + .suite_init = raid6_suite_init, +}; +kunit_test_suite(raid6_test_suite); + +MODULE_DESCRIPTION("Unit test for the RAID P/Q library functions"); +MODULE_LICENSE("GPL"); diff --git a/lib/raid/raid6/unroll.awk b/lib/raid/raid6/unroll.awk new file mode 100644 index 000000000000..0809805a7e23 --- /dev/null +++ b/lib/raid/raid6/unroll.awk @@ -0,0 +1,20 @@ + +# This filter requires one command line option of form -vN=n +# where n must be a decimal number. +# +# Repeat each input line containing $$ n times, replacing $$ with 0...n-1. +# Replace each $# with n, and each $* with a single $. + +BEGIN { + n = N + 0 +} +{ + if (/\$\$/) { rep = n } else { rep = 1 } + for (i = 0; i < rep; ++i) { + tmp = $0 + gsub(/\$\$/, i, tmp) + gsub(/\$#/, n, tmp) + gsub(/\$\*/, "$", tmp) + print tmp + } +} diff --git a/lib/raid/raid6/x86/avx2.c b/lib/raid/raid6/x86/avx2.c new file mode 100644 index 000000000000..a1a5213918af --- /dev/null +++ b/lib/raid/raid6/x86/avx2.c @@ -0,0 +1,470 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 2012 Intel Corporation + * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com> + * + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * AVX2 implementation of RAID-6 syndrome functions + * + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +static const struct raid6_avx2_constants { + u64 x1d[4]; +} raid6_avx2_constants __aligned(32) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); +} + +/* + * Plain AVX2 implementation + */ +static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */ + + for (d = 0; d < bytes; d += 32) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 32) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x1 = { + raid6_avx21_gen_syndrome, + raid6_avx21_xor_syndrome, + raid6_have_avx2, + "avx2x1", + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ +}; + +/* + * Unrolled-by-2 AVX2 implementation + */ +static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ + asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ + asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x2 = { + raid6_avx22_gen_syndrome, + raid6_avx22_xor_syndrome, + raid6_have_avx2, + "avx2x2", + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */ + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */ + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */ + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */ + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */ + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */ + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */ + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */ + + for (d = 0; d < bytes; d += 128) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64])); + asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d])); + asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32])); + asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64])); + asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96])); + asm volatile("vpxor %ymm4,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm3,%ymm3"); + asm volatile("vpxor %ymm12,%ymm10,%ymm10"); + asm volatile("vpxor %ymm14,%ymm11,%ymm11"); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64])); + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" + :: "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" + :: "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" + :: "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+64])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxor %ymm5,%ymm5,%ymm5"); + asm volatile("vpxor %ymm7,%ymm7,%ymm7"); + asm volatile("vpxor %ymm13,%ymm13,%ymm13"); + asm volatile("vpxor %ymm15,%ymm15,%ymm15"); + asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d])); + asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32])); + asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64])); + asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x4 = { + raid6_avx24_gen_syndrome, + raid6_avx24_xor_syndrome, + raid6_have_avx2, + "avx2x4", + .priority = 2 /* Prefer AVX2 over priority 1 (SSE2 and others) */ +}; +#endif /* CONFIG_X86_64 */ diff --git a/lib/raid/raid6/x86/avx512.c b/lib/raid/raid6/x86/avx512.c new file mode 100644 index 000000000000..874998bcd7d7 --- /dev/null +++ b/lib/raid/raid6/x86/avx512.c @@ -0,0 +1,560 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- -------------------------------------------------------- + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + * + * Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- + */ + +/* + * AVX512 implementation of RAID-6 syndrome functions + * + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +static const struct raid6_avx512_constants { + u64 x1d[8]; +} raid6_avx512_constants __aligned(512/8) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "prefetchnta %1\n\t" + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %1,%%zmm6" + : + : "m" (dptr[z0][d]), "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm6" + : + : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm4,%1\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4" + : + : "m" (p[d]), "m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5121_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 64) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm2\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2" + : + : "m" (dptr[z0][d]), "m" (p[d])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : "m" (dptr[z][d])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + /* Don't use movntdq for r/w memory area < cache line */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm2,%1" + : + : "m" (q[d]), "m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x1 = { + raid6_avx5121_gen_syndrome, + raid6_avx5121_xor_syndrome, + raid6_have_avx512, + "avx512x1", + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ +}; + +/* + * Unrolled-by-2 AVX512 implementation + */ +static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + /* We uniformly assume a single prefetch covers at least 64 bytes */ + for (d = 0; d < bytes; d += 128) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vmovdqa64 %0,%%zmm2\n\t" /* P[0] */ + "vmovdqa64 %1,%%zmm3\n\t" /* P[1] */ + "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */ + "vmovdqa64 %%zmm3,%%zmm6" /* Q[1] */ + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64])); + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm4,%2\n\t" + "vmovntdq %%zmm6,%3" + : + : "m" (p[d]), "m" (p[d+64]), "m" (q[d]), + "m" (q[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5122_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + : : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 128) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm2\n\t" + "vmovdqa64 %3,%%zmm3\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (p[d]), "m" (p[d+64])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64])); + } + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6" + : + : ); + } + asm volatile("vpxorq %0,%%zmm4,%%zmm4\n\t" + "vpxorq %1,%%zmm6,%%zmm6\n\t" + /* Don't use movntdq for r/w + * memory area < cache line + */ + "vmovdqa64 %%zmm4,%0\n\t" + "vmovdqa64 %%zmm6,%1\n\t" + "vmovdqa64 %%zmm2,%2\n\t" + "vmovdqa64 %%zmm3,%3" + : + : "m" (q[d]), "m" (q[d+64]), "m" (p[d]), + "m" (p[d+64])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx512x2 = { + raid6_avx5122_gen_syndrome, + raid6_avx5122_xor_syndrome, + raid6_have_avx512, + "avx512x2", + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0\n\t" + "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t" /* Zero temp */ + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" /* P[0] */ + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" /* P[1] */ + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" /* Q[0] */ + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" /* Q[1] */ + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" /* P[2] */ + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" /* P[3] */ + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" /* Q[2] */ + "vpxorq %%zmm14,%%zmm14,%%zmm14" /* Q[3] */ + : + : "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0; d < bytes; d += 256) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + "prefetchnta %2\n\t" + "prefetchnta %3\n\t" + "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), "m" (dptr[z][d+192])); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t" + "vmovntdq %%zmm14,%7\n\t" + "vpxorq %%zmm14,%%zmm14,%%zmm14" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_avx5124_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa64 %0,%%zmm0" + :: "m" (raid6_avx512_constants.x1d[0])); + + for (d = 0 ; d < bytes ; d += 256) { + asm volatile("vmovdqa64 %0,%%zmm4\n\t" + "vmovdqa64 %1,%%zmm6\n\t" + "vmovdqa64 %2,%%zmm12\n\t" + "vmovdqa64 %3,%%zmm14\n\t" + "vmovdqa64 %4,%%zmm2\n\t" + "vmovdqa64 %5,%%zmm3\n\t" + "vmovdqa64 %6,%%zmm10\n\t" + "vmovdqa64 %7,%%zmm11\n\t" + "vpxorq %%zmm4,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm6,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm12,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm14,%%zmm11,%%zmm11" + : + : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]), + "m" (dptr[z0][d+128]), "m" (dptr[z0][d+192]), + "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192])); + /* P/Q data pages */ + for (z = z0-1 ; z >= start ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "prefetchnta %0\n\t" + "prefetchnta %2\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%Zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t" + "vmovdqa64 %0,%%zmm5\n\t" + "vmovdqa64 %1,%%zmm7\n\t" + "vmovdqa64 %2,%%zmm13\n\t" + "vmovdqa64 %3,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t" + "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t" + "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t" + "vpxorq %%zmm15,%%zmm11,%%zmm11\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : "m" (dptr[z][d]), "m" (dptr[z][d+64]), + "m" (dptr[z][d+128]), + "m" (dptr[z][d+192])); + } + asm volatile("prefetchnta %0\n\t" + "prefetchnta %1\n\t" + : + : "m" (q[d]), "m" (q[d+128])); + /* P/Q left side optimization */ + for (z = start-1 ; z >= 0 ; z--) { + asm volatile("vpxorq %%zmm5,%%zmm5,%%zmm5\n\t" + "vpxorq %%zmm7,%%zmm7,%%zmm7\n\t" + "vpxorq %%zmm13,%%zmm13,%%zmm13\n\t" + "vpxorq %%zmm15,%%zmm15,%%zmm15\n\t" + "vpcmpgtb %%zmm4,%%zmm5,%%k1\n\t" + "vpcmpgtb %%zmm6,%%zmm7,%%k2\n\t" + "vpcmpgtb %%zmm12,%%zmm13,%%k3\n\t" + "vpcmpgtb %%zmm14,%%zmm15,%%k4\n\t" + "vpmovm2b %%k1,%%zmm5\n\t" + "vpmovm2b %%k2,%%zmm7\n\t" + "vpmovm2b %%k3,%%zmm13\n\t" + "vpmovm2b %%k4,%%zmm15\n\t" + "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t" + "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t" + "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t" + "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t" + "vpandq %%zmm0,%%zmm5,%%zmm5\n\t" + "vpandq %%zmm0,%%zmm7,%%zmm7\n\t" + "vpandq %%zmm0,%%zmm13,%%zmm13\n\t" + "vpandq %%zmm0,%%zmm15,%%zmm15\n\t" + "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t" + "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t" + "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t" + "vpxorq %%zmm15,%%zmm14,%%zmm14" + : + : ); + } + asm volatile("vmovntdq %%zmm2,%0\n\t" + "vmovntdq %%zmm3,%1\n\t" + "vmovntdq %%zmm10,%2\n\t" + "vmovntdq %%zmm11,%3\n\t" + "vpxorq %4,%%zmm4,%%zmm4\n\t" + "vpxorq %5,%%zmm6,%%zmm6\n\t" + "vpxorq %6,%%zmm12,%%zmm12\n\t" + "vpxorq %7,%%zmm14,%%zmm14\n\t" + "vmovntdq %%zmm4,%4\n\t" + "vmovntdq %%zmm6,%5\n\t" + "vmovntdq %%zmm12,%6\n\t" + "vmovntdq %%zmm14,%7" + : + : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]), + "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]), + "m" (q[d+128]), "m" (q[d+192])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} +const struct raid6_calls raid6_avx512x4 = { + raid6_avx5124_gen_syndrome, + raid6_avx5124_xor_syndrome, + raid6_have_avx512, + "avx512x4", + .priority = 2 /* Prefer AVX512 over priority 1 (SSE2 and others) */ +}; +#endif diff --git a/lib/raid/raid6/x86/mmx.c b/lib/raid/raid6/x86/mmx.c new file mode 100644 index 000000000000..7e9810669347 --- /dev/null +++ b/lib/raid/raid6/x86/mmx.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/mmx.c + * + * MMX implementation of RAID-6 syndrome functions + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +/* Shared with raid6/sse1.c */ +const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants = { + 0x1d1d1d1d1d1d1d1dULL, +}; + +static int raid6_have_mmx(void) +{ + /* Not really "boot_cpu" but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX); +} + +/* + * Plain MMX implementation + */ +static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("pxor %mm2,%mm2"); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("pxor %mm4,%mm4"); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx1 = { + raid6_mmx1_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_mmx, + "mmxx1", + 0 +}; + +/* + * Unrolled-by-2 MMX implementation + */ +static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx2 = { + raid6_mmx2_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_mmx, + "mmxx2", + 0 +}; diff --git a/lib/raid/raid6/x86/recov_avx2.c b/lib/raid/raid6/x86/recov_avx2.c new file mode 100644 index 000000000000..19fbd9c4dce6 --- /dev/null +++ b/lib/raid/raid6/x86/recov_avx2.c @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +static int raid6_has_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX); +} + +static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* ymm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0])); + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32])); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32])); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[32] ^ q[32] + * 0 = dp[0] ^ p[0] + * 8 = dp[32] ^ p[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpsraw $4, %ymm9, %ymm12"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm9, %ymm9"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm9, %ymm4, %ymm14"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm12, %ymm5, %ymm15"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm14, %ymm15, %ymm15"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* + * 5 = qx[0] + * 15 = qx[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpsraw $4, %ymm8, %ymm6"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm14"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm14, %ymm4, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm13"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + asm volatile("vpxor %ymm12, %ymm13, %ymm13"); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[32]] + */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + asm volatile("vpxor %ymm15, %ymm13, %ymm13"); + + /* + * 1 = db = DQ + * 13 = db[32] = DQ[32] + */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32])); + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vpxor %ymm13, %ymm8, %ymm8"); + + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q)); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p)); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq)); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* 5 = qx */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + + /* 1 = pbmul[px] */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + /* 1 = db = DQ */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[32] ^ dq[32] + */ + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vmovapd %ymm0, %ymm13"); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + asm volatile("vmovapd %ymm1, %ymm14"); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpsraw $4, %ymm8, %ymm12"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm8"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm8, %ymm13, %ymm13"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpshufb %ymm12, %ymm14, %ymm14"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + asm volatile("vpxor %ymm13, %ymm14, %ymm14"); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[32] ^ dq[32]] + */ + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + asm volatile("vpxor %ymm14, %ymm12, %ymm12"); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[32] ^ qmul[q[32] ^ dq[32]] + */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx2 = { + .data2 = raid6_2data_recov_avx2, + .datap = raid6_datap_recov_avx2, + .valid = raid6_has_avx2, +#ifdef CONFIG_X86_64 + .name = "avx2x2", +#else + .name = "avx2x1", +#endif + .priority = 2, +}; diff --git a/lib/raid/raid6/x86/recov_avx512.c b/lib/raid/raid6/x86/recov_avx512.c new file mode 100644 index 000000000000..143f4976b2ad --- /dev/null +++ b/lib/raid/raid6/x86/recov_avx512.c @@ -0,0 +1,377 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2016 Intel Corporation + * + * Author: Gayatri Kammela <gayatri.kammela@intel.com> + * Author: Megha Dey <megha.dey@linux.intel.com> + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +static int raid6_has_avx512(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +} + +static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* zmm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm9\n\t" + "vmovdqa64 %2, %%zmm0\n\t" + "vmovdqa64 %3, %%zmm8\n\t" + "vpxorq %4, %%zmm1, %%zmm1\n\t" + "vpxorq %5, %%zmm9, %%zmm9\n\t" + "vpxorq %6, %%zmm0, %%zmm0\n\t" + "vpxorq %7, %%zmm8, %%zmm8" + : + : "m" (q[0]), "m" (q[64]), "m" (p[0]), + "m" (p[64]), "m" (dq[0]), "m" (dq[64]), + "m" (dp[0]), "m" (dp[64])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[64] ^ q[64] + * 0 = dp[0] ^ p[0] + * 8 = dp[64] ^ p[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpsraw $4, %%zmm9, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm9, %%zmm9\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* + * 5 = qx[0] + * 15 = qx[64] + */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpsraw $4, %%zmm8, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm14\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm12, %%zmm13, %%zmm13" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[64]] + */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm15, %%zmm13, %%zmm13" + : + : ); + + /* + * 1 = db = DQ + * 13 = db[64] = DQ[64] + */ + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm13,%1\n\t" + "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vpxorq %%zmm13, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64])); + + asm volatile("vmovdqa64 %%zmm0, %0\n\t" + "vmovdqa64 %%zmm8, %1" + : + : "m" (dp[0]), "m" (dp[64])); + + bytes -= 128; + p += 128; + q += 128; + dp += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm1\n\t" + "vmovdqa64 %1, %%zmm0\n\t" + "vpxorq %2, %%zmm1, %%zmm1\n\t" + "vpxorq %3, %%zmm0, %%zmm0" + : + : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm5" + : + : "m" (qmul[0]), "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm1, %%zmm1\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t" + "vpxorq %%zmm4, %%zmm5, %%zmm5" + : + : ); + + /* 5 = qx */ + + asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (pbmul[0]), "m" (pbmul[16])); + + asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t" + "vpandq %%zmm7, %%zmm0, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm2, %%zmm2\n\t" + "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t" + "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm4, %%zmm1, %%zmm1" + : + : ); + + /* 1 = pbmul[px] */ + asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t" + /* 1 = db = DQ */ + "vmovdqa64 %%zmm1, %0\n\t" + : + : "m" (dq[0])); + + asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t" + "vmovdqa64 %%zmm0, %0" + : + : "m" (dp[0])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vmovdqa64 %1, %%zmm8\n\t" + "vpxorq %2, %%zmm3, %%zmm3\n\t" + "vpxorq %3, %%zmm8, %%zmm8" + : + : "m" (dq[0]), "m" (dq[64]), "m" (q[0]), + "m" (q[64])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[64] ^ dq[64] + */ + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vmovapd %%zmm0, %%zmm13\n\t" + "vbroadcasti64x2 %1, %%zmm1\n\t" + "vmovapd %%zmm1, %%zmm14" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpsraw $4, %%zmm8, %%zmm12\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm8, %%zmm8\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm12, %%zmm12\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm13, %%zmm14, %%zmm14" + : + : ); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[64] ^ dq[64]] + */ + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vmovdqa64 %1, %%zmm12\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t" + "vpxorq %%zmm14, %%zmm12, %%zmm12" + : + : "m" (p[0]), "m" (p[64])); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[64] ^ qmul[q[64] ^ dq[64]] + */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm14, %1\n\t" + "vmovdqa64 %%zmm2, %2\n\t" + "vmovdqa64 %%zmm12,%3" + : + : "m" (dq[0]), "m" (dq[64]), "m" (p[0]), + "m" (p[64])); + + bytes -= 128; + p += 128; + q += 128; + dq += 128; +#else + asm volatile("vmovdqa64 %0, %%zmm3\n\t" + "vpxorq %1, %%zmm3, %%zmm3" + : + : "m" (dq[0]), "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t" + "vbroadcasti64x2 %1, %%zmm1" + : + : "m" (qmul[0]), "m" (qmul[16])); + + asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t" + "vpandq %%zmm7, %%zmm3, %%zmm3\n\t" + "vpandq %%zmm7, %%zmm6, %%zmm6\n\t" + "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t" + "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t" + "vpxorq %%zmm0, %%zmm1, %%zmm1" + : + : ); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %0, %%zmm2\n\t" + "vpxorq %%zmm1, %%zmm2, %%zmm2" + : + : "m" (p[0])); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa64 %%zmm1, %0\n\t" + "vmovdqa64 %%zmm2, %1" + : + : "m" (dq[0]), "m" (p[0])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx512 = { + .data2 = raid6_2data_recov_avx512, + .datap = raid6_datap_recov_avx512, + .valid = raid6_has_avx512, +#ifdef CONFIG_X86_64 + .name = "avx512x2", +#else + .name = "avx512x1", +#endif + .priority = 3, +}; diff --git a/lib/raid/raid6/x86/recov_ssse3.c b/lib/raid/raid6/x86/recov_ssse3.c new file mode 100644 index 000000000000..146cdbf465bd --- /dev/null +++ b/lib/raid/raid6/x86/recov_ssse3.c @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Intel Corporation + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +static int raid6_has_ssse3(void) +{ + return boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2) && + boot_cpu_has(X86_FEATURE_SSSE3); +} + +static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); + +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); +#endif + + /* Now do it... */ + while (bytes) { +#ifdef CONFIG_X86_64 + /* xmm6, xmm14, xmm15 */ + + asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); + asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); + asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); + asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); + asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); + asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); + asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); + asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); + + /* xmm0/8 = px */ + + asm volatile("movdqa %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + asm volatile("movdqa %xmm6,%xmm12"); + asm volatile("movdqa %xmm5,%xmm13"); + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("movdqa %xmm9,%xmm11"); + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ + asm volatile("movdqa %xmm8,%xmm10"); + asm volatile("psraw $4,%xmm1"); + asm volatile("psraw $4,%xmm9"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pand %xmm7,%xmm9"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pshufb %xmm9,%xmm13"); + asm volatile("pxor %xmm4,%xmm5"); + asm volatile("pxor %xmm12,%xmm13"); + + /* xmm5/13 = qx */ + + asm volatile("movdqa %xmm14,%xmm4"); + asm volatile("movdqa %xmm15,%xmm1"); + asm volatile("movdqa %xmm14,%xmm12"); + asm volatile("movdqa %xmm15,%xmm9"); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("movdqa %xmm10,%xmm11"); + asm volatile("psraw $4,%xmm2"); + asm volatile("psraw $4,%xmm10"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pand %xmm7,%xmm10"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pshufb %xmm10,%xmm9"); + asm volatile("pxor %xmm4,%xmm1"); + asm volatile("pxor %xmm12,%xmm9"); + + /* xmm1/9 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + asm volatile("pxor %xmm13,%xmm9"); + /* xmm1/9 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("pxor %xmm9,%xmm8"); + asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); + asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#else + asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); + asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); + asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); + asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); + + /* 1 = dq ^ q + * 0 = dp ^ p + */ + asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("psraw $4,%xmm1"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pxor %xmm4,%xmm5"); + + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ + + /* xmm5 = qx */ + + asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("psraw $4,%xmm2"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pxor %xmm4,%xmm1"); + + /* xmm1 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + /* xmm1 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + + +static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = raid6_get_zero_page(); + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + + /* xmm3 = q[0] ^ dq[0] */ + + asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm4 = q[16] ^ dq[16] */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %xmm4, %xmm8"); + + /* xmm4 = xmm8 = q[16] ^ dq[16] */ + + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); + asm volatile("pxor %xmm0, %xmm1"); + asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); + + /* xmm1 = qmul[q[0] ^ dq[0]] */ + + asm volatile("psraw $4, %xmm4"); + asm volatile("pand %xmm7, %xmm8"); + asm volatile("pand %xmm7, %xmm4"); + asm volatile("pshufb %xmm8, %xmm10"); + asm volatile("pshufb %xmm4, %xmm11"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("pxor %xmm10, %xmm11"); + asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); + + /* xmm11 = qmul[q[16] ^ dq[16]] */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ + + asm volatile("pxor %xmm11, %xmm12"); + + /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); + + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; + +#else + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm3 = *q ^ *dq */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("pxor %xmm0, %xmm1"); + + /* xmm1 = qmul[*q ^ *dq */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = *p ^ qmul[*q ^ *dq] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_ssse3 = { + .data2 = raid6_2data_recov_ssse3, + .datap = raid6_datap_recov_ssse3, + .valid = raid6_has_ssse3, +#ifdef CONFIG_X86_64 + .name = "ssse3x2", +#else + .name = "ssse3x1", +#endif + .priority = 1, +}; diff --git a/lib/raid/raid6/x86/sse1.c b/lib/raid/raid6/x86/sse1.c new file mode 100644 index 000000000000..deecdd72ceec --- /dev/null +++ b/lib/raid/raid6/x86/sse1.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse1.c + * + * SSE-1/MMXEXT implementation of RAID-6 syndrome functions + * + * This is really an MMX implementation, but it requires SSE-1 or + * AMD MMXEXT for prefetch support and a few other features. The + * support for nontemporal memory accesses is enough to make this + * worthwhile as a separate implementation. + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +/* Defined in raid6/mmx.c */ +extern const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants; + +static int raid6_have_sse1_or_mmxext(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + (boot_cpu_has(X86_FEATURE_XMM) || + boot_cpu_has(X86_FEATURE_MMXEXT)); +} + +/* + * Plain SSE1 implementation + */ +static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x1 = { + raid6_sse11_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_sse1_or_mmxext, + "sse1x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE1 implementation + */ +static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 16 bytes */ + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); + } + + asm volatile("sfence" : :: "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x2 = { + raid6_sse12_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_sse1_or_mmxext, + "sse1x2", + 1 /* Has cache hints */ +}; diff --git a/lib/raid/raid6/x86/sse2.c b/lib/raid/raid6/x86/sse2.c new file mode 100644 index 000000000000..f9edf8a8d1c4 --- /dev/null +++ b/lib/raid/raid6/x86/sse2.c @@ -0,0 +1,480 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse2.c + * + * SSE-2 implementation of RAID-6 syndrome functions + * + */ + +#include <linux/raid/pq.h> +#include <asm/fpu/api.h> + +static const struct raid6_sse_constants { + u64 x1d[2]; +} raid6_sse_constants __attribute__((aligned(16))) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, +}; + +static int raid6_have_sse2(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + boot_cpu_has(X86_FEATURE_FXSR) && + boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2); +} + +/* + * Plain SSE2 implementation + */ +static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + + +static void raid6_sse21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("pxor %xmm4,%xmm2"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm5,%xmm4"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x1 = { + raid6_sse21_gen_syndrome, + raid6_sse21_xor_syndrome, + raid6_have_sse2, + "sse2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE2 implementation + */ +static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_sse22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x2 = { + raid6_sse22_gen_syndrome, + raid6_sse22_xor_syndrome, + raid6_have_sse2, + "sse2x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 SSE2 implementation + */ +static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ + asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ + asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ + asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ + asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ + asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ + asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ + asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 64 ) { + for ( z = z0 ; z >= 0 ; z-- ) { + /* The second prefetch seems to improve performance... */ + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("pxor %xmm3,%xmm3"); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("pxor %xmm10,%xmm10"); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %xmm11,%xmm11"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("pxor %xmm6,%xmm6"); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("pxor %xmm12,%xmm12"); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + asm volatile("pxor %xmm14,%xmm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +static void raid6_sse24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 64 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); + asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); + asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + asm volatile("pxor %xmm12,%xmm10"); + asm volatile("pxor %xmm14,%xmm11"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+32])); + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); + asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + + +const struct raid6_calls raid6_sse2x4 = { + raid6_sse24_gen_syndrome, + raid6_sse24_xor_syndrome, + raid6_have_sse2, + "sse2x4", + 1 /* Has cache hints */ +}; + +#endif /* CONFIG_X86_64 */ |
