diff options
Diffstat (limited to 'tools')
310 files changed, 15155 insertions, 3146 deletions
diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h index 03cd7c19a683..d5dd96902817 100644 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ b/tools/arch/arm/include/uapi/asm/kvm.h @@ -246,6 +246,7 @@ struct kvm_vcpu_events { #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 #define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 +#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ 9 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 150416682e2c..b6c5ece4fdee 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -558,9 +558,6 @@ #define SYS_ICH_VSEIR_EL2 sys_reg(3, 4, 12, 9, 4) #define SYS_ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) -#define SYS_ICH_HCR_EL2 sys_reg(3, 4, 12, 11, 0) -#define SYS_ICH_VTR_EL2 sys_reg(3, 4, 12, 11, 1) -#define SYS_ICH_MISR_EL2 sys_reg(3, 4, 12, 11, 2) #define SYS_ICH_EISR_EL2 sys_reg(3, 4, 12, 11, 3) #define SYS_ICH_ELRSR_EL2 sys_reg(3, 4, 12, 11, 5) #define SYS_ICH_VMCR_EL2 sys_reg(3, 4, 12, 11, 7) @@ -981,10 +978,6 @@ #define SYS_MPIDR_SAFE_VAL (BIT(31)) /* GIC Hypervisor interface registers */ -/* ICH_MISR_EL2 bit definitions */ -#define ICH_MISR_EOI (1 << 0) -#define ICH_MISR_U (1 << 1) - /* ICH_LR*_EL2 bit definitions */ #define ICH_LR_VIRTUAL_ID_MASK ((1ULL << 32) - 1) @@ -999,17 +992,6 @@ #define ICH_LR_PRIORITY_SHIFT 48 #define ICH_LR_PRIORITY_MASK (0xffULL << ICH_LR_PRIORITY_SHIFT) -/* ICH_HCR_EL2 bit definitions */ -#define ICH_HCR_EN (1 << 0) -#define ICH_HCR_UIE (1 << 1) -#define ICH_HCR_NPIE (1 << 3) -#define ICH_HCR_TC (1 << 10) -#define ICH_HCR_TALL0 (1 << 11) -#define ICH_HCR_TALL1 (1 << 12) -#define ICH_HCR_TDIR (1 << 14) -#define ICH_HCR_EOIcount_SHIFT 27 -#define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) - /* ICH_VMCR_EL2 bit definitions */ #define ICH_VMCR_ACK_CTL_SHIFT 2 #define ICH_VMCR_ACK_CTL_MASK (1 << ICH_VMCR_ACK_CTL_SHIFT) @@ -1030,18 +1012,6 @@ #define ICH_VMCR_ENG1_SHIFT 1 #define ICH_VMCR_ENG1_MASK (1 << ICH_VMCR_ENG1_SHIFT) -/* ICH_VTR_EL2 bit definitions */ -#define ICH_VTR_PRI_BITS_SHIFT 29 -#define ICH_VTR_PRI_BITS_MASK (7 << ICH_VTR_PRI_BITS_SHIFT) -#define ICH_VTR_ID_BITS_SHIFT 23 -#define ICH_VTR_ID_BITS_MASK (7 << ICH_VTR_ID_BITS_SHIFT) -#define ICH_VTR_SEIS_SHIFT 22 -#define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) -#define ICH_VTR_A3V_SHIFT 21 -#define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) -#define ICH_VTR_TDS_SHIFT 19 -#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) - /* * Permission Indirection Extension (PIE) permission encodings. * Encodings with the _O suffix, have overlays applied (Permission Overlay Extension). diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index 66736ff04011..6d44f8c8a18f 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -374,6 +374,7 @@ enum { #endif }; +/* Vendor hyper call function numbers 0-63 */ #define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2) enum { @@ -384,6 +385,17 @@ enum { #endif }; +/* Vendor hyper call function numbers 64-127 */ +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2 KVM_REG_ARM_FW_FEAT_BMAP_REG(3) + +enum { + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER = 0, + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS = 1, +#ifdef __KERNEL__ + KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT, +#endif +}; + /* Device Control API on vm fd */ #define KVM_ARM_VM_SMCCC_CTRL 0 #define KVM_ARM_VM_SMCCC_FILTER 0 diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 93807b437e4d..cb1740bc3da2 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/tools/arch/x86/include/asm/asm.h b/tools/arch/x86/include/asm/asm.h index 3ad3da9a7d97..dbe39b44256b 100644 --- a/tools/arch/x86/include/asm/asm.h +++ b/tools/arch/x86/include/asm/asm.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_ASM_H #define _ASM_X86_ASM_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define __ASM_FORM(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_RAW(x, ...) x,## __VA_ARGS__ # define __ASM_FORM_COMMA(x, ...) x,## __VA_ARGS__, @@ -123,7 +123,7 @@ #ifdef __KERNEL__ /* Exception table entry */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ .pushsection "__ex_table","a" ; \ .balign 4 ; \ @@ -154,7 +154,7 @@ # define _ASM_NOKPROBE(entry) # endif -#else /* ! __ASSEMBLY__ */ +#else /* ! __ASSEMBLER__ */ # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -186,7 +186,7 @@ */ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __KERNEL__ */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 17b6590748c0..9e3fa7942e7d 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -2,14 +2,6 @@ #ifndef _ASM_X86_CPUFEATURES_H #define _ASM_X86_CPUFEATURES_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include <asm/required-features.h> -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include <asm/disabled-features.h> -#endif - /* * Defines x86 CPU feature bits */ @@ -210,7 +202,6 @@ #define X86_FEATURE_MBA ( 7*32+18) /* "mba" Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* "perfmon_v2" AMD Performance Monitoring Version 2 */ -#define X86_FEATURE_USE_IBPB ( 7*32+21) /* Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* Disable Speculative Store Bypass. */ #define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* AMD SSBD implementation via LS_CFG MSR */ diff --git a/tools/arch/x86/include/asm/disabled-features.h b/tools/arch/x86/include/asm/disabled-features.h deleted file mode 100644 index c492bdc97b05..000000000000 --- a/tools/arch/x86/include/asm/disabled-features.h +++ /dev/null @@ -1,161 +0,0 @@ -#ifndef _ASM_X86_DISABLED_FEATURES_H -#define _ASM_X86_DISABLED_FEATURES_H - -/* These features, although they might be available in a CPU - * will not be used because the compile options to support - * them are not present. - * - * This code allows them to be checked and disabled at - * compile time without an explicit #ifdef. Use - * cpu_feature_enabled(). - */ - -#ifdef CONFIG_X86_UMIP -# define DISABLE_UMIP 0 -#else -# define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) -#endif - -#ifdef CONFIG_X86_64 -# define DISABLE_VME (1<<(X86_FEATURE_VME & 31)) -# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) -# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) -# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) -# define DISABLE_PCID 0 -#else -# define DISABLE_VME 0 -# define DISABLE_K6_MTRR 0 -# define DISABLE_CYRIX_ARR 0 -# define DISABLE_CENTAUR_MCR 0 -# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) -#endif /* CONFIG_X86_64 */ - -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU 0 -# define DISABLE_OSPKE 0 -#else -# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ - -#ifdef CONFIG_X86_5LEVEL -# define DISABLE_LA57 0 -#else -# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) -#endif - -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION -# define DISABLE_PTI 0 -#else -# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) -#endif - -#ifdef CONFIG_MITIGATION_RETPOLINE -# define DISABLE_RETPOLINE 0 -#else -# define DISABLE_RETPOLINE ((1 << (X86_FEATURE_RETPOLINE & 31)) | \ - (1 << (X86_FEATURE_RETPOLINE_LFENCE & 31))) -#endif - -#ifdef CONFIG_MITIGATION_RETHUNK -# define DISABLE_RETHUNK 0 -#else -# define DISABLE_RETHUNK (1 << (X86_FEATURE_RETHUNK & 31)) -#endif - -#ifdef CONFIG_MITIGATION_UNRET_ENTRY -# define DISABLE_UNRET 0 -#else -# define DISABLE_UNRET (1 << (X86_FEATURE_UNRET & 31)) -#endif - -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING -# define DISABLE_CALL_DEPTH_TRACKING 0 -#else -# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) -#endif - -#ifdef CONFIG_ADDRESS_MASKING -# define DISABLE_LAM 0 -#else -# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31)) -#endif - -#ifdef CONFIG_INTEL_IOMMU_SVM -# define DISABLE_ENQCMD 0 -#else -# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31)) -#endif - -#ifdef CONFIG_X86_SGX -# define DISABLE_SGX 0 -#else -# define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) -#endif - -#ifdef CONFIG_XEN_PV -# define DISABLE_XENPV 0 -#else -# define DISABLE_XENPV (1 << (X86_FEATURE_XENPV & 31)) -#endif - -#ifdef CONFIG_INTEL_TDX_GUEST -# define DISABLE_TDX_GUEST 0 -#else -# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) -#endif - -#ifdef CONFIG_X86_USER_SHADOW_STACK -#define DISABLE_USER_SHSTK 0 -#else -#define DISABLE_USER_SHSTK (1 << (X86_FEATURE_USER_SHSTK & 31)) -#endif - -#ifdef CONFIG_X86_KERNEL_IBT -#define DISABLE_IBT 0 -#else -#define DISABLE_IBT (1 << (X86_FEATURE_IBT & 31)) -#endif - -#ifdef CONFIG_X86_FRED -# define DISABLE_FRED 0 -#else -# define DISABLE_FRED (1 << (X86_FEATURE_FRED & 31)) -#endif - -#ifdef CONFIG_KVM_AMD_SEV -#define DISABLE_SEV_SNP 0 -#else -#define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) -#endif - -/* - * Make sure to add features to the correct mask - */ -#define DISABLED_MASK0 (DISABLE_VME) -#define DISABLED_MASK1 0 -#define DISABLED_MASK2 0 -#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 (DISABLE_PCID) -#define DISABLED_MASK5 0 -#define DISABLED_MASK6 0 -#define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 (DISABLE_XENPV|DISABLE_TDX_GUEST) -#define DISABLED_MASK9 (DISABLE_SGX) -#define DISABLED_MASK10 0 -#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ - DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) -#define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) -#define DISABLED_MASK13 0 -#define DISABLED_MASK14 0 -#define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ - DISABLE_ENQCMD) -#define DISABLED_MASK17 0 -#define DISABLED_MASK18 (DISABLE_IBT) -#define DISABLED_MASK19 (DISABLE_SEV_SNP) -#define DISABLED_MASK20 0 -#define DISABLED_MASK21 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 3ae84c3b8e6d..dc1c1057f26e 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -25,6 +25,7 @@ #define _EFER_SVME 12 /* Enable virtualization */ #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ +#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ #define EFER_SCE (1<<_EFER_SCE) @@ -34,6 +35,7 @@ #define EFER_SVME (1<<_EFER_SVME) #define EFER_LMSLE (1<<_EFER_LMSLE) #define EFER_FFXSR (1<<_EFER_FFXSR) +#define EFER_TCE (1<<_EFER_TCE) #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) /* diff --git a/tools/arch/x86/include/asm/nops.h b/tools/arch/x86/include/asm/nops.h index 1c1b7550fa55..cd94221d8335 100644 --- a/tools/arch/x86/include/asm/nops.h +++ b/tools/arch/x86/include/asm/nops.h @@ -82,7 +82,7 @@ #define ASM_NOP7 _ASM_BYTES(BYTES_NOP7) #define ASM_NOP8 _ASM_BYTES(BYTES_NOP8) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern const unsigned char * const x86_nops[]; #endif diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index 46d7e06763c9..e0125afa53fb 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -45,7 +45,7 @@ #define ORC_TYPE_REGS 3 #define ORC_TYPE_REGS_PARTIAL 4 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include <asm/byteorder.h> /* @@ -73,6 +73,6 @@ struct orc_entry { #endif } __packed; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ORC_TYPES_H */ diff --git a/tools/arch/x86/include/asm/pvclock-abi.h b/tools/arch/x86/include/asm/pvclock-abi.h index 1436226efe3e..b9fece5fc96d 100644 --- a/tools/arch/x86/include/asm/pvclock-abi.h +++ b/tools/arch/x86/include/asm/pvclock-abi.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PVCLOCK_ABI_H #define _ASM_X86_PVCLOCK_ABI_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * These structs MUST NOT be changed. @@ -44,5 +44,5 @@ struct pvclock_wall_clock { #define PVCLOCK_GUEST_STOPPED (1 << 1) /* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */ #define PVCLOCK_COUNTS_FROM_ZERO (1 << 2) -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/tools/arch/x86/include/asm/required-features.h b/tools/arch/x86/include/asm/required-features.h deleted file mode 100644 index e9187ddd3d1f..000000000000 --- a/tools/arch/x86/include/asm/required-features.h +++ /dev/null @@ -1,105 +0,0 @@ -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#define _ASM_X86_REQUIRED_FEATURES_H - -/* Define minimum CPUID feature set for kernel These bits are checked - really early to actually display a visible error message before the - kernel dies. Make sure to assign features to the proper mask! - - Some requirements that are not in CPUID yet are also in the - CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too. - - The real information is in arch/x86/Kconfig.cpu, this just converts - the CONFIGs into a bitmask */ - -#ifndef CONFIG_MATH_EMULATION -# define NEED_FPU (1<<(X86_FEATURE_FPU & 31)) -#else -# define NEED_FPU 0 -#endif - -#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -# define NEED_PAE (1<<(X86_FEATURE_PAE & 31)) -#else -# define NEED_PAE 0 -#endif - -#ifdef CONFIG_X86_CMPXCHG64 -# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) -#else -# define NEED_CX8 0 -#endif - -#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64) -# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31)) -#else -# define NEED_CMOV 0 -#endif - -# define NEED_3DNOW 0 - -#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) -# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) -#else -# define NEED_NOPL 0 -#endif - -#ifdef CONFIG_MATOM -# define NEED_MOVBE (1<<(X86_FEATURE_MOVBE & 31)) -#else -# define NEED_MOVBE 0 -#endif - -#ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT_XXL -/* Paravirtualized systems may not have PSE or PGE available */ -#define NEED_PSE 0 -#define NEED_PGE 0 -#else -#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) -#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) -#endif -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) -#define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) -#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) -#define NEED_LM (1<<(X86_FEATURE_LM & 31)) -#else -#define NEED_PSE 0 -#define NEED_MSR 0 -#define NEED_PGE 0 -#define NEED_FXSR 0 -#define NEED_XMM 0 -#define NEED_XMM2 0 -#define NEED_LM 0 -#endif - -#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\ - NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\ - NEED_XMM|NEED_XMM2) -#define SSE_MASK (NEED_XMM|NEED_XMM2) - -#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) - -#define REQUIRED_MASK2 0 -#define REQUIRED_MASK3 (NEED_NOPL) -#define REQUIRED_MASK4 (NEED_MOVBE) -#define REQUIRED_MASK5 0 -#define REQUIRED_MASK6 0 -#define REQUIRED_MASK7 0 -#define REQUIRED_MASK8 0 -#define REQUIRED_MASK9 0 -#define REQUIRED_MASK10 0 -#define REQUIRED_MASK11 0 -#define REQUIRED_MASK12 0 -#define REQUIRED_MASK13 0 -#define REQUIRED_MASK14 0 -#define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 -#define REQUIRED_MASK17 0 -#define REQUIRED_MASK18 0 -#define REQUIRED_MASK19 0 -#define REQUIRED_MASK20 0 -#define REQUIRED_MASK21 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) - -#endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 60044b608817..8de2914e6510 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -41,7 +41,7 @@ * Missing asm support * * __GENMASK_U128() depends on _BIT128() which would not work - * in the asm code, as it shifts an 'unsigned __init128' data + * in the asm code, as it shifts an 'unsigned __int128' data * type instead of direct representation of 128 bit constants * such as long and unsigned long. The fundamental problem is * that a 128 bit constant will get silently truncated by the diff --git a/tools/include/nolibc/Makefile b/tools/include/nolibc/Makefile index a1f55fb24bb3..f9702877ac21 100644 --- a/tools/include/nolibc/Makefile +++ b/tools/include/nolibc/Makefile @@ -29,7 +29,9 @@ all_files := \ compiler.h \ crt.h \ ctype.h \ + dirent.h \ errno.h \ + limits.h \ nolibc.h \ signal.h \ stackprotector.h \ diff --git a/tools/include/nolibc/arch-mips.h b/tools/include/nolibc/arch-mips.h index 1791a8ce58da..753a8ed2cf69 100644 --- a/tools/include/nolibc/arch-mips.h +++ b/tools/include/nolibc/arch-mips.h @@ -179,6 +179,7 @@ }) /* startup code, note that it's called __start on MIPS */ +void __start(void); void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector __start(void) { __asm__ volatile ( diff --git a/tools/include/nolibc/arch-s390.h b/tools/include/nolibc/arch-s390.h index f9ab83a219b8..df4c3cc713ac 100644 --- a/tools/include/nolibc/arch-s390.h +++ b/tools/include/nolibc/arch-s390.h @@ -5,8 +5,8 @@ #ifndef _NOLIBC_ARCH_S390_H #define _NOLIBC_ARCH_S390_H -#include <asm/signal.h> -#include <asm/unistd.h> +#include <linux/signal.h> +#include <linux/unistd.h> #include "compiler.h" #include "crt.h" @@ -143,8 +143,13 @@ void __attribute__((weak, noreturn)) __nolibc_entrypoint __no_stack_protector _start(void) { __asm__ volatile ( +#ifdef __s390x__ "lgr %r2, %r15\n" /* save stack pointer to %r2, as arg1 of _start_c */ "aghi %r15, -160\n" /* allocate new stackframe */ +#else + "lr %r2, %r15\n" + "ahi %r15, -96\n" +#endif "xc 0(8,%r15), 0(%r15)\n" /* clear backchain */ "brasl %r14, _start_c\n" /* transfer to c runtime */ ); diff --git a/tools/include/nolibc/arch.h b/tools/include/nolibc/arch.h index c8f4e5d3add9..8a2c143c0fba 100644 --- a/tools/include/nolibc/arch.h +++ b/tools/include/nolibc/arch.h @@ -29,7 +29,7 @@ #include "arch-powerpc.h" #elif defined(__riscv) #include "arch-riscv.h" -#elif defined(__s390x__) +#elif defined(__s390x__) || defined(__s390__) #include "arch-s390.h" #elif defined(__loongarch__) #include "arch-loongarch.h" diff --git a/tools/include/nolibc/crt.h b/tools/include/nolibc/crt.h index bbcd5fd09806..c4b10103bbec 100644 --- a/tools/include/nolibc/crt.h +++ b/tools/include/nolibc/crt.h @@ -10,6 +10,7 @@ char **environ __attribute__((weak)); const unsigned long *_auxv __attribute__((weak)); +void _start(void); static void __stack_chk_init(void); static void exit(int); @@ -22,6 +23,7 @@ extern void (*const __init_array_end[])(int, char **, char**) __attribute__((wea extern void (*const __fini_array_start[])(void) __attribute__((weak)); extern void (*const __fini_array_end[])(void) __attribute__((weak)); +void _start_c(long *sp); __attribute__((weak,used)) void _start_c(long *sp) { diff --git a/tools/include/nolibc/dirent.h b/tools/include/nolibc/dirent.h new file mode 100644 index 000000000000..c5c30d0dd680 --- /dev/null +++ b/tools/include/nolibc/dirent.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Directory access for NOLIBC + * Copyright (C) 2025 Thomas Weißschuh <linux@weissschuh.net> + */ + +#ifndef _NOLIBC_DIRENT_H +#define _NOLIBC_DIRENT_H + +#include "stdint.h" +#include "types.h" + +#include <linux/limits.h> + +struct dirent { + ino_t d_ino; + char d_name[NAME_MAX + 1]; +}; + +/* See comment of FILE in stdio.h */ +typedef struct { + char dummy[1]; +} DIR; + +static __attribute__((unused)) +DIR *fdopendir(int fd) +{ + if (fd < 0) { + SET_ERRNO(EBADF); + return NULL; + } + return (DIR *)(intptr_t)~fd; +} + +static __attribute__((unused)) +DIR *opendir(const char *name) +{ + int fd; + + fd = open(name, O_RDONLY); + if (fd == -1) + return NULL; + return fdopendir(fd); +} + +static __attribute__((unused)) +int closedir(DIR *dirp) +{ + intptr_t i = (intptr_t)dirp; + + if (i >= 0) { + SET_ERRNO(EBADF); + return -1; + } + return close(~i); +} + +static __attribute__((unused)) +int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result) +{ + char buf[sizeof(struct linux_dirent64) + NAME_MAX + 1]; + struct linux_dirent64 *ldir = (void *)buf; + intptr_t i = (intptr_t)dirp; + int fd, ret; + + if (i >= 0) + return EBADF; + + fd = ~i; + + ret = sys_getdents64(fd, ldir, sizeof(buf)); + if (ret < 0) + return -ret; + if (ret == 0) { + *result = NULL; + return 0; + } + + /* + * getdents64() returns as many entries as fit the buffer. + * readdir() can only return one entry at a time. + * Make sure the non-returned ones are not skipped. + */ + ret = lseek(fd, ldir->d_off, SEEK_SET); + if (ret == -1) + return errno; + + entry->d_ino = ldir->d_ino; + /* the destination should always be big enough */ + strlcpy(entry->d_name, ldir->d_name, sizeof(entry->d_name)); + *result = entry; + return 0; +} + +/* make sure to include all global symbols */ +#include "nolibc.h" + +#endif /* _NOLIBC_DIRENT_H */ diff --git a/tools/include/nolibc/errno.h b/tools/include/nolibc/errno.h index a44486ff0477..1d8d8033e8ff 100644 --- a/tools/include/nolibc/errno.h +++ b/tools/include/nolibc/errno.h @@ -7,7 +7,7 @@ #ifndef _NOLIBC_ERRNO_H #define _NOLIBC_ERRNO_H -#include <asm/errno.h> +#include <linux/errno.h> #ifndef NOLIBC_IGNORE_ERRNO #define SET_ERRNO(v) do { errno = (v); } while (0) diff --git a/tools/include/nolibc/limits.h b/tools/include/nolibc/limits.h new file mode 100644 index 000000000000..306d4141f4d2 --- /dev/null +++ b/tools/include/nolibc/limits.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Shim limits.h header for NOLIBC. + * Copyright (C) 2025 Thomas Weißschuh <thomas.weissschuh@linutronix.de> + */ + +#include "nolibc.h" diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 92436b1e4441..70872401aca8 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -31,8 +31,7 @@ * - The third level is the libc call definition. It exposes the lower raw * sys_<name>() calls in a way that looks like what a libc usually does, * takes care of specific input values, and of setting errno upon error. - * There can be minor variations compared to standard libc calls. For - * example the open() call always takes 3 args here. + * There can be minor variations compared to standard libc calls. * * The errno variable is declared static and unused. This way it can be * optimized away if not used. However this means that a program made of @@ -105,6 +104,7 @@ #include "string.h" #include "time.h" #include "stackprotector.h" +#include "dirent.h" /* Used by programs to avoid std includes */ #define NOLIBC diff --git a/tools/include/nolibc/signal.h b/tools/include/nolibc/signal.h index 137552216e46..cdcc5904c51e 100644 --- a/tools/include/nolibc/signal.h +++ b/tools/include/nolibc/signal.h @@ -13,6 +13,7 @@ #include "sys.h" /* This one is not marked static as it's needed by libgcc for divide by zero */ +int raise(int signal); __attribute__((weak,unused,section(".text.nolibc_raise"))) int raise(int signal) { diff --git a/tools/include/nolibc/stackprotector.h b/tools/include/nolibc/stackprotector.h index 1d0d5259ec41..c71a2c257177 100644 --- a/tools/include/nolibc/stackprotector.h +++ b/tools/include/nolibc/stackprotector.h @@ -18,6 +18,7 @@ * triggering stack protector errors themselves */ +void __stack_chk_fail(void); __attribute__((weak,used,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail(void) { @@ -28,6 +29,7 @@ void __stack_chk_fail(void) for (;;); } +void __stack_chk_fail_local(void); __attribute__((weak,noreturn,section(".text.nolibc_stack_chk"))) void __stack_chk_fail_local(void) { diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 3892034198dd..a403351dbf60 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -350,6 +350,104 @@ int printf(const char *fmt, ...) } static __attribute__((unused)) +int vsscanf(const char *str, const char *format, va_list args) +{ + uintmax_t uval; + intmax_t ival; + int base; + char *endptr; + int matches; + int lpref; + + matches = 0; + + while (1) { + if (*format == '%') { + /* start of pattern */ + lpref = 0; + format++; + + if (*format == 'l') { + /* same as in printf() */ + lpref = 1; + format++; + if (*format == 'l') { + lpref = 2; + format++; + } + } + + if (*format == '%') { + /* literal % */ + if ('%' != *str) + goto done; + str++; + format++; + continue; + } else if (*format == 'd') { + ival = strtoll(str, &endptr, 10); + if (lpref == 0) + *va_arg(args, int *) = ival; + else if (lpref == 1) + *va_arg(args, long *) = ival; + else if (lpref == 2) + *va_arg(args, long long *) = ival; + } else if (*format == 'u' || *format == 'x' || *format == 'X') { + base = *format == 'u' ? 10 : 16; + uval = strtoull(str, &endptr, base); + if (lpref == 0) + *va_arg(args, unsigned int *) = uval; + else if (lpref == 1) + *va_arg(args, unsigned long *) = uval; + else if (lpref == 2) + *va_arg(args, unsigned long long *) = uval; + } else if (*format == 'p') { + *va_arg(args, void **) = (void *)strtoul(str, &endptr, 16); + } else { + SET_ERRNO(EILSEQ); + goto done; + } + + format++; + str = endptr; + matches++; + + } else if (*format == '\0') { + goto done; + } else if (isspace(*format)) { + /* skip spaces in format and str */ + while (isspace(*format)) + format++; + while (isspace(*str)) + str++; + } else if (*format == *str) { + /* literal match */ + format++; + str++; + } else { + if (!matches) + matches = EOF; + goto done; + } + } + +done: + return matches; +} + +static __attribute__((unused, format(scanf, 2, 3))) +int sscanf(const char *str, const char *format, ...) +{ + va_list args; + int ret; + + va_start(args, format); + ret = vsscanf(str, format, args); + va_end(args); + return ret; +} + +static __attribute__((unused)) void perror(const char *msg) { fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno); diff --git a/tools/include/nolibc/stdlib.h b/tools/include/nolibc/stdlib.h index 75aa273c23a6..86ad378ab1ea 100644 --- a/tools/include/nolibc/stdlib.h +++ b/tools/include/nolibc/stdlib.h @@ -30,6 +30,7 @@ static __attribute__((unused)) char itoa_buffer[21]; */ /* must be exported, as it's used by libgcc for various divide functions */ +void abort(void); __attribute__((weak,unused,noreturn,section(".text.nolibc_abort"))) void abort(void) { diff --git a/tools/include/nolibc/string.h b/tools/include/nolibc/string.h index 9ec9c24f38c0..ba84ab700e30 100644 --- a/tools/include/nolibc/string.h +++ b/tools/include/nolibc/string.h @@ -32,6 +32,7 @@ int memcmp(const void *s1, const void *s2, size_t n) /* might be ignored by the compiler without -ffreestanding, then found as * missing. */ +void *memmove(void *dst, const void *src, size_t len); __attribute__((weak,unused,section(".text.nolibc_memmove"))) void *memmove(void *dst, const void *src, size_t len) { @@ -56,6 +57,7 @@ void *memmove(void *dst, const void *src, size_t len) #ifndef NOLIBC_ARCH_HAS_MEMCPY /* must be exported, as it's used by libgcc on ARM */ +void *memcpy(void *dst, const void *src, size_t len); __attribute__((weak,unused,section(".text.nolibc_memcpy"))) void *memcpy(void *dst, const void *src, size_t len) { @@ -73,6 +75,7 @@ void *memcpy(void *dst, const void *src, size_t len) /* might be ignored by the compiler without -ffreestanding, then found as * missing. */ +void *memset(void *dst, int b, size_t len); __attribute__((weak,unused,section(".text.nolibc_memset"))) void *memset(void *dst, int b, size_t len) { @@ -124,6 +127,7 @@ char *strcpy(char *dst, const char *src) * thus itself, hence the asm() statement below that's meant to disable this * confusing practice. */ +size_t strlen(const char *str); __attribute__((weak,unused,section(".text.nolibc_strlen"))) size_t strlen(const char *str) { diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index d4a5c2399a66..08c1c074bec8 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -10,10 +10,10 @@ #include "std.h" /* system includes */ -#include <asm/unistd.h> -#include <asm/signal.h> /* for SIGCHLD */ -#include <asm/ioctls.h> -#include <asm/mman.h> +#include <linux/unistd.h> +#include <linux/signal.h> /* for SIGCHLD */ +#include <linux/termios.h> +#include <linux/mman.h> #include <linux/fs.h> #include <linux/loop.h> #include <linux/time.h> @@ -23,7 +23,6 @@ #include <linux/prctl.h> #include <linux/resource.h> #include <linux/utsname.h> -#include <linux/signal.h> #include "arch.h" #include "errno.h" @@ -532,20 +531,16 @@ uid_t getuid(void) /* - * int ioctl(int fd, unsigned long req, void *value); + * int ioctl(int fd, unsigned long cmd, ... arg); */ static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) +long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { - return my_syscall3(__NR_ioctl, fd, req, value); + return my_syscall3(__NR_ioctl, fd, cmd, arg); } -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - return __sysret(sys_ioctl(fd, req, value)); -} +#define ioctl(fd, cmd, arg) __sysret(sys_ioctl(fd, cmd, (unsigned long)(arg))) /* * int kill(pid_t pid, int signal); @@ -602,9 +597,36 @@ off_t sys_lseek(int fd, off_t offset, int whence) } static __attribute__((unused)) +int sys_llseek(int fd, unsigned long offset_high, unsigned long offset_low, + __kernel_loff_t *result, int whence) +{ +#ifdef __NR_llseek + return my_syscall5(__NR_llseek, fd, offset_high, offset_low, result, whence); +#else + return __nolibc_enosys(__func__, fd, offset_high, offset_low, result, whence); +#endif +} + +static __attribute__((unused)) off_t lseek(int fd, off_t offset, int whence) { - return __sysret(sys_lseek(fd, offset, whence)); + __kernel_loff_t loff = 0; + off_t result; + int ret; + + result = sys_lseek(fd, offset, whence); + if (result == -ENOSYS) { + /* Only exists on 32bit where nolibc off_t is also 32bit */ + ret = sys_llseek(fd, 0, offset, &loff, whence); + if (ret < 0) + result = ret; + else if (loff != (off_t)loff) + result = -EOVERFLOW; + else + result = loff; + } + + return __sysret(result); } @@ -742,6 +764,31 @@ int mount(const char *src, const char *tgt, return __sysret(sys_mount(src, tgt, fst, flags, data)); } +/* + * int openat(int dirfd, const char *path, int flags[, mode_t mode]); + */ + +static __attribute__((unused)) +int sys_openat(int dirfd, const char *path, int flags, mode_t mode) +{ + return my_syscall4(__NR_openat, dirfd, path, flags, mode); +} + +static __attribute__((unused)) +int openat(int dirfd, const char *path, int flags, ...) +{ + mode_t mode = 0; + + if (flags & O_CREAT) { + va_list args; + + va_start(args, flags); + mode = va_arg(args, mode_t); + va_end(args); + } + + return __sysret(sys_openat(dirfd, path, flags, mode)); +} /* * int open(const char *path, int flags[, mode_t mode]); @@ -750,13 +797,7 @@ int mount(const char *src, const char *tgt, static __attribute__((unused)) int sys_open(const char *path, int flags, mode_t mode) { -#ifdef __NR_openat return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); -#elif defined(__NR_open) - return my_syscall3(__NR_open, path, flags, mode); -#else - return __nolibc_enosys(__func__, path, flags, mode); -#endif } static __attribute__((unused)) @@ -768,7 +809,7 @@ int open(const char *path, int flags, ...) va_list args; va_start(args, flags); - mode = va_arg(args, int); + mode = va_arg(args, mode_t); va_end(args); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2acf9b336371..defa5bb881f4 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6913,6 +6913,12 @@ enum { BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; +enum { + SK_BPF_CB_TX_TIMESTAMPING = 1<<0, + SK_BPF_CB_MASK = (SK_BPF_CB_TX_TIMESTAMPING - 1) | + SK_BPF_CB_TX_TIMESTAMPING +}; + /* List of known BPF sock_ops operators. * New entries can only be added at the end */ @@ -7025,6 +7031,29 @@ enum { * by the kernel or the * earlier bpf-progs. */ + BPF_SOCK_OPS_TSTAMP_SCHED_CB, /* Called when skb is passing + * through dev layer when + * SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SND_SW_CB, /* Called when skb is about to send + * to the nic when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SND_HW_CB, /* Called in hardware phase when + * SK_BPF_CB_TX_TIMESTAMPING feature + * is on. + */ + BPF_SOCK_OPS_TSTAMP_ACK_CB, /* Called when all the skbs in the + * same sendmsg call are acked + * when SK_BPF_CB_TX_TIMESTAMPING + * feature is on. + */ + BPF_SOCK_OPS_TSTAMP_SENDMSG_CB, /* Called when every sendmsg syscall + * is triggered. It's used to correlate + * sendmsg timestamp with corresponding + * tskey. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -7091,6 +7120,7 @@ enum { TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ + SK_BPF_CB_FLAGS = 1009, /* Get or set sock ops flags in socket */ }; enum { diff --git a/tools/include/uapi/linux/const.h b/tools/include/uapi/linux/const.h index e16be0d37746..b8f629ef135f 100644 --- a/tools/include/uapi/linux/const.h +++ b/tools/include/uapi/linux/const.h @@ -33,7 +33,7 @@ * Missing asm support * * __BIT128() would not work in the asm code, as it shifts an - * 'unsigned __init128' data type as direct representation of + * 'unsigned __int128' data type as direct representation of * 128 bit constants is not supported in the gcc compiler, as * they get silently truncated. * diff --git a/tools/include/uapi/linux/elf.h b/tools/include/uapi/linux/elf.h new file mode 100644 index 000000000000..5834b83d7f9a --- /dev/null +++ b/tools/include/uapi/linux/elf.h @@ -0,0 +1,524 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _LINUX_ELF_H +#define _LINUX_ELF_H + +#include <linux/types.h> +#include <linux/elf-em.h> + +/* 32-bit ELF base types. */ +typedef __u32 Elf32_Addr; +typedef __u16 Elf32_Half; +typedef __u32 Elf32_Off; +typedef __s32 Elf32_Sword; +typedef __u32 Elf32_Word; +typedef __u16 Elf32_Versym; + +/* 64-bit ELF base types. */ +typedef __u64 Elf64_Addr; +typedef __u16 Elf64_Half; +typedef __s16 Elf64_SHalf; +typedef __u64 Elf64_Off; +typedef __s32 Elf64_Sword; +typedef __u32 Elf64_Word; +typedef __u64 Elf64_Xword; +typedef __s64 Elf64_Sxword; +typedef __u16 Elf64_Versym; + +/* These constants are for the segment types stored in the image headers */ +#define PT_NULL 0 +#define PT_LOAD 1 +#define PT_DYNAMIC 2 +#define PT_INTERP 3 +#define PT_NOTE 4 +#define PT_SHLIB 5 +#define PT_PHDR 6 +#define PT_TLS 7 /* Thread local storage segment */ +#define PT_LOOS 0x60000000 /* OS-specific */ +#define PT_HIOS 0x6fffffff /* OS-specific */ +#define PT_LOPROC 0x70000000 +#define PT_HIPROC 0x7fffffff +#define PT_GNU_EH_FRAME (PT_LOOS + 0x474e550) +#define PT_GNU_STACK (PT_LOOS + 0x474e551) +#define PT_GNU_RELRO (PT_LOOS + 0x474e552) +#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553) + + +/* ARM MTE memory tag segment type */ +#define PT_AARCH64_MEMTAG_MTE (PT_LOPROC + 0x2) + +/* + * Extended Numbering + * + * If the real number of program header table entries is larger than + * or equal to PN_XNUM(0xffff), it is set to sh_info field of the + * section header at index 0, and PN_XNUM is set to e_phnum + * field. Otherwise, the section header at index 0 is zero + * initialized, if it exists. + * + * Specifications are available in: + * + * - Oracle: Linker and Libraries. + * Part No: 817–1984–19, August 2011. + * https://docs.oracle.com/cd/E18752_01/pdf/817-1984.pdf + * + * - System V ABI AMD64 Architecture Processor Supplement + * Draft Version 0.99.4, + * January 13, 2010. + * http://www.cs.washington.edu/education/courses/cse351/12wi/supp-docs/abi.pdf + */ +#define PN_XNUM 0xffff + +/* These constants define the different elf file types */ +#define ET_NONE 0 +#define ET_REL 1 +#define ET_EXEC 2 +#define ET_DYN 3 +#define ET_CORE 4 +#define ET_LOPROC 0xff00 +#define ET_HIPROC 0xffff + +/* This is the info that is needed to parse the dynamic section of the file */ +#define DT_NULL 0 +#define DT_NEEDED 1 +#define DT_PLTRELSZ 2 +#define DT_PLTGOT 3 +#define DT_HASH 4 +#define DT_STRTAB 5 +#define DT_SYMTAB 6 +#define DT_RELA 7 +#define DT_RELASZ 8 +#define DT_RELAENT 9 +#define DT_STRSZ 10 +#define DT_SYMENT 11 +#define DT_INIT 12 +#define DT_FINI 13 +#define DT_SONAME 14 +#define DT_RPATH 15 +#define DT_SYMBOLIC 16 +#define DT_REL 17 +#define DT_RELSZ 18 +#define DT_RELENT 19 +#define DT_PLTREL 20 +#define DT_DEBUG 21 +#define DT_TEXTREL 22 +#define DT_JMPREL 23 +#define DT_ENCODING 32 +#define OLD_DT_LOOS 0x60000000 +#define DT_LOOS 0x6000000d +#define DT_HIOS 0x6ffff000 +#define DT_VALRNGLO 0x6ffffd00 +#define DT_VALRNGHI 0x6ffffdff +#define DT_ADDRRNGLO 0x6ffffe00 +#define DT_GNU_HASH 0x6ffffef5 +#define DT_ADDRRNGHI 0x6ffffeff +#define DT_VERSYM 0x6ffffff0 +#define DT_RELACOUNT 0x6ffffff9 +#define DT_RELCOUNT 0x6ffffffa +#define DT_FLAGS_1 0x6ffffffb +#define DT_VERDEF 0x6ffffffc +#define DT_VERDEFNUM 0x6ffffffd +#define DT_VERNEED 0x6ffffffe +#define DT_VERNEEDNUM 0x6fffffff +#define OLD_DT_HIOS 0x6fffffff +#define DT_LOPROC 0x70000000 +#define DT_HIPROC 0x7fffffff + +/* This info is needed when parsing the symbol table */ +#define STB_LOCAL 0 +#define STB_GLOBAL 1 +#define STB_WEAK 2 + +#define STN_UNDEF 0 + +#define STT_NOTYPE 0 +#define STT_OBJECT 1 +#define STT_FUNC 2 +#define STT_SECTION 3 +#define STT_FILE 4 +#define STT_COMMON 5 +#define STT_TLS 6 + +#define VER_FLG_BASE 0x1 +#define VER_FLG_WEAK 0x2 + +#define ELF_ST_BIND(x) ((x) >> 4) +#define ELF_ST_TYPE(x) ((x) & 0xf) +#define ELF32_ST_BIND(x) ELF_ST_BIND(x) +#define ELF32_ST_TYPE(x) ELF_ST_TYPE(x) +#define ELF64_ST_BIND(x) ELF_ST_BIND(x) +#define ELF64_ST_TYPE(x) ELF_ST_TYPE(x) + +typedef struct { + Elf32_Sword d_tag; + union { + Elf32_Sword d_val; + Elf32_Addr d_ptr; + } d_un; +} Elf32_Dyn; + +typedef struct { + Elf64_Sxword d_tag; /* entry tag value */ + union { + Elf64_Xword d_val; + Elf64_Addr d_ptr; + } d_un; +} Elf64_Dyn; + +/* The following are used with relocations */ +#define ELF32_R_SYM(x) ((x) >> 8) +#define ELF32_R_TYPE(x) ((x) & 0xff) + +#define ELF64_R_SYM(i) ((i) >> 32) +#define ELF64_R_TYPE(i) ((i) & 0xffffffff) + +typedef struct elf32_rel { + Elf32_Addr r_offset; + Elf32_Word r_info; +} Elf32_Rel; + +typedef struct elf64_rel { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ +} Elf64_Rel; + +typedef struct elf32_rela { + Elf32_Addr r_offset; + Elf32_Word r_info; + Elf32_Sword r_addend; +} Elf32_Rela; + +typedef struct elf64_rela { + Elf64_Addr r_offset; /* Location at which to apply the action */ + Elf64_Xword r_info; /* index and type of relocation */ + Elf64_Sxword r_addend; /* Constant addend used to compute value */ +} Elf64_Rela; + +typedef struct elf32_sym { + Elf32_Word st_name; + Elf32_Addr st_value; + Elf32_Word st_size; + unsigned char st_info; + unsigned char st_other; + Elf32_Half st_shndx; +} Elf32_Sym; + +typedef struct elf64_sym { + Elf64_Word st_name; /* Symbol name, index in string tbl */ + unsigned char st_info; /* Type and binding attributes */ + unsigned char st_other; /* No defined meaning, 0 */ + Elf64_Half st_shndx; /* Associated section index */ + Elf64_Addr st_value; /* Value of the symbol */ + Elf64_Xword st_size; /* Associated symbol size */ +} Elf64_Sym; + + +#define EI_NIDENT 16 + +typedef struct elf32_hdr { + unsigned char e_ident[EI_NIDENT]; + Elf32_Half e_type; + Elf32_Half e_machine; + Elf32_Word e_version; + Elf32_Addr e_entry; /* Entry point */ + Elf32_Off e_phoff; + Elf32_Off e_shoff; + Elf32_Word e_flags; + Elf32_Half e_ehsize; + Elf32_Half e_phentsize; + Elf32_Half e_phnum; + Elf32_Half e_shentsize; + Elf32_Half e_shnum; + Elf32_Half e_shstrndx; +} Elf32_Ehdr; + +typedef struct elf64_hdr { + unsigned char e_ident[EI_NIDENT]; /* ELF "magic number" */ + Elf64_Half e_type; + Elf64_Half e_machine; + Elf64_Word e_version; + Elf64_Addr e_entry; /* Entry point virtual address */ + Elf64_Off e_phoff; /* Program header table file offset */ + Elf64_Off e_shoff; /* Section header table file offset */ + Elf64_Word e_flags; + Elf64_Half e_ehsize; + Elf64_Half e_phentsize; + Elf64_Half e_phnum; + Elf64_Half e_shentsize; + Elf64_Half e_shnum; + Elf64_Half e_shstrndx; +} Elf64_Ehdr; + +/* These constants define the permissions on sections in the program + header, p_flags. */ +#define PF_R 0x4 +#define PF_W 0x2 +#define PF_X 0x1 + +typedef struct elf32_phdr { + Elf32_Word p_type; + Elf32_Off p_offset; + Elf32_Addr p_vaddr; + Elf32_Addr p_paddr; + Elf32_Word p_filesz; + Elf32_Word p_memsz; + Elf32_Word p_flags; + Elf32_Word p_align; +} Elf32_Phdr; + +typedef struct elf64_phdr { + Elf64_Word p_type; + Elf64_Word p_flags; + Elf64_Off p_offset; /* Segment file offset */ + Elf64_Addr p_vaddr; /* Segment virtual address */ + Elf64_Addr p_paddr; /* Segment physical address */ + Elf64_Xword p_filesz; /* Segment size in file */ + Elf64_Xword p_memsz; /* Segment size in memory */ + Elf64_Xword p_align; /* Segment alignment, file & memory */ +} Elf64_Phdr; + +/* sh_type */ +#define SHT_NULL 0 +#define SHT_PROGBITS 1 +#define SHT_SYMTAB 2 +#define SHT_STRTAB 3 +#define SHT_RELA 4 +#define SHT_HASH 5 +#define SHT_DYNAMIC 6 +#define SHT_NOTE 7 +#define SHT_NOBITS 8 +#define SHT_REL 9 +#define SHT_SHLIB 10 +#define SHT_DYNSYM 11 +#define SHT_NUM 12 +#define SHT_LOPROC 0x70000000 +#define SHT_HIPROC 0x7fffffff +#define SHT_LOUSER 0x80000000 +#define SHT_HIUSER 0xffffffff + +/* sh_flags */ +#define SHF_WRITE 0x1 +#define SHF_ALLOC 0x2 +#define SHF_EXECINSTR 0x4 +#define SHF_RELA_LIVEPATCH 0x00100000 +#define SHF_RO_AFTER_INIT 0x00200000 +#define SHF_MASKPROC 0xf0000000 + +/* special section indexes */ +#define SHN_UNDEF 0 +#define SHN_LORESERVE 0xff00 +#define SHN_LOPROC 0xff00 +#define SHN_HIPROC 0xff1f +#define SHN_LIVEPATCH 0xff20 +#define SHN_ABS 0xfff1 +#define SHN_COMMON 0xfff2 +#define SHN_HIRESERVE 0xffff + +typedef struct elf32_shdr { + Elf32_Word sh_name; + Elf32_Word sh_type; + Elf32_Word sh_flags; + Elf32_Addr sh_addr; + Elf32_Off sh_offset; + Elf32_Word sh_size; + Elf32_Word sh_link; + Elf32_Word sh_info; + Elf32_Word sh_addralign; + Elf32_Word sh_entsize; +} Elf32_Shdr; + +typedef struct elf64_shdr { + Elf64_Word sh_name; /* Section name, index in string tbl */ + Elf64_Word sh_type; /* Type of section */ + Elf64_Xword sh_flags; /* Miscellaneous section attributes */ + Elf64_Addr sh_addr; /* Section virtual addr at execution */ + Elf64_Off sh_offset; /* Section file offset */ + Elf64_Xword sh_size; /* Size of section in bytes */ + Elf64_Word sh_link; /* Index of another section */ + Elf64_Word sh_info; /* Additional section information */ + Elf64_Xword sh_addralign; /* Section alignment */ + Elf64_Xword sh_entsize; /* Entry size if section holds table */ +} Elf64_Shdr; + +#define EI_MAG0 0 /* e_ident[] indexes */ +#define EI_MAG1 1 +#define EI_MAG2 2 +#define EI_MAG3 3 +#define EI_CLASS 4 +#define EI_DATA 5 +#define EI_VERSION 6 +#define EI_OSABI 7 +#define EI_PAD 8 + +#define ELFMAG0 0x7f /* EI_MAG */ +#define ELFMAG1 'E' +#define ELFMAG2 'L' +#define ELFMAG3 'F' +#define ELFMAG "\177ELF" +#define SELFMAG 4 + +#define ELFCLASSNONE 0 /* EI_CLASS */ +#define ELFCLASS32 1 +#define ELFCLASS64 2 +#define ELFCLASSNUM 3 + +#define ELFDATANONE 0 /* e_ident[EI_DATA] */ +#define ELFDATA2LSB 1 +#define ELFDATA2MSB 2 + +#define EV_NONE 0 /* e_version, EI_VERSION */ +#define EV_CURRENT 1 +#define EV_NUM 2 + +#define ELFOSABI_NONE 0 +#define ELFOSABI_LINUX 3 + +#ifndef ELF_OSABI +#define ELF_OSABI ELFOSABI_NONE +#endif + +/* + * Notes used in ET_CORE. Architectures export some of the arch register sets + * using the corresponding note types via the PTRACE_GETREGSET and + * PTRACE_SETREGSET requests. + * The note name for these types is "LINUX", except NT_PRFPREG that is named + * "CORE". + */ +#define NT_PRSTATUS 1 +#define NT_PRFPREG 2 +#define NT_PRPSINFO 3 +#define NT_TASKSTRUCT 4 +#define NT_AUXV 6 +/* + * Note to userspace developers: size of NT_SIGINFO note may increase + * in the future to accomodate more fields, don't assume it is fixed! + */ +#define NT_SIGINFO 0x53494749 +#define NT_FILE 0x46494c45 +#define NT_PRXFPREG 0x46e62b7f /* copied from gdb5.1/include/elf/common.h */ +#define NT_PPC_VMX 0x100 /* PowerPC Altivec/VMX registers */ +#define NT_PPC_SPE 0x101 /* PowerPC SPE/EVR registers */ +#define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ +#define NT_PPC_TAR 0x103 /* Target Address Register */ +#define NT_PPC_PPR 0x104 /* Program Priority Register */ +#define NT_PPC_DSCR 0x105 /* Data Stream Control Register */ +#define NT_PPC_EBB 0x106 /* Event Based Branch Registers */ +#define NT_PPC_PMU 0x107 /* Performance Monitor Registers */ +#define NT_PPC_TM_CGPR 0x108 /* TM checkpointed GPR Registers */ +#define NT_PPC_TM_CFPR 0x109 /* TM checkpointed FPR Registers */ +#define NT_PPC_TM_CVMX 0x10a /* TM checkpointed VMX Registers */ +#define NT_PPC_TM_CVSX 0x10b /* TM checkpointed VSX Registers */ +#define NT_PPC_TM_SPR 0x10c /* TM Special Purpose Registers */ +#define NT_PPC_TM_CTAR 0x10d /* TM checkpointed Target Address Register */ +#define NT_PPC_TM_CPPR 0x10e /* TM checkpointed Program Priority Register */ +#define NT_PPC_TM_CDSCR 0x10f /* TM checkpointed Data Stream Control Register */ +#define NT_PPC_PKEY 0x110 /* Memory Protection Keys registers */ +#define NT_PPC_DEXCR 0x111 /* PowerPC DEXCR registers */ +#define NT_PPC_HASHKEYR 0x112 /* PowerPC HASHKEYR register */ +#define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ +#define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ +/* Old binutils treats 0x203 as a CET state */ +#define NT_X86_SHSTK 0x204 /* x86 SHSTK state */ +#define NT_X86_XSAVE_LAYOUT 0x205 /* XSAVE layout description */ +#define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ +#define NT_S390_TIMER 0x301 /* s390 timer register */ +#define NT_S390_TODCMP 0x302 /* s390 TOD clock comparator register */ +#define NT_S390_TODPREG 0x303 /* s390 TOD programmable register */ +#define NT_S390_CTRS 0x304 /* s390 control registers */ +#define NT_S390_PREFIX 0x305 /* s390 prefix register */ +#define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ +#define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ +#define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ +#define NT_S390_RI_CB 0x30d /* s390 runtime instrumentation */ +#define NT_S390_PV_CPU_DATA 0x30e /* s390 protvirt cpu dump data */ +#define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ +#define NT_ARM_TLS 0x401 /* ARM TLS register */ +#define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ +#define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ +#define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ +#define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ +#define NT_ARM_TAGGED_ADDR_CTRL 0x409 /* arm64 tagged address control (prctl()) */ +#define NT_ARM_PAC_ENABLED_KEYS 0x40a /* arm64 ptr auth enabled keys (prctl()) */ +#define NT_ARM_SSVE 0x40b /* ARM Streaming SVE registers */ +#define NT_ARM_ZA 0x40c /* ARM SME ZA registers */ +#define NT_ARM_ZT 0x40d /* ARM SME ZT registers */ +#define NT_ARM_FPMR 0x40e /* ARM floating point mode register */ +#define NT_ARM_POE 0x40f /* ARM POE registers */ +#define NT_ARM_GCS 0x410 /* ARM GCS state */ +#define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ +#define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ +#define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ +#define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ +#define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ +#define NT_RISCV_CSR 0x900 /* RISC-V Control and Status Registers */ +#define NT_RISCV_VECTOR 0x901 /* RISC-V vector registers */ +#define NT_RISCV_TAGGED_ADDR_CTRL 0x902 /* RISC-V tagged address control (prctl()) */ +#define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ +#define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ +#define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ +#define NT_LOONGARCH_LASX 0xa03 /* LoongArch Loongson Advanced SIMD Extension registers */ +#define NT_LOONGARCH_LBT 0xa04 /* LoongArch Loongson Binary Translation registers */ +#define NT_LOONGARCH_HW_BREAK 0xa05 /* LoongArch hardware breakpoint registers */ +#define NT_LOONGARCH_HW_WATCH 0xa06 /* LoongArch hardware watchpoint registers */ + +/* Note types with note name "GNU" */ +#define NT_GNU_PROPERTY_TYPE_0 5 + +/* Note header in a PT_NOTE section */ +typedef struct elf32_note { + Elf32_Word n_namesz; /* Name size */ + Elf32_Word n_descsz; /* Content size */ + Elf32_Word n_type; /* Content type */ +} Elf32_Nhdr; + +/* Note header in a PT_NOTE section */ +typedef struct elf64_note { + Elf64_Word n_namesz; /* Name size */ + Elf64_Word n_descsz; /* Content size */ + Elf64_Word n_type; /* Content type */ +} Elf64_Nhdr; + +/* .note.gnu.property types for EM_AARCH64: */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 + +/* Bits for GNU_PROPERTY_AARCH64_FEATURE_1_BTI */ +#define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) + +typedef struct { + Elf32_Half vd_version; + Elf32_Half vd_flags; + Elf32_Half vd_ndx; + Elf32_Half vd_cnt; + Elf32_Word vd_hash; + Elf32_Word vd_aux; + Elf32_Word vd_next; +} Elf32_Verdef; + +typedef struct { + Elf64_Half vd_version; + Elf64_Half vd_flags; + Elf64_Half vd_ndx; + Elf64_Half vd_cnt; + Elf64_Word vd_hash; + Elf64_Word vd_aux; + Elf64_Word vd_next; +} Elf64_Verdef; + +typedef struct { + Elf32_Word vda_name; + Elf32_Word vda_next; +} Elf32_Verdaux; + +typedef struct { + Elf64_Word vda_name; + Elf64_Word vda_next; +} Elf64_Verdaux; + +#endif /* _LINUX_ELF_H */ diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 42ec5ddaab8d..42869770776e 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -127,6 +127,12 @@ struct xdp_options { */ #define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) +/* Request launch time hardware offload. The device will schedule the packet for + * transmission at a pre-determined time called launch time. The value of + * launch time is communicated via launch_time field of struct xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_LAUNCH_TIME (1 << 2) + /* AF_XDP offloads request. 'request' union member is consumed by the driver * when the packet is being transmitted. 'completion' union member is * filled by the driver when the transmit completion arrives. @@ -142,6 +148,10 @@ struct xsk_tx_metadata { __u16 csum_start; /* Offset from csum_start where checksum should be stored. */ __u16 csum_offset; + + /* XDP_TXMD_FLAGS_LAUNCH_TIME */ + /* Launch time in nanosecond against the PTP HW Clock */ + __u64 launch_time; } request; struct { diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index e4be227d3ad6..7600bf62dbdf 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -59,10 +59,13 @@ enum netdev_xdp_rx_metadata { * by the driver. * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the * driver. + * @NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO: Launch time HW offload is supported + * by the driver. */ enum netdev_xsk_flags { NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, + NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO = 4, }; enum netdev_queue_type { @@ -87,6 +90,11 @@ enum { }; enum { + __NETDEV_A_IO_URING_PROVIDER_INFO_MAX, + NETDEV_A_IO_URING_PROVIDER_INFO_MAX = (__NETDEV_A_IO_URING_PROVIDER_INFO_MAX - 1) +}; + +enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, NETDEV_A_PAGE_POOL_NAPI_ID, @@ -94,6 +102,7 @@ enum { NETDEV_A_PAGE_POOL_INFLIGHT_MEM, NETDEV_A_PAGE_POOL_DETACH_TIME, NETDEV_A_PAGE_POOL_DMABUF, + NETDEV_A_PAGE_POOL_IO_URING, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) @@ -131,11 +140,18 @@ enum { }; enum { + __NETDEV_A_XSK_INFO_MAX, + NETDEV_A_XSK_INFO_MAX = (__NETDEV_A_XSK_INFO_MAX - 1) +}; + +enum { NETDEV_A_QUEUE_ID = 1, NETDEV_A_QUEUE_IFINDEX, NETDEV_A_QUEUE_TYPE, NETDEV_A_QUEUE_NAPI_ID, NETDEV_A_QUEUE_DMABUF, + NETDEV_A_QUEUE_IO_URING, + NETDEV_A_QUEUE_XSK, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt index 6f3d16dbf467..7ead94bffa4e 100644 --- a/tools/memory-model/Documentation/glossary.txt +++ b/tools/memory-model/Documentation/glossary.txt @@ -15,14 +15,14 @@ Address Dependency: When the address of a later memory access is computed 3 do_something(p->a); 4 rcu_read_unlock(); - In this case, because the address of "p->a" on line 3 is computed - from the value returned by the rcu_dereference() on line 2, the - address dependency extends from that rcu_dereference() to that - "p->a". In rare cases, optimizing compilers can destroy address - dependencies. Please see Documentation/RCU/rcu_dereference.rst - for more information. + In this case, because the address of "p->a" on line 3 is computed + from the value returned by the rcu_dereference() on line 2, the + address dependency extends from that rcu_dereference() to that + "p->a". In rare cases, optimizing compilers can destroy address + dependencies. Please see Documentation/RCU/rcu_dereference.rst + for more information. - See also "Control Dependency" and "Data Dependency". + See also "Control Dependency" and "Data Dependency". Acquire: With respect to a lock, acquiring that lock, for example, using spin_lock(). With respect to a non-lock shared variable, @@ -59,12 +59,12 @@ Control Dependency: When a later store's execution depends on a test 1 if (READ_ONCE(x)) 2 WRITE_ONCE(y, 1); - Here, the control dependency extends from the READ_ONCE() on - line 1 to the WRITE_ONCE() on line 2. Control dependencies are - fragile, and can be easily destroyed by optimizing compilers. - Please see control-dependencies.txt for more information. + Here, the control dependency extends from the READ_ONCE() on + line 1 to the WRITE_ONCE() on line 2. Control dependencies are + fragile, and can be easily destroyed by optimizing compilers. + Please see control-dependencies.txt for more information. - See also "Address Dependency" and "Data Dependency". + See also "Address Dependency" and "Data Dependency". Cycle: Memory-barrier pairing is restricted to a pair of CPUs, as the name suggests. And in a great many cases, a pair of CPUs is all @@ -72,10 +72,10 @@ Cycle: Memory-barrier pairing is restricted to a pair of CPUs, as the extended to additional CPUs, and the result is called a "cycle". In a cycle, each CPU's ordering interacts with that of the next: - CPU 0 CPU 1 CPU 2 - WRITE_ONCE(x, 1); WRITE_ONCE(y, 1); WRITE_ONCE(z, 1); - smp_mb(); smp_mb(); smp_mb(); - r0 = READ_ONCE(y); r1 = READ_ONCE(z); r2 = READ_ONCE(x); + CPU 0 CPU 1 CPU 2 + WRITE_ONCE(x, 1); WRITE_ONCE(y, 1); WRITE_ONCE(z, 1); + smp_mb(); smp_mb(); smp_mb(); + r0 = READ_ONCE(y); r1 = READ_ONCE(z); r2 = READ_ONCE(x); CPU 0's smp_mb() interacts with that of CPU 1, which interacts with that of CPU 2, which in turn interacts with that of CPU 0 diff --git a/tools/memory-model/Documentation/herd-representation.txt b/tools/memory-model/Documentation/herd-representation.txt index ed988906f2b7..4e19b4f2a476 100644 --- a/tools/memory-model/Documentation/herd-representation.txt +++ b/tools/memory-model/Documentation/herd-representation.txt @@ -18,6 +18,11 @@ # # By convention, a blank line in a cell means "same as the preceding line". # +# Note that the syntactic representation does not always match the sets and +# relations in linux-kernel.cat, due to redefinitions in linux-kernel.bell and +# lock.cat. For example, the po link between LKR and LKW is upgraded to an rmw +# link, and W[ACQUIRE] are not included in the Acquire set. +# # Disclaimer. The table includes representations of "add" and "and" operations; # corresponding/identical representations of "sub", "inc", "dec" and "or", "xor", # "andnot" operations are omitted. @@ -27,16 +32,16 @@ ------------------------------------------------------------------------------ | Non-RMW ops | | ------------------------------------------------------------------------------ - | READ_ONCE | R[once] | + | READ_ONCE | R[ONCE] | | atomic_read | | - | WRITE_ONCE | W[once] | + | WRITE_ONCE | W[ONCE] | | atomic_set | | - | smp_load_acquire | R[acquire] | + | smp_load_acquire | R[ACQUIRE] | | atomic_read_acquire | | - | smp_store_release | W[release] | + | smp_store_release | W[RELEASE] | | atomic_set_release | | - | smp_store_mb | W[once] ->po F[mb] | - | smp_mb | F[mb] | + | smp_store_mb | W[ONCE] ->po F[MB] | + | smp_mb | F[MB] | | smp_rmb | F[rmb] | | smp_wmb | F[wmb] | | smp_mb__before_atomic | F[before-atomic] | @@ -49,8 +54,8 @@ | rcu_read_lock | F[rcu-lock] | | rcu_read_unlock | F[rcu-unlock] | | synchronize_rcu | F[sync-rcu] | - | rcu_dereference | R[once] | - | rcu_assign_pointer | W[release] | + | rcu_dereference | R[ONCE] | + | rcu_assign_pointer | W[RELEASE] | | srcu_read_lock | R[srcu-lock] | | srcu_down_read | | | srcu_read_unlock | W[srcu-unlock] | @@ -60,32 +65,31 @@ ------------------------------------------------------------------------------ | RMW ops w/o return value | | ------------------------------------------------------------------------------ - | atomic_add | R*[noreturn] ->rmw W*[once] | + | atomic_add | R*[NORETURN] ->rmw W*[NORETURN] | | atomic_and | | | spin_lock | LKR ->po LKW | ------------------------------------------------------------------------------ | RMW ops w/ return value | | ------------------------------------------------------------------------------ - | atomic_add_return | F[mb] ->po R*[once] | - | | ->rmw W*[once] ->po F[mb] | + | atomic_add_return | R*[MB] ->rmw W*[MB] | | atomic_fetch_add | | | atomic_fetch_and | | | atomic_xchg | | | xchg | | | atomic_add_negative | | - | atomic_add_return_relaxed | R*[once] ->rmw W*[once] | + | atomic_add_return_relaxed | R*[ONCE] ->rmw W*[ONCE] | | atomic_fetch_add_relaxed | | | atomic_fetch_and_relaxed | | | atomic_xchg_relaxed | | | xchg_relaxed | | | atomic_add_negative_relaxed | | - | atomic_add_return_acquire | R*[acquire] ->rmw W*[once] | + | atomic_add_return_acquire | R*[ACQUIRE] ->rmw W*[ACQUIRE] | | atomic_fetch_add_acquire | | | atomic_fetch_and_acquire | | | atomic_xchg_acquire | | | xchg_acquire | | | atomic_add_negative_acquire | | - | atomic_add_return_release | R*[once] ->rmw W*[release] | + | atomic_add_return_release | R*[RELEASE] ->rmw W*[RELEASE] | | atomic_fetch_add_release | | | atomic_fetch_and_release | | | atomic_xchg_release | | @@ -94,17 +98,16 @@ ------------------------------------------------------------------------------ | Conditional RMW ops | | ------------------------------------------------------------------------------ - | atomic_cmpxchg | On success: F[mb] ->po R*[once] | - | | ->rmw W*[once] ->po F[mb] | - | | On failure: R*[once] | + | atomic_cmpxchg | On success: R*[MB] ->rmw W*[MB] | + | | On failure: R*[MB] | | cmpxchg | | | atomic_add_unless | | - | atomic_cmpxchg_relaxed | On success: R*[once] ->rmw W*[once] | - | | On failure: R*[once] | - | atomic_cmpxchg_acquire | On success: R*[acquire] ->rmw W*[once] | - | | On failure: R*[once] | - | atomic_cmpxchg_release | On success: R*[once] ->rmw W*[release] | - | | On failure: R*[once] | + | atomic_cmpxchg_relaxed | On success: R*[ONCE] ->rmw W*[ONCE] | + | | On failure: R*[ONCE] | + | atomic_cmpxchg_acquire | On success: R*[ACQUIRE] ->rmw W*[ACQUIRE] | + | | On failure: R*[ACQUIRE] | + | atomic_cmpxchg_release | On success: R*[RELEASE] ->rmw W*[RELEASE] | + | | On failure: R*[RELEASE] | | spin_trylock | On success: LKR ->po LKW | | | On failure: LF | ------------------------------------------------------------------------------ diff --git a/tools/memory-model/README b/tools/memory-model/README index dab38904206a..64c860863aa9 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -20,7 +20,7 @@ that litmus test to be exercised within the Linux kernel. REQUIREMENTS ============ -Version 7.52 or higher of the "herd7" and "klitmus7" tools must be +Version 7.58 or higher of the "herd7" and "klitmus7" tools must be downloaded separately: https://github.com/herd/herdtools7 @@ -79,7 +79,7 @@ Several thousand more example litmus tests are available here: https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/herd https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/perfbook.git/tree/CodeSamples/formal/litmus -Documentation describing litmus tests and now to use them may be found +Documentation describing litmus tests and how to use them may be found here: tools/memory-model/Documentation/litmus-tests.txt diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index ce068700939c..fe65998002b9 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -13,17 +13,18 @@ "Linux-kernel memory consistency model" -enum Accesses = 'once (*READ_ONCE,WRITE_ONCE*) || - 'release (*smp_store_release*) || - 'acquire (*smp_load_acquire*) || - 'noreturn (* R of non-return RMW *) -instructions R[{'once,'acquire,'noreturn}] -instructions W[{'once,'release}] -instructions RMW[{'once,'acquire,'release}] +enum Accesses = 'ONCE (*READ_ONCE,WRITE_ONCE*) || + 'RELEASE (*smp_store_release*) || + 'ACQUIRE (*smp_load_acquire*) || + 'NORETURN (* R of non-return RMW *) || + 'MB (*xchg(),cmpxchg(),...*) +instructions R[Accesses] +instructions W[Accesses] +instructions RMW[Accesses] enum Barriers = 'wmb (*smp_wmb*) || 'rmb (*smp_rmb*) || - 'mb (*smp_mb*) || + 'MB (*smp_mb*) || 'barrier (*barrier*) || 'rcu-lock (*rcu_read_lock*) || 'rcu-unlock (*rcu_read_unlock*) || @@ -35,6 +36,17 @@ enum Barriers = 'wmb (*smp_wmb*) || 'after-srcu-read-unlock (*smp_mb__after_srcu_read_unlock*) instructions F[Barriers] + +(* + * Filter out syntactic annotations that do not provide the corresponding + * semantic ordering, such as Acquire on a store or Mb on a failed RMW. + *) +let FailedRMW = RMW \ (domain(rmw) | range(rmw)) +let Acquire = ACQUIRE \ W \ FailedRMW +let Release = RELEASE \ R \ FailedRMW +let Mb = MB \ FailedRMW +let Noreturn = NORETURN \ W + (* SRCU *) enum SRCU = 'srcu-lock || 'srcu-unlock || 'sync-srcu instructions SRCU[SRCU] @@ -73,7 +85,7 @@ flag ~empty rcu-rscs & (po ; [Sync-srcu] ; po) as invalid-sleep flag ~empty different-values(srcu-rscs) as srcu-bad-value-match (* Compute marked and plain memory accesses *) -let Marked = (~M) | IW | Once | Release | Acquire | domain(rmw) | range(rmw) | +let Marked = (~M) | IW | ONCE | RELEASE | ACQUIRE | MB | RMW | LKR | LKW | UL | LF | RL | RU | Srcu-lock | Srcu-unlock let Plain = M \ Marked @@ -82,3 +94,6 @@ let carry-dep = (data ; [~ Srcu-unlock] ; rfi)* let addr = carry-dep ; addr let ctrl = carry-dep ; ctrl let data = carry-dep ; data + +flag ~empty (if "lkmmv2" then 0 else _) + as this-model-requires-variant-higher-than-lkmmv1 diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index adf3c4f41229..d7e7bf13c831 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -34,6 +34,16 @@ let R4rmb = R \ Noreturn (* Reads for which rmb works *) let rmb = [R4rmb] ; fencerel(Rmb) ; [R4rmb] let wmb = [W] ; fencerel(Wmb) ; [W] let mb = ([M] ; fencerel(Mb) ; [M]) | + (* + * full-barrier RMWs (successful cmpxchg(), xchg(), etc.) act as + * though there were enclosed by smp_mb(). + * The effect of these virtual smp_mb() is formalized by adding + * Mb tags to the read and write of the operation, and providing + * the same ordering as though there were additional po edges + * between the Mb tag and the read resp. write. + *) + ([M] ; po ; [Mb & R]) | + ([Mb & W] ; po ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | diff --git a/tools/memory-model/linux-kernel.cfg b/tools/memory-model/linux-kernel.cfg index 3c8098e99f41..69b04f3aad73 100644 --- a/tools/memory-model/linux-kernel.cfg +++ b/tools/memory-model/linux-kernel.cfg @@ -1,6 +1,7 @@ macros linux-kernel.def bell linux-kernel.bell model linux-kernel.cat +variant lkmmv2 graph columns squished true showevents noregs diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def index 88a39601f525..49e402782e49 100644 --- a/tools/memory-model/linux-kernel.def +++ b/tools/memory-model/linux-kernel.def @@ -6,18 +6,18 @@ // which appeared in ASPLOS 2018. // ONCE -READ_ONCE(X) __load{once}(X) -WRITE_ONCE(X,V) { __store{once}(X,V); } +READ_ONCE(X) __load{ONCE}(X) +WRITE_ONCE(X,V) { __store{ONCE}(X,V); } // Release Acquire and friends -smp_store_release(X,V) { __store{release}(*X,V); } -smp_load_acquire(X) __load{acquire}(*X) -rcu_assign_pointer(X,V) { __store{release}(X,V); } -rcu_dereference(X) __load{once}(X) -smp_store_mb(X,V) { __store{once}(X,V); __fence{mb}; } +smp_store_release(X,V) { __store{RELEASE}(*X,V); } +smp_load_acquire(X) __load{ACQUIRE}(*X) +rcu_assign_pointer(X,V) { __store{RELEASE}(X,V); } +rcu_dereference(X) __load{ONCE}(X) +smp_store_mb(X,V) { __store{ONCE}(X,V); __fence{MB}; } // Fences -smp_mb() { __fence{mb}; } +smp_mb() { __fence{MB}; } smp_rmb() { __fence{rmb}; } smp_wmb() { __fence{wmb}; } smp_mb__before_atomic() { __fence{before-atomic}; } @@ -28,14 +28,14 @@ smp_mb__after_srcu_read_unlock() { __fence{after-srcu-read-unlock}; } barrier() { __fence{barrier}; } // Exchange -xchg(X,V) __xchg{mb}(X,V) -xchg_relaxed(X,V) __xchg{once}(X,V) -xchg_release(X,V) __xchg{release}(X,V) -xchg_acquire(X,V) __xchg{acquire}(X,V) -cmpxchg(X,V,W) __cmpxchg{mb}(X,V,W) -cmpxchg_relaxed(X,V,W) __cmpxchg{once}(X,V,W) -cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W) -cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W) +xchg(X,V) __xchg{MB}(X,V) +xchg_relaxed(X,V) __xchg{ONCE}(X,V) +xchg_release(X,V) __xchg{RELEASE}(X,V) +xchg_acquire(X,V) __xchg{ACQUIRE}(X,V) +cmpxchg(X,V,W) __cmpxchg{MB}(X,V,W) +cmpxchg_relaxed(X,V,W) __cmpxchg{ONCE}(X,V,W) +cmpxchg_acquire(X,V,W) __cmpxchg{ACQUIRE}(X,V,W) +cmpxchg_release(X,V,W) __cmpxchg{RELEASE}(X,V,W) // Spinlocks spin_lock(X) { __lock(X); } @@ -63,57 +63,86 @@ atomic_set(X,V) { WRITE_ONCE(*X,V); } atomic_read_acquire(X) smp_load_acquire(X) atomic_set_release(X,V) { smp_store_release(X,V); } -atomic_add(V,X) { __atomic_op(X,+,V); } -atomic_sub(V,X) { __atomic_op(X,-,V); } -atomic_inc(X) { __atomic_op(X,+,1); } -atomic_dec(X) { __atomic_op(X,-,1); } - -atomic_add_return(V,X) __atomic_op_return{mb}(X,+,V) -atomic_add_return_relaxed(V,X) __atomic_op_return{once}(X,+,V) -atomic_add_return_acquire(V,X) __atomic_op_return{acquire}(X,+,V) -atomic_add_return_release(V,X) __atomic_op_return{release}(X,+,V) -atomic_fetch_add(V,X) __atomic_fetch_op{mb}(X,+,V) -atomic_fetch_add_relaxed(V,X) __atomic_fetch_op{once}(X,+,V) -atomic_fetch_add_acquire(V,X) __atomic_fetch_op{acquire}(X,+,V) -atomic_fetch_add_release(V,X) __atomic_fetch_op{release}(X,+,V) - -atomic_inc_return(X) __atomic_op_return{mb}(X,+,1) -atomic_inc_return_relaxed(X) __atomic_op_return{once}(X,+,1) -atomic_inc_return_acquire(X) __atomic_op_return{acquire}(X,+,1) -atomic_inc_return_release(X) __atomic_op_return{release}(X,+,1) -atomic_fetch_inc(X) __atomic_fetch_op{mb}(X,+,1) -atomic_fetch_inc_relaxed(X) __atomic_fetch_op{once}(X,+,1) -atomic_fetch_inc_acquire(X) __atomic_fetch_op{acquire}(X,+,1) -atomic_fetch_inc_release(X) __atomic_fetch_op{release}(X,+,1) - -atomic_sub_return(V,X) __atomic_op_return{mb}(X,-,V) -atomic_sub_return_relaxed(V,X) __atomic_op_return{once}(X,-,V) -atomic_sub_return_acquire(V,X) __atomic_op_return{acquire}(X,-,V) -atomic_sub_return_release(V,X) __atomic_op_return{release}(X,-,V) -atomic_fetch_sub(V,X) __atomic_fetch_op{mb}(X,-,V) -atomic_fetch_sub_relaxed(V,X) __atomic_fetch_op{once}(X,-,V) -atomic_fetch_sub_acquire(V,X) __atomic_fetch_op{acquire}(X,-,V) -atomic_fetch_sub_release(V,X) __atomic_fetch_op{release}(X,-,V) - -atomic_dec_return(X) __atomic_op_return{mb}(X,-,1) -atomic_dec_return_relaxed(X) __atomic_op_return{once}(X,-,1) -atomic_dec_return_acquire(X) __atomic_op_return{acquire}(X,-,1) -atomic_dec_return_release(X) __atomic_op_return{release}(X,-,1) -atomic_fetch_dec(X) __atomic_fetch_op{mb}(X,-,1) -atomic_fetch_dec_relaxed(X) __atomic_fetch_op{once}(X,-,1) -atomic_fetch_dec_acquire(X) __atomic_fetch_op{acquire}(X,-,1) -atomic_fetch_dec_release(X) __atomic_fetch_op{release}(X,-,1) - -atomic_xchg(X,V) __xchg{mb}(X,V) -atomic_xchg_relaxed(X,V) __xchg{once}(X,V) -atomic_xchg_release(X,V) __xchg{release}(X,V) -atomic_xchg_acquire(X,V) __xchg{acquire}(X,V) -atomic_cmpxchg(X,V,W) __cmpxchg{mb}(X,V,W) -atomic_cmpxchg_relaxed(X,V,W) __cmpxchg{once}(X,V,W) -atomic_cmpxchg_acquire(X,V,W) __cmpxchg{acquire}(X,V,W) -atomic_cmpxchg_release(X,V,W) __cmpxchg{release}(X,V,W) - -atomic_sub_and_test(V,X) __atomic_op_return{mb}(X,-,V) == 0 -atomic_dec_and_test(X) __atomic_op_return{mb}(X,-,1) == 0 -atomic_inc_and_test(X) __atomic_op_return{mb}(X,+,1) == 0 -atomic_add_negative(V,X) __atomic_op_return{mb}(X,+,V) < 0 +atomic_add(V,X) { __atomic_op{NORETURN}(X,+,V); } +atomic_sub(V,X) { __atomic_op{NORETURN}(X,-,V); } +atomic_and(V,X) { __atomic_op{NORETURN}(X,&,V); } +atomic_or(V,X) { __atomic_op{NORETURN}(X,|,V); } +atomic_xor(V,X) { __atomic_op{NORETURN}(X,^,V); } +atomic_inc(X) { __atomic_op{NORETURN}(X,+,1); } +atomic_dec(X) { __atomic_op{NORETURN}(X,-,1); } +atomic_andnot(V,X) { __atomic_op{NORETURN}(X,&~,V); } + +atomic_add_return(V,X) __atomic_op_return{MB}(X,+,V) +atomic_add_return_relaxed(V,X) __atomic_op_return{ONCE}(X,+,V) +atomic_add_return_acquire(V,X) __atomic_op_return{ACQUIRE}(X,+,V) +atomic_add_return_release(V,X) __atomic_op_return{RELEASE}(X,+,V) +atomic_fetch_add(V,X) __atomic_fetch_op{MB}(X,+,V) +atomic_fetch_add_relaxed(V,X) __atomic_fetch_op{ONCE}(X,+,V) +atomic_fetch_add_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,+,V) +atomic_fetch_add_release(V,X) __atomic_fetch_op{RELEASE}(X,+,V) + +atomic_fetch_and(V,X) __atomic_fetch_op{MB}(X,&,V) +atomic_fetch_and_relaxed(V,X) __atomic_fetch_op{ONCE}(X,&,V) +atomic_fetch_and_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,&,V) +atomic_fetch_and_release(V,X) __atomic_fetch_op{RELEASE}(X,&,V) + +atomic_fetch_or(V,X) __atomic_fetch_op{MB}(X,|,V) +atomic_fetch_or_relaxed(V,X) __atomic_fetch_op{ONCE}(X,|,V) +atomic_fetch_or_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,|,V) +atomic_fetch_or_release(V,X) __atomic_fetch_op{RELEASE}(X,|,V) + +atomic_fetch_xor(V,X) __atomic_fetch_op{MB}(X,^,V) +atomic_fetch_xor_relaxed(V,X) __atomic_fetch_op{ONCE}(X,^,V) +atomic_fetch_xor_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,^,V) +atomic_fetch_xor_release(V,X) __atomic_fetch_op{RELEASE}(X,^,V) + +atomic_inc_return(X) __atomic_op_return{MB}(X,+,1) +atomic_inc_return_relaxed(X) __atomic_op_return{ONCE}(X,+,1) +atomic_inc_return_acquire(X) __atomic_op_return{ACQUIRE}(X,+,1) +atomic_inc_return_release(X) __atomic_op_return{RELEASE}(X,+,1) +atomic_fetch_inc(X) __atomic_fetch_op{MB}(X,+,1) +atomic_fetch_inc_relaxed(X) __atomic_fetch_op{ONCE}(X,+,1) +atomic_fetch_inc_acquire(X) __atomic_fetch_op{ACQUIRE}(X,+,1) +atomic_fetch_inc_release(X) __atomic_fetch_op{RELEASE}(X,+,1) + +atomic_sub_return(V,X) __atomic_op_return{MB}(X,-,V) +atomic_sub_return_relaxed(V,X) __atomic_op_return{ONCE}(X,-,V) +atomic_sub_return_acquire(V,X) __atomic_op_return{ACQUIRE}(X,-,V) +atomic_sub_return_release(V,X) __atomic_op_return{RELEASE}(X,-,V) +atomic_fetch_sub(V,X) __atomic_fetch_op{MB}(X,-,V) +atomic_fetch_sub_relaxed(V,X) __atomic_fetch_op{ONCE}(X,-,V) +atomic_fetch_sub_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,-,V) +atomic_fetch_sub_release(V,X) __atomic_fetch_op{RELEASE}(X,-,V) + +atomic_dec_return(X) __atomic_op_return{MB}(X,-,1) +atomic_dec_return_relaxed(X) __atomic_op_return{ONCE}(X,-,1) +atomic_dec_return_acquire(X) __atomic_op_return{ACQUIRE}(X,-,1) +atomic_dec_return_release(X) __atomic_op_return{RELEASE}(X,-,1) +atomic_fetch_dec(X) __atomic_fetch_op{MB}(X,-,1) +atomic_fetch_dec_relaxed(X) __atomic_fetch_op{ONCE}(X,-,1) +atomic_fetch_dec_acquire(X) __atomic_fetch_op{ACQUIRE}(X,-,1) +atomic_fetch_dec_release(X) __atomic_fetch_op{RELEASE}(X,-,1) + +atomic_xchg(X,V) __xchg{MB}(X,V) +atomic_xchg_relaxed(X,V) __xchg{ONCE}(X,V) +atomic_xchg_release(X,V) __xchg{RELEASE}(X,V) +atomic_xchg_acquire(X,V) __xchg{ACQUIRE}(X,V) +atomic_cmpxchg(X,V,W) __cmpxchg{MB}(X,V,W) +atomic_cmpxchg_relaxed(X,V,W) __cmpxchg{ONCE}(X,V,W) +atomic_cmpxchg_acquire(X,V,W) __cmpxchg{ACQUIRE}(X,V,W) +atomic_cmpxchg_release(X,V,W) __cmpxchg{RELEASE}(X,V,W) + +atomic_sub_and_test(V,X) __atomic_op_return{MB}(X,-,V) == 0 +atomic_dec_and_test(X) __atomic_op_return{MB}(X,-,1) == 0 +atomic_inc_and_test(X) __atomic_op_return{MB}(X,+,1) == 0 +atomic_add_negative(V,X) __atomic_op_return{MB}(X,+,V) < 0 +atomic_add_negative_relaxed(V,X) __atomic_op_return{ONCE}(X,+,V) < 0 +atomic_add_negative_acquire(V,X) __atomic_op_return{ACQUIRE}(X,+,V) < 0 +atomic_add_negative_release(V,X) __atomic_op_return{RELEASE}(X,+,V) < 0 + +atomic_fetch_andnot(V,X) __atomic_fetch_op{MB}(X,&~,V) +atomic_fetch_andnot_acquire(V,X) __atomic_fetch_op{ACQUIRE}(X,&~,V) +atomic_fetch_andnot_release(V,X) __atomic_fetch_op{RELEASE}(X,&~,V) +atomic_fetch_andnot_relaxed(V,X) __atomic_fetch_op{ONCE}(X,&~,V) + +atomic_add_unless(X,V,W) __atomic_add_unless{MB}(X,V,W) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 0712b5e82eb7..f3269ce39e5b 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -17,10 +17,13 @@ get_hdr_inc=-D$(1) -include $(UAPI_PATH)/linux/$(2) CFLAGS_devlink:=$(call get_hdr_inc,_LINUX_DEVLINK_H_,devlink.h) CFLAGS_dpll:=$(call get_hdr_inc,_LINUX_DPLL_H,dpll.h) CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ - $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) + $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ + $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) CFLAGS_mptcp_pm:=$(call get_hdr_inc,_LINUX_MPTCP_PM_H,mptcp_pm.h) +CFLAGS_net_shaper:=$(call get_hdr_inc,_LINUX_NET_SHAPER_H,net_shaper.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) +CFLAGS_nl80211:=$(call get_hdr_inc,__LINUX_NL802121_H,nl80211.h) CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h) CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) diff --git a/tools/net/ynl/pyynl/lib/ynl.py b/tools/net/ynl/pyynl/lib/ynl.py index 08f8bf89cfc2..dcc2c6b298d6 100644 --- a/tools/net/ynl/pyynl/lib/ynl.py +++ b/tools/net/ynl/pyynl/lib/ynl.py @@ -536,9 +536,11 @@ class YnlFamily(SpecFamily): try: return int(value) except (ValueError, TypeError) as e: - if 'enum' not in attr_spec: - raise e - return self._encode_enum(attr_spec, value) + if 'enum' in attr_spec: + return self._encode_enum(attr_spec, value) + if attr_spec.display_hint: + return self._from_string(value, attr_spec) + raise e def _add_attr(self, space, name, value, search_attrs): try: @@ -571,7 +573,10 @@ class YnlFamily(SpecFamily): if isinstance(value, bytes): attr_payload = value elif isinstance(value, str): - attr_payload = bytes.fromhex(value) + if attr.display_hint: + attr_payload = self._from_string(value, attr) + else: + attr_payload = bytes.fromhex(value) elif isinstance(value, dict) and attr.struct_name: attr_payload = self._encode_struct(attr.struct_name, value) else: @@ -627,6 +632,11 @@ class YnlFamily(SpecFamily): decoded = self._decode_struct(attr.raw, attr_spec.struct_name) elif attr_spec.sub_type: decoded = attr.as_c_array(attr_spec.sub_type) + if 'enum' in attr_spec: + decoded = [ self._decode_enum(x, attr_spec) for x in decoded ] + elif attr_spec.display_hint: + decoded = [ self._formatted_string(x, attr_spec.display_hint) + for x in decoded ] else: decoded = attr.as_bin() if attr_spec.display_hint: @@ -644,15 +654,17 @@ class YnlFamily(SpecFamily): subattrs = self._decode(NlAttrs(item.raw), attr_spec['nested-attributes']) decoded.append({ item.type: subattrs }) elif attr_spec["sub-type"] == 'binary': - subattrs = item.as_bin() + subattr = item.as_bin() if attr_spec.display_hint: - subattrs = self._formatted_string(subattrs, attr_spec.display_hint) - decoded.append(subattrs) + subattr = self._formatted_string(subattr, attr_spec.display_hint) + decoded.append(subattr) elif attr_spec["sub-type"] in NlAttr.type_formats: - subattrs = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order) - if attr_spec.display_hint: - subattrs = self._formatted_string(subattrs, attr_spec.display_hint) - decoded.append(subattrs) + subattr = item.as_scalar(attr_spec['sub-type'], attr_spec.byte_order) + if 'enum' in attr_spec: + subattr = self._decode_enum(subattr, attr_spec) + elif attr_spec.display_hint: + subattr = self._formatted_string(subattr, attr_spec.display_hint) + decoded.append(subattr) else: raise Exception(f'Unknown {attr_spec["sub-type"]} with name {attr_spec["name"]}') return decoded @@ -899,6 +911,18 @@ class YnlFamily(SpecFamily): formatted = raw return formatted + def _from_string(self, string, attr_spec): + if attr_spec.display_hint in ['ipv4', 'ipv6']: + ip = ipaddress.ip_address(string) + if attr_spec['type'] == 'binary': + raw = ip.packed + else: + raw = int(ip) + else: + raise Exception(f"Display hint '{attr_spec.display_hint}' not implemented" + f" when parsing '{attr_spec['name']}'") + return raw + def handle_ntf(self, decoded): msg = dict() if self.include_raw: diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index c2eabc90dce8..a1427c537030 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -74,6 +74,8 @@ class Type(SpecAttr): self.c_name = c_lower(self.name) if self.c_name in _C_KW: self.c_name += '_' + if self.c_name[0].isdigit(): + self.c_name = '_' + self.c_name # Added by resolve(): self.enum_name = None @@ -100,7 +102,7 @@ class Type(SpecAttr): if isinstance(value, int): return value if value in self.family.consts: - raise Exception("Resolving family constants not implemented, yet") + return self.family.consts[value]["value"] return limit_to_number(value) def get_limit_str(self, limit, default=None, suffix=''): @@ -110,6 +112,9 @@ class Type(SpecAttr): if isinstance(value, int): return str(value) + suffix if value in self.family.consts: + const = self.family.consts[value] + if const.get('header'): + return c_upper(value) return c_upper(f"{self.family['name']}-{value}") return c_upper(value) @@ -683,7 +688,10 @@ class TypeArrayNest(Type): raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet") def _attr_typol(self): - return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, ' + if self.attr['sub-type'] in scalars: + return f'.type = YNL_PT_U{c_upper(self.sub_type[1:])}, ' + else: + return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, ' def _attr_get(self, ri, var): local_vars = ['const struct nlattr *attr2;'] @@ -885,7 +893,7 @@ class AttrSet(SpecAttrSet): elif elem['type'] == 'nest': t = TypeNest(self.family, self, elem, value) elif elem['type'] == 'indexed-array' and 'sub-type' in elem: - if elem["sub-type"] == 'nest': + if elem["sub-type"] in ['nest', 'u32']: t = TypeArrayNest(self.family, self, elem, value) else: raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}') @@ -1437,7 +1445,7 @@ class CodeWriter: self._ifdef_block = config_option -scalars = {'u8', 'u16', 'u32', 'u64', 's32', 's64', 'uint', 'sint'} +scalars = {'u8', 'u16', 'u32', 'u64', 's8', 's16', 's32', 's64', 'uint', 'sint'} direction_to_suffix = { 'reply': '_rsp', @@ -1669,6 +1677,9 @@ def _multi_parse(ri, struct, init_lines, local_vars): if aspec["sub-type"] == 'nest': local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') array_nests.add(arg) + elif aspec['sub-type'] in scalars: + local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') + array_nests.add(arg) else: raise Exception(f'Not supported sub-type {aspec["sub-type"]}') if 'multi-attr' in aspec: @@ -1724,11 +1735,17 @@ def _multi_parse(ri, struct, init_lines, local_vars): ri.cw.p(f"dst->{aspec.c_name} = calloc(n_{aspec.c_name}, sizeof(*dst->{aspec.c_name}));") ri.cw.p(f"dst->n_{aspec.c_name} = n_{aspec.c_name};") ri.cw.p('i = 0;') - ri.cw.p(f"parg.rsp_policy = &{aspec.nested_render_name}_nest;") + if 'nested-attributes' in aspec: + ri.cw.p(f"parg.rsp_policy = &{aspec.nested_render_name}_nest;") ri.cw.block_start(line=f"ynl_attr_for_each_nested(attr, attr_{aspec.c_name})") - ri.cw.p(f"parg.data = &dst->{aspec.c_name}[i];") - ri.cw.p(f"if ({aspec.nested_render_name}_parse(&parg, attr, ynl_attr_type(attr)))") - ri.cw.p('return YNL_PARSE_CB_ERROR;') + if 'nested-attributes' in aspec: + ri.cw.p(f"parg.data = &dst->{aspec.c_name}[i];") + ri.cw.p(f"if ({aspec.nested_render_name}_parse(&parg, attr, ynl_attr_type(attr)))") + ri.cw.p('return YNL_PARSE_CB_ERROR;') + elif aspec.sub_type in scalars: + ri.cw.p(f"dst->{aspec.c_name}[i] = ynl_attr_get_{aspec.sub_type}(attr);") + else: + raise Exception(f"Nest parsing type not supported in {aspec['name']}") ri.cw.p('i++;') ri.cw.block_end() ri.cw.block_end() @@ -2549,6 +2566,9 @@ def render_uapi(family, cw): defines = [] for const in family['definitions']: + if const.get('header'): + continue + if const['type'] != 'const': cw.writes_defines(defines) defines = [] diff --git a/tools/objtool/Documentation/objtool.txt b/tools/objtool/Documentation/objtool.txt index 7c3ee959b63c..28ac57b9e102 100644 --- a/tools/objtool/Documentation/objtool.txt +++ b/tools/objtool/Documentation/objtool.txt @@ -28,6 +28,15 @@ Objtool has the following features: sites, enabling the kernel to patch them inline, to prevent "thunk funneling" for both security and performance reasons +- Return thunk validation -- validates return thunks are used for + certain CPU mitigations including Retbleed and SRSO + +- Return thunk annotation -- annotates all return thunk sites so kernel + can patch them inline, depending on enabled mitigations + +- Return thunk training valiation -- validate that all entry paths + untrain a "safe return" before the first return (or call) + - Non-instrumentation validation -- validates non-instrumentable ("noinstr") code rules, preventing instrumentation in low-level C entry code @@ -53,6 +62,9 @@ Objtool has the following features: - Function entry annotation -- annotates function entries, enabling kernel function tracing +- Function preamble (prefix) annotation and/or symbol generation -- used + for FineIBT and call depth tracking + - Other toolchain hacks which will go unmentioned at this time... Each feature can be enabled individually or in combination using the @@ -197,19 +209,17 @@ To achieve the validation, objtool enforces the following rules: 1. Each callable function must be annotated as such with the ELF function type. In asm code, this is typically done using the - ENTRY/ENDPROC macros. If objtool finds a return instruction + SYM_FUNC_{START,END} macros. If objtool finds a return instruction outside of a function, it flags an error since that usually indicates callable code which should be annotated accordingly. This rule is needed so that objtool can properly identify each callable function in order to analyze its stack metadata. -2. Conversely, each section of code which is *not* callable should *not* - be annotated as an ELF function. The ENDPROC macro shouldn't be used - in this case. - - This rule is needed so that objtool can ignore non-callable code. - Such code doesn't have to follow any of the other rules. +2. Conversely, each section of code which is *not* callable, or is + otherwise doing funny things with the stack or registers, should + *not* be annotated as an ELF function. Rather, SYM_CODE_{START,END} + should be used along with unwind hints. 3. Each callable function which calls another function must have the correct frame pointer logic, if required by CONFIG_FRAME_POINTER or @@ -221,7 +231,7 @@ To achieve the validation, objtool enforces the following rules: function B, the _caller_ of function A will be skipped on the stack trace. -4. Dynamic jumps and jumps to undefined symbols are only allowed if: +4. Indirect jumps and jumps to undefined symbols are only allowed if: a) the jump is part of a switch statement; or @@ -304,29 +314,29 @@ the objtool maintainers. 001e 2823e: 80 ce 02 or $0x2,%dh ... + 2. file.o: warning: objtool: .text+0x53: unreachable instruction Objtool couldn't find a code path to reach the instruction. If the error is for an asm file, and the instruction is inside (or reachable from) a callable function, the function should be annotated - with the ENTRY/ENDPROC macros (ENDPROC is the important one). - Otherwise, the code should probably be annotated with the unwind hint - macros in asm/unwind_hints.h so objtool and the unwinder can know the - stack state associated with the code. + with the SYM_FUNC_START and SYM_FUNC_END macros. - If you're 100% sure the code won't affect stack traces, or if you're - a just a bad person, you can tell objtool to ignore it. See the - "Adding exceptions" section below. + Otherwise, SYM_CODE_START can be used. In that case the code needs + to be annotated with unwind hint macros. - If it's not actually in a callable function (e.g. kernel entry code), - change ENDPROC to END. + If you're sure the code won't affect the reliability of runtime stack + traces and want objtool to ignore it, see "Adding exceptions" below. -3. file.o: warning: objtool: foo+0x48c: bar() is missing a __noreturn annotation - The call from foo() to bar() doesn't return, but bar() is missing the - __noreturn annotation. NOTE: In addition to annotating the function - with __noreturn, please also add it to tools/objtool/noreturns.h. +3. file.o: warning: objtool: foo+0x48c: bar() missing __noreturn in .c/.h or NORETURN() in noreturns.h + + The call from foo() to bar() doesn't return, but bar() is incorrectly + annotated. A noreturn function must be marked __noreturn in both its + declaration and its definition, and must have a NORETURN() annotation + in tools/objtool/noreturns.h. + 4. file.o: warning: objtool: func(): can't find starting instruction or @@ -341,23 +351,21 @@ the objtool maintainers. This is a kernel entry/exit instruction like sysenter or iret. Such instructions aren't allowed in a callable function, and are most - likely part of the kernel entry code. They should usually not have - the callable function annotation (ENDPROC) and should always be - annotated with the unwind hint macros in asm/unwind_hints.h. + likely part of the kernel entry code. Such code should probably be + placed in a SYM_FUNC_CODE block with unwind hints. 6. file.o: warning: objtool: func()+0x26: sibling call from callable instruction with modified stack frame - This is a dynamic jump or a jump to an undefined symbol. Objtool - assumed it's a sibling call and detected that the frame pointer - wasn't first restored to its original state. + This is a branch to an UNDEF symbol. Objtool assumed it's a + sibling call and detected that the stack wasn't first restored to its + original state. - If it's not really a sibling call, you may need to move the - destination code to the local file. + If it's not really a sibling call, you may need to use unwind hints + and/or move the destination code to the local file. If the instruction is not actually in a callable function (e.g. - kernel entry code), change ENDPROC to END and annotate manually with - the unwind hint macros in asm/unwind_hints.h. + kernel entry code), use SYM_CODE_{START,END} and unwind hints. 7. file: warning: objtool: func()+0x5c: stack state mismatch @@ -373,8 +381,8 @@ the objtool maintainers. Another possibility is that the code has some asm or inline asm which does some unusual things to the stack or the frame pointer. In such - cases it's probably appropriate to use the unwind hint macros in - asm/unwind_hints.h. + cases it's probably appropriate to use SYM_FUNC_CODE with unwind + hints. 8. file.o: warning: objtool: funcA() falls through to next function funcB() @@ -384,17 +392,16 @@ the objtool maintainers. can fall through into the next function. There could be different reasons for this: - 1) funcA()'s last instruction is a call to a "noreturn" function like + a) funcA()'s last instruction is a call to a "noreturn" function like panic(). In this case the noreturn function needs to be added to objtool's hard-coded global_noreturns array. Feel free to bug the objtool maintainer, or you can submit a patch. - 2) funcA() uses the unreachable() annotation in a section of code + b) funcA() uses the unreachable() annotation in a section of code that is actually reachable. - 3) If funcA() calls an inline function, the object code for funcA() - might be corrupt due to a gcc bug. For more details, see: - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646 + c) Some undefined behavior like divide by zero. + 9. file.o: warning: objtool: funcA() call to funcB() with UACCESS enabled @@ -432,24 +439,26 @@ the objtool maintainers. This limitation can be overcome by massaging the alternatives with NOPs to shift the stack changes around so they no longer conflict. + 11. file.o: warning: unannotated intra-function call - This warning means that a direct call is done to a destination which - is not at the beginning of a function. If this is a legit call, you - can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL - directive right before the call. + This warning means that a direct call is done to a destination which + is not at the beginning of a function. If this is a legit call, you + can remove this warning by putting the ANNOTATE_INTRA_FUNCTION_CALL + directive right before the call. + 12. file.o: warning: func(): not an indirect call target - This means that objtool is running with --ibt and a function expected - to be an indirect call target is not. In particular, this happens for - init_module() or cleanup_module() if a module relies on these special - names and does not use module_init() / module_exit() macros to create - them. + This means that objtool is running with --ibt and a function + expected to be an indirect call target is not. In particular, this + happens for init_module() or cleanup_module() if a module relies on + these special names and does not use module_init() / module_exit() + macros to create them. If the error doesn't seem to make sense, it could be a bug in objtool. -Feel free to ask the objtool maintainer for help. +Feel free to ask objtool maintainers for help. Adding exceptions diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile index 7a65948892e5..8c20361dd100 100644 --- a/tools/objtool/Makefile +++ b/tools/objtool/Makefile @@ -37,7 +37,7 @@ OBJTOOL_CFLAGS := -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) $(LIBE OBJTOOL_LDFLAGS := $(LIBELF_LIBS) $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) # Allow old libelf to be used: -elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - | grep elf_getshdr) +elfshdr := $(shell echo '$(pound)include <libelf.h>' | $(HOSTCC) $(OBJTOOL_CFLAGS) -x c -E - 2>/dev/null | grep elf_getshdr) OBJTOOL_CFLAGS += $(if $(elfshdr),,-DLIBELF_USE_DEPRECATED) # Always want host compilation. diff --git a/tools/objtool/arch/loongarch/decode.c b/tools/objtool/arch/loongarch/decode.c index 69b66994f2a1..02e490555966 100644 --- a/tools/objtool/arch/loongarch/decode.c +++ b/tools/objtool/arch/loongarch/decode.c @@ -5,10 +5,7 @@ #include <asm/inst.h> #include <asm/orc_types.h> #include <linux/objtool_types.h> - -#ifndef EM_LOONGARCH -#define EM_LOONGARCH 258 -#endif +#include <arch/elf.h> int arch_ftrace_match(char *name) { @@ -363,3 +360,26 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) state->cfa.base = CFI_SP; state->cfa.offset = 0; } + +unsigned int arch_reloc_size(struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_LARCH_32: + case R_LARCH_32_PCREL: + return 4; + default: + return 8; + } +} + +unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table) +{ + switch (reloc_type(reloc)) { + case R_LARCH_32_PCREL: + case R_LARCH_64_PCREL: + return reloc->sym->offset + reloc_addend(reloc) - + (reloc_offset(reloc) - reloc_offset(table)); + default: + return reloc->sym->offset + reloc_addend(reloc); + } +} diff --git a/tools/objtool/arch/loongarch/include/arch/elf.h b/tools/objtool/arch/loongarch/include/arch/elf.h index 9623d663220e..ec79062c9554 100644 --- a/tools/objtool/arch/loongarch/include/arch/elf.h +++ b/tools/objtool/arch/loongarch/include/arch/elf.h @@ -18,6 +18,13 @@ #ifndef R_LARCH_32_PCREL #define R_LARCH_32_PCREL 99 #endif +#ifndef R_LARCH_64_PCREL +#define R_LARCH_64_PCREL 109 +#endif + +#ifndef EM_LOONGARCH +#define EM_LOONGARCH 258 +#endif #define R_NONE R_LARCH_NONE #define R_ABS32 R_LARCH_32 diff --git a/tools/objtool/arch/loongarch/special.c b/tools/objtool/arch/loongarch/special.c index 87230ed570fd..e39f86d97002 100644 --- a/tools/objtool/arch/loongarch/special.c +++ b/tools/objtool/arch/loongarch/special.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <string.h> #include <objtool/special.h> +#include <objtool/warn.h> bool arch_support_alt_relocation(struct special_alt *special_alt, struct instruction *insn, @@ -8,9 +10,164 @@ bool arch_support_alt_relocation(struct special_alt *special_alt, return false; } +struct table_info { + struct list_head jump_info; + unsigned long insn_offset; + unsigned long rodata_offset; +}; + +static void get_rodata_table_size_by_table_annotate(struct objtool_file *file, + struct instruction *insn, + unsigned long *table_size) +{ + struct section *rsec; + struct reloc *reloc; + struct list_head table_list; + struct table_info *orig_table; + struct table_info *next_table; + unsigned long tmp_insn_offset; + unsigned long tmp_rodata_offset; + + rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); + if (!rsec) + return; + + INIT_LIST_HEAD(&table_list); + + for_each_reloc(rsec, reloc) { + orig_table = malloc(sizeof(struct table_info)); + if (!orig_table) { + WARN("malloc failed"); + return; + } + + orig_table->insn_offset = reloc->sym->offset + reloc_addend(reloc); + reloc++; + orig_table->rodata_offset = reloc->sym->offset + reloc_addend(reloc); + + list_add_tail(&orig_table->jump_info, &table_list); + + if (reloc_idx(reloc) + 1 == sec_num_entries(rsec)) + break; + } + + list_for_each_entry(orig_table, &table_list, jump_info) { + next_table = list_next_entry(orig_table, jump_info); + list_for_each_entry_from(next_table, &table_list, jump_info) { + if (next_table->rodata_offset < orig_table->rodata_offset) { + tmp_insn_offset = next_table->insn_offset; + tmp_rodata_offset = next_table->rodata_offset; + next_table->insn_offset = orig_table->insn_offset; + next_table->rodata_offset = orig_table->rodata_offset; + orig_table->insn_offset = tmp_insn_offset; + orig_table->rodata_offset = tmp_rodata_offset; + } + } + } + + list_for_each_entry(orig_table, &table_list, jump_info) { + if (insn->offset == orig_table->insn_offset) { + next_table = list_next_entry(orig_table, jump_info); + if (&next_table->jump_info == &table_list) { + *table_size = 0; + return; + } + + while (next_table->rodata_offset == orig_table->rodata_offset) { + next_table = list_next_entry(next_table, jump_info); + if (&next_table->jump_info == &table_list) { + *table_size = 0; + return; + } + } + + *table_size = next_table->rodata_offset - orig_table->rodata_offset; + } + } +} + +static struct reloc *find_reloc_by_table_annotate(struct objtool_file *file, + struct instruction *insn, + unsigned long *table_size) +{ + struct section *rsec; + struct reloc *reloc; + unsigned long offset; + + rsec = find_section_by_name(file->elf, ".rela.discard.tablejump_annotate"); + if (!rsec) + return NULL; + + for_each_reloc(rsec, reloc) { + if (reloc->sym->sec->rodata) + continue; + + if (strcmp(insn->sec->name, reloc->sym->sec->name)) + continue; + + offset = reloc->sym->offset + reloc_addend(reloc); + if (insn->offset == offset) { + get_rodata_table_size_by_table_annotate(file, insn, table_size); + reloc++; + return reloc; + } + } + + return NULL; +} + +static struct reloc *find_reloc_of_rodata_c_jump_table(struct section *sec, + unsigned long offset, + unsigned long *table_size) +{ + struct section *rsec; + struct reloc *reloc; + + rsec = sec->rsec; + if (!rsec) + return NULL; + + for_each_reloc(rsec, reloc) { + if (reloc_offset(reloc) > offset) + break; + + if (!strcmp(reloc->sym->sec->name, C_JUMP_TABLE_SECTION)) { + *table_size = 0; + return reloc; + } + } + + return NULL; +} + struct reloc *arch_find_switch_table(struct objtool_file *file, struct instruction *insn, unsigned long *table_size) { - return NULL; + struct reloc *annotate_reloc; + struct reloc *rodata_reloc; + struct section *table_sec; + unsigned long table_offset; + + annotate_reloc = find_reloc_by_table_annotate(file, insn, table_size); + if (!annotate_reloc) { + annotate_reloc = find_reloc_of_rodata_c_jump_table( + insn->sec, insn->offset, table_size); + if (!annotate_reloc) + return NULL; + } + + table_sec = annotate_reloc->sym->sec; + table_offset = annotate_reloc->sym->offset + reloc_addend(annotate_reloc); + + /* + * Each table entry has a rela associated with it. The rela + * should reference text in the same function as the original + * instruction. + */ + rodata_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset); + if (!rodata_reloc) + return NULL; + + return rodata_reloc; } diff --git a/tools/objtool/arch/powerpc/decode.c b/tools/objtool/arch/powerpc/decode.c index 53b55690f320..7c0bf2429067 100644 --- a/tools/objtool/arch/powerpc/decode.c +++ b/tools/objtool/arch/powerpc/decode.c @@ -106,3 +106,17 @@ void arch_initial_func_cfi_state(struct cfi_init_state *state) state->regs[CFI_RA].base = CFI_CFA; state->regs[CFI_RA].offset = 0; } + +unsigned int arch_reloc_size(struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_PPC_REL32: + case R_PPC_ADDR32: + case R_PPC_UADDR32: + case R_PPC_PLT32: + case R_PPC_PLTREL32: + return 4; + default: + return 8; + } +} diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index fe1362c34564..7567c893f45e 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -850,5 +850,19 @@ bool arch_is_rethunk(struct symbol *sym) bool arch_is_embedded_insn(struct symbol *sym) { return !strcmp(sym->name, "retbleed_return_thunk") || + !strcmp(sym->name, "srso_alias_safe_ret") || !strcmp(sym->name, "srso_safe_ret"); } + +unsigned int arch_reloc_size(struct reloc *reloc) +{ + switch (reloc_type(reloc)) { + case R_X86_64_32: + case R_X86_64_32S: + case R_X86_64_PC32: + case R_X86_64_PLT32: + return 4; + default: + return 8; + } +} diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index 387d56a7f5fb..5f761f420b8c 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -6,6 +6,10 @@ #include <subcmd/parse-options.h> #include <string.h> #include <stdlib.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/sendfile.h> #include <objtool/builtin.h> #include <objtool/objtool.h> @@ -14,6 +18,8 @@ "error: objtool: " format "\n", \ ##__VA_ARGS__) +const char *objname; + struct opts opts; static const char * const check_usage[] = { @@ -71,7 +77,7 @@ static const struct option check_options[] = { OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), - OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"), + OPT_BOOLEAN(0, "orc", &opts.orc, "generate ORC metadata"), OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), OPT_BOOLEAN(0, "rethunk", &opts.rethunk, "validate and annotate rethunk usage"), OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), @@ -84,16 +90,17 @@ static const struct option check_options[] = { OPT_CALLBACK_OPTARG(0, "dump", NULL, NULL, "orc", "dump metadata", parse_dump), OPT_GROUP("Options:"), - OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), - OPT_BOOLEAN(0, "backup", &opts.backup, "create .orig files before modification"), - OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), - OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), - OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), - OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), - OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), - OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), - OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), + OPT_BOOLEAN(0, "backtrace", &opts.backtrace, "unwind on error"), + OPT_BOOLEAN(0, "dry-run", &opts.dryrun, "don't write modifications"), + OPT_BOOLEAN(0, "link", &opts.link, "object is a linked object"), + OPT_BOOLEAN(0, "module", &opts.module, "object is part of a kernel module"), + OPT_BOOLEAN(0, "mnop", &opts.mnop, "nop out mcount call sites"), + OPT_BOOLEAN(0, "no-unreachable", &opts.no_unreachable, "skip 'unreachable instruction' warnings"), + OPT_STRING('o', "output", &opts.output, "file", "output file name"), + OPT_BOOLEAN(0, "sec-address", &opts.sec_address, "print section addresses in warnings"), + OPT_BOOLEAN(0, "stats", &opts.stats, "print statistics"), OPT_BOOLEAN('v', "verbose", &opts.verbose, "verbose warnings"), + OPT_BOOLEAN(0, "Werror", &opts.werror, "return error on warnings"), OPT_END(), }; @@ -131,6 +138,26 @@ int cmd_parse_options(int argc, const char **argv, const char * const usage[]) static bool opts_valid(void) { + if (opts.mnop && !opts.mcount) { + ERROR("--mnop requires --mcount"); + return false; + } + + if (opts.noinstr && !opts.link) { + ERROR("--noinstr requires --link"); + return false; + } + + if (opts.ibt && !opts.link) { + ERROR("--ibt requires --link"); + return false; + } + + if (opts.unret && !opts.link) { + ERROR("--unret requires --link"); + return false; + } + if (opts.hack_jump_label || opts.hack_noinstr || opts.ibt || @@ -151,11 +178,6 @@ static bool opts_valid(void) return true; } - if (opts.unret && !opts.rethunk) { - ERROR("--unret requires --rethunk"); - return false; - } - if (opts.dump_orc) return true; @@ -163,76 +185,156 @@ static bool opts_valid(void) return false; } -static bool mnop_opts_valid(void) +static int copy_file(const char *src, const char *dst) { - if (opts.mnop && !opts.mcount) { - ERROR("--mnop requires --mcount"); - return false; + size_t to_copy, copied; + int dst_fd, src_fd; + struct stat stat; + off_t offset = 0; + + src_fd = open(src, O_RDONLY); + if (src_fd == -1) { + ERROR("can't open '%s' for reading", src); + return 1; } - return true; -} - -static bool link_opts_valid(struct objtool_file *file) -{ - if (opts.link) - return true; + dst_fd = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0400); + if (dst_fd == -1) { + ERROR("can't open '%s' for writing", dst); + return 1; + } - if (has_multiple_files(file->elf)) { - ERROR("Linked object detected, forcing --link"); - opts.link = true; - return true; + if (fstat(src_fd, &stat) == -1) { + perror("fstat"); + return 1; } - if (opts.noinstr) { - ERROR("--noinstr requires --link"); - return false; + if (fchmod(dst_fd, stat.st_mode) == -1) { + perror("fchmod"); + return 1; } - if (opts.ibt) { - ERROR("--ibt requires --link"); - return false; + for (to_copy = stat.st_size; to_copy > 0; to_copy -= copied) { + copied = sendfile(dst_fd, src_fd, &offset, to_copy); + if (copied == -1) { + perror("sendfile"); + return 1; + } } - if (opts.unret) { - ERROR("--unret requires --link"); - return false; + close(dst_fd); + close(src_fd); + return 0; +} + +static char **save_argv(int argc, const char **argv) +{ + char **orig_argv; + + orig_argv = calloc(argc, sizeof(char *)); + if (!orig_argv) { + perror("calloc"); + return NULL; } - return true; + for (int i = 0; i < argc; i++) { + orig_argv[i] = strdup(argv[i]); + if (!orig_argv[i]) { + perror("strdup"); + return NULL; + } + }; + + return orig_argv; } +#define ORIG_SUFFIX ".orig" + int objtool_run(int argc, const char **argv) { - const char *objname; struct objtool_file *file; - int ret; + char *backup = NULL; + char **orig_argv; + int ret = 0; - argc = cmd_parse_options(argc, argv, check_usage); - objname = argv[0]; + orig_argv = save_argv(argc, argv); + if (!orig_argv) + return 1; + + cmd_parse_options(argc, argv, check_usage); if (!opts_valid()) return 1; + objname = argv[0]; + if (opts.dump_orc) return orc_dump(objname); + if (!opts.dryrun && opts.output) { + /* copy original .o file to output file */ + if (copy_file(objname, opts.output)) + return 1; + + /* from here on, work directly on the output file */ + objname = opts.output; + } + file = objtool_open_read(objname); if (!file) - return 1; + goto err; - if (!mnop_opts_valid()) - return 1; - - if (!link_opts_valid(file)) - return 1; + if (!opts.link && has_multiple_files(file->elf)) { + ERROR("Linked object requires --link"); + goto err; + } ret = check(file); if (ret) - return ret; + goto err; - if (file->elf->changed) - return elf_write(file->elf); + if (!opts.dryrun && file->elf->changed && elf_write(file->elf)) + goto err; return 0; + +err: + if (opts.dryrun) + goto err_msg; + + if (opts.output) { + unlink(opts.output); + goto err_msg; + } + + /* + * Make a backup before kbuild deletes the file so the error + * can be recreated without recompiling or relinking. + */ + backup = malloc(strlen(objname) + strlen(ORIG_SUFFIX) + 1); + if (!backup) { + perror("malloc"); + return 1; + } + + strcpy(backup, objname); + strcat(backup, ORIG_SUFFIX); + if (copy_file(objname, backup)) + return 1; + +err_msg: + fprintf(stderr, "%s", orig_argv[0]); + + for (int i = 1; i < argc; i++) { + char *arg = orig_argv[i]; + + if (backup && !strcmp(arg, objname)) + fprintf(stderr, " %s -o %s", backup, objname); + else + fprintf(stderr, " %s", arg); + } + + fprintf(stderr, "\n"); + + return 1; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index ce973d9d8e6d..ca3435acc326 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1284,15 +1284,6 @@ static void annotate_call_site(struct objtool_file *file, if (!sym) sym = reloc->sym; - /* - * Alternative replacement code is just template code which is - * sometimes copied to the original instruction. For now, don't - * annotate it. (In the future we might consider annotating the - * original instruction if/when it ever makes sense to do so.) - */ - if (!strcmp(insn->sec->name, ".altinstr_replacement")) - return; - if (sym->static_call_tramp) { list_add_tail(&insn->call_node, &file->static_call_list); return; @@ -1350,7 +1341,8 @@ static void annotate_call_site(struct objtool_file *file, return; } - if (insn->type == INSN_CALL && !insn->sec->init) + if (insn->type == INSN_CALL && !insn->sec->init && + !insn->_call_dest->embedded_insn) list_add_tail(&insn->call_node, &file->call_list); if (!sibling && dead_end_function(file, sym)) @@ -1944,6 +1936,11 @@ out: return ret; } +__weak unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table) +{ + return reloc->sym->offset + reloc_addend(reloc); +} + static int add_jump_table(struct objtool_file *file, struct instruction *insn, struct reloc *next_table) { @@ -1954,6 +1951,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, unsigned int prev_offset = 0; struct reloc *reloc = table; struct alternative *alt; + unsigned long sym_offset; /* * Each @reloc is a switch table relocation which points to the target @@ -1968,12 +1966,13 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, break; /* Make sure the table entries are consecutive: */ - if (prev_offset && reloc_offset(reloc) != prev_offset + 8) + if (prev_offset && reloc_offset(reloc) != prev_offset + arch_reloc_size(reloc)) break; + sym_offset = arch_jump_table_sym_offset(reloc, table); + /* Detect function pointers from contiguous objects: */ - if (reloc->sym->sec == pfunc->sec && - reloc_addend(reloc) == pfunc->offset) + if (reloc->sym->sec == pfunc->sec && sym_offset == pfunc->offset) break; /* @@ -1981,10 +1980,10 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn, * which point to the end of the function. Ignore them. */ if (reloc->sym->sec == pfunc->sec && - reloc_addend(reloc) == pfunc->offset + pfunc->len) + sym_offset == pfunc->offset + pfunc->len) goto next; - dest_insn = find_insn(file, reloc->sym->sec, reloc_addend(reloc)); + dest_insn = find_insn(file, reloc->sym->sec, sym_offset); if (!dest_insn) break; @@ -2023,6 +2022,7 @@ static void find_jump_table(struct objtool_file *file, struct symbol *func, struct reloc *table_reloc; struct instruction *dest_insn, *orig_insn = insn; unsigned long table_size; + unsigned long sym_offset; /* * Backward search using the @first_jump_src links, these help avoid @@ -2046,7 +2046,10 @@ static void find_jump_table(struct objtool_file *file, struct symbol *func, table_reloc = arch_find_switch_table(file, insn, &table_size); if (!table_reloc) continue; - dest_insn = find_insn(file, table_reloc->sym->sec, reloc_addend(table_reloc)); + + sym_offset = table_reloc->sym->offset + reloc_addend(table_reloc); + + dest_insn = find_insn(file, table_reloc->sym->sec, sym_offset); if (!dest_insn || !insn_func(dest_insn) || insn_func(dest_insn)->pfunc != func) continue; @@ -4449,35 +4452,6 @@ static int validate_sls(struct objtool_file *file) return warnings; } -static bool ignore_noreturn_call(struct instruction *insn) -{ - struct symbol *call_dest = insn_call_dest(insn); - - /* - * FIXME: hack, we need a real noreturn solution - * - * Problem is, exc_double_fault() may or may not return, depending on - * whether CONFIG_X86_ESPFIX64 is set. But objtool has no visibility - * to the kernel config. - * - * Other potential ways to fix it: - * - * - have compiler communicate __noreturn functions somehow - * - remove CONFIG_X86_ESPFIX64 - * - read the .config file - * - add a cmdline option - * - create a generic objtool annotation format (vs a bunch of custom - * formats) and annotate it - */ - if (!strcmp(call_dest->name, "exc_double_fault")) { - /* prevent further unreachable warnings for the caller */ - insn->sym->warned = 1; - return true; - } - - return false; -} - static int validate_reachable_instructions(struct objtool_file *file) { struct instruction *insn, *prev_insn; @@ -4494,8 +4468,8 @@ static int validate_reachable_instructions(struct objtool_file *file) prev_insn = prev_insn_same_sec(file, insn); if (prev_insn && prev_insn->dead_end) { call_dest = insn_call_dest(prev_insn); - if (call_dest && !ignore_noreturn_call(prev_insn)) { - WARN_INSN(insn, "%s() is missing a __noreturn annotation", + if (call_dest) { + WARN_INSN(insn, "%s() missing __noreturn in .c/.h or NORETURN() in noreturns.h", call_dest->name); warnings++; continue; @@ -4565,7 +4539,7 @@ static int disas_warned_funcs(struct objtool_file *file) char *funcs = NULL, *tmp; for_each_sym(file, sym) { - if (sym->warned) { + if (sym->warnings) { if (!funcs) { funcs = malloc(strlen(sym->name) + 1); strcpy(funcs, sym->name); @@ -4623,8 +4597,10 @@ int check(struct objtool_file *file) init_cfi_state(&force_undefined_cfi); force_undefined_cfi.force_undefined = true; - if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) + if (!cfi_hash_alloc(1UL << (file->elf->symbol_bits - 3))) { + ret = -1; goto out; + } cfi_hash_add(&init_cfi); cfi_hash_add(&func_cfi); @@ -4641,7 +4617,7 @@ int check(struct objtool_file *file) if (opts.retpoline) { ret = validate_retpoline(file); if (ret < 0) - return ret; + goto out; warnings += ret; } @@ -4677,7 +4653,7 @@ int check(struct objtool_file *file) */ ret = validate_unrets(file); if (ret < 0) - return ret; + goto out; warnings += ret; } @@ -4740,7 +4716,7 @@ int check(struct objtool_file *file) if (opts.prefix) { ret = add_prefix_symbols(file); if (ret < 0) - return ret; + goto out; warnings += ret; } @@ -4772,9 +4748,18 @@ int check(struct objtool_file *file) out: /* - * For now, don't fail the kernel build on fatal warnings. These - * errors are still fairly common due to the growing matrix of - * supported toolchains and their recent pace of change. + * CONFIG_OBJTOOL_WERROR upgrades all warnings (and errors) to actual + * errors. + * + * Note that even "fatal" type errors don't actually return an error + * without CONFIG_OBJTOOL_WERROR. That probably needs improved at some + * point. */ + if (opts.werror && (ret || warnings)) { + if (warnings) + WARN("%d warning(s) upgraded to errors", warnings); + return 1; + } + return 0; } diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index 6f64d611faea..be4f4b62730c 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -1302,9 +1302,6 @@ int elf_write(struct elf *elf) struct section *sec; Elf_Scn *s; - if (opts.dryrun) - return 0; - /* Update changed relocation sections and section headers: */ list_for_each_entry(sec, &elf->sections, list) { if (sec->truncate) diff --git a/tools/objtool/include/objtool/arch.h b/tools/objtool/include/objtool/arch.h index d63b46a19f39..089a1acc48a8 100644 --- a/tools/objtool/include/objtool/arch.h +++ b/tools/objtool/include/objtool/arch.h @@ -97,4 +97,7 @@ int arch_rewrite_retpolines(struct objtool_file *file); bool arch_pc_relative_reloc(struct reloc *reloc); +unsigned int arch_reloc_size(struct reloc *reloc); +unsigned long arch_jump_table_sym_offset(struct reloc *reloc, struct reloc *table); + #endif /* _ARCH_H */ diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index fcca6662c8b4..0fafd0f7a209 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -29,15 +29,16 @@ struct opts { /* options: */ bool backtrace; - bool backup; bool dryrun; bool link; bool mnop; bool module; bool no_unreachable; + const char *output; bool sec_address; bool stats; bool verbose; + bool werror; }; extern struct opts opts; diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index d7e815c2fd15..223ac1c24b90 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -65,10 +65,10 @@ struct symbol { u8 return_thunk : 1; u8 fentry : 1; u8 profiling_func : 1; - u8 warned : 1; u8 embedded_insn : 1; u8 local_label : 1; u8 frame_pointer : 1; + u8 warnings : 2; struct list_head pv_target; struct reloc *relocs; }; diff --git a/tools/objtool/include/objtool/warn.h b/tools/objtool/include/objtool/warn.h index ac04d3fe4dd9..e72b9d630551 100644 --- a/tools/objtool/include/objtool/warn.h +++ b/tools/objtool/include/objtool/warn.h @@ -43,8 +43,10 @@ static inline char *offstr(struct section *sec, unsigned long offset) #define WARN(format, ...) \ fprintf(stderr, \ - "%s: warning: objtool: " format "\n", \ - objname, ##__VA_ARGS__) + "%s: %s: objtool: " format "\n", \ + objname, \ + opts.werror ? "error" : "warning", \ + ##__VA_ARGS__) #define WARN_FUNC(format, sec, offset, ...) \ ({ \ @@ -53,14 +55,22 @@ static inline char *offstr(struct section *sec, unsigned long offset) free(_str); \ }) +#define WARN_LIMIT 2 + #define WARN_INSN(insn, format, ...) \ ({ \ struct instruction *_insn = (insn); \ - if (!_insn->sym || !_insn->sym->warned) \ + BUILD_BUG_ON(WARN_LIMIT > 2); \ + if (!_insn->sym || _insn->sym->warnings < WARN_LIMIT) { \ WARN_FUNC(format, _insn->sec, _insn->offset, \ ##__VA_ARGS__); \ - if (_insn->sym) \ - _insn->sym->warned = 1; \ + if (_insn->sym) \ + _insn->sym->warnings++; \ + } else if (_insn->sym && _insn->sym->warnings == WARN_LIMIT) { \ + WARN_FUNC("skipping duplicate warning(s)", \ + _insn->sec, _insn->offset); \ + _insn->sym->warnings++; \ + } \ }) #define BT_INSN(insn, format, ...) \ diff --git a/tools/objtool/noreturns.h b/tools/objtool/noreturns.h index 6bb7edda3094..eacfe3b0a8d1 100644 --- a/tools/objtool/noreturns.h +++ b/tools/objtool/noreturns.h @@ -16,6 +16,7 @@ NORETURN(__tdx_hypercall_failed) NORETURN(__ubsan_handle_builtin_unreachable) NORETURN(__x64_sys_exit) NORETURN(__x64_sys_exit_group) +NORETURN(acpi_processor_ffh_play_dead) NORETURN(arch_cpu_idle_dead) NORETURN(bch2_trans_in_restart_error) NORETURN(bch2_trans_restart_error) @@ -34,6 +35,7 @@ NORETURN(kunit_try_catch_throw) NORETURN(machine_real_restart) NORETURN(make_task_dead) NORETURN(mpt_halt_firmware) +NORETURN(mwait_play_dead) NORETURN(nmi_panic_self_stop) NORETURN(panic) NORETURN(panic_smp_self_stop) diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c index f40febdd6e36..1c73fb62fd57 100644 --- a/tools/objtool/objtool.c +++ b/tools/objtool/objtool.c @@ -18,87 +18,19 @@ bool help; -const char *objname; static struct objtool_file file; -static bool objtool_create_backup(const char *_objname) +struct objtool_file *objtool_open_read(const char *filename) { - int len = strlen(_objname); - char *buf, *base, *name = malloc(len+6); - int s, d, l, t; - - if (!name) { - perror("failed backup name malloc"); - return false; - } - - strcpy(name, _objname); - strcpy(name + len, ".orig"); - - d = open(name, O_CREAT|O_WRONLY|O_TRUNC, 0644); - if (d < 0) { - perror("failed to create backup file"); - return false; - } - - s = open(_objname, O_RDONLY); - if (s < 0) { - perror("failed to open orig file"); - return false; - } - - buf = malloc(4096); - if (!buf) { - perror("failed backup data malloc"); - return false; - } - - while ((l = read(s, buf, 4096)) > 0) { - base = buf; - do { - t = write(d, base, l); - if (t < 0) { - perror("failed backup write"); - return false; - } - base += t; - l -= t; - } while (l); - } - - if (l < 0) { - perror("failed backup read"); - return false; - } - - free(name); - free(buf); - close(d); - close(s); - - return true; -} - -struct objtool_file *objtool_open_read(const char *_objname) -{ - if (objname) { - if (strcmp(objname, _objname)) { - WARN("won't handle more than one file at a time"); - return NULL; - } - return &file; + if (file.elf) { + WARN("won't handle more than one file at a time"); + return NULL; } - objname = _objname; - file.elf = elf_open_read(objname, O_RDWR); + file.elf = elf_open_read(filename, O_RDWR); if (!file.elf) return NULL; - if (opts.backup && !objtool_create_backup(objname)) { - WARN("can't create backup file"); - return NULL; - } - hash_init(file.insn_hash); INIT_LIST_HEAD(&file.retpoline_call_list); INIT_LIST_HEAD(&file.return_thunk_list); diff --git a/tools/objtool/orc_dump.c b/tools/objtool/orc_dump.c index a62247efb64f..05ef0e297837 100644 --- a/tools/objtool/orc_dump.c +++ b/tools/objtool/orc_dump.c @@ -10,7 +10,7 @@ #include <objtool/warn.h> #include <objtool/endianness.h> -int orc_dump(const char *_objname) +int orc_dump(const char *filename) { int fd, nr_entries, i, *orc_ip = NULL, orc_size = 0; struct orc_entry *orc = NULL; @@ -26,12 +26,9 @@ int orc_dump(const char *_objname) Elf_Data *data, *symtab = NULL, *rela_orc_ip = NULL; struct elf dummy_elf = {}; - - objname = _objname; - elf_version(EV_CURRENT); - fd = open(objname, O_RDONLY); + fd = open(filename, O_RDONLY); if (fd == -1) { perror("open"); return -1; diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index d3c6e10dce73..a4499e5a6f9c 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -26,8 +26,6 @@ FILES=( "include/linux/hash.h" "include/linux/list-sort.h" "include/uapi/linux/hw_breakpoint.h" - "arch/x86/include/asm/disabled-features.h" - "arch/x86/include/asm/required-features.h" "arch/x86/include/asm/cpufeatures.h" "arch/x86/include/asm/inat_types.h" "arch/x86/include/asm/emulate_prefix.h" diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 51a95239fe06..835123add0ed 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -52,8 +52,11 @@ DESTDIR ?= # and _should_ modify the PACKAGE_BUGREPORT definition VERSION:= $(shell ./utils/version-gen.sh) -LIB_MAJ= 0.0.1 -LIB_MIN= 1 +LIB_FIX= 1 +LIB_MIN= 0 +LIB_MAJ= 1 +LIB_VER= $(LIB_MAJ).$(LIB_MIN).$(LIB_FIX) + PACKAGE = cpupower PACKAGE_BUGREPORT = linux-pm@vger.kernel.org @@ -200,14 +203,14 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS) $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c -$(OUTPUT)libcpupower.so.$(LIB_MAJ): $(LIB_OBJS) +$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS) $(ECHO) " LD " $@ $(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \ - -Wl,-soname,libcpupower.so.$(LIB_MIN) $(LIB_OBJS) + -Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS) @ln -sf $(@F) $(OUTPUT)libcpupower.so - @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MIN) + @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ) -libcpupower: $(OUTPUT)libcpupower.so.$(LIB_MAJ) +libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER) # Let all .o files depend on its .c file and all headers # Might be worth to put this into utils/Makefile at some point of time @@ -217,7 +220,7 @@ $(OUTPUT)%.o: %.c $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c -$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_MAJ) +$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER) $(ECHO) " CC " $@ ifeq ($(strip $(STATIC)),true) $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@ @@ -262,7 +265,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot done; endif -compile-bench: $(OUTPUT)libcpupower.so.$(LIB_MAJ) +compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER) @V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) # we compile into subdirectories. if the target directory is not the diff --git a/tools/power/cpupower/bench/parse.c b/tools/power/cpupower/bench/parse.c index 080678d9d74e..bd67c758b33a 100644 --- a/tools/power/cpupower/bench/parse.c +++ b/tools/power/cpupower/bench/parse.c @@ -121,6 +121,10 @@ out_dir: struct config *prepare_default_config() { struct config *config = malloc(sizeof(struct config)); + if (!config) { + perror("malloc"); + return NULL; + } dprintf("loading defaults\n"); diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index 7a2ef691b20e..ce8dfb8e46ab 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -10,6 +10,7 @@ #include <stdio.h> #include <errno.h> #include <stdlib.h> +#include <string.h> #include "cpupower.h" #include "cpupower_intern.h" @@ -150,15 +151,25 @@ static int __compare(const void *t1, const void *t2) return 0; } +static int __compare_core_cpu_list(const void *t1, const void *t2) +{ + struct cpuid_core_info *top1 = (struct cpuid_core_info *)t1; + struct cpuid_core_info *top2 = (struct cpuid_core_info *)t2; + + return strcmp(top1->core_cpu_list, top2->core_cpu_list); +} + /* * Returns amount of cpus, negative on error, cpu_top must be * passed to cpu_topology_release to free resources * - * Array is sorted after ->pkg, ->core, then ->cpu + * Array is sorted after ->cpu_smt_list ->pkg, ->core */ int get_cpu_topology(struct cpupower_topology *cpu_top) { int cpu, last_pkg, cpus = sysconf(_SC_NPROCESSORS_CONF); + char path[SYSFS_PATH_MAX]; + char *last_cpu_list; cpu_top->core_info = malloc(sizeof(struct cpuid_core_info) * cpus); if (cpu_top->core_info == NULL) @@ -183,6 +194,34 @@ int get_cpu_topology(struct cpupower_topology *cpu_top) cpu_top->core_info[cpu].core = -1; continue; } + if (cpu_top->core_info[cpu].core == -1) { + strncpy(cpu_top->core_info[cpu].core_cpu_list, "-1", CPULIST_BUFFER); + continue; + } + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/topology/%s", + cpu, "core_cpus_list"); + if (cpupower_read_sysfs( + path, + cpu_top->core_info[cpu].core_cpu_list, + CPULIST_BUFFER) < 1) { + printf("Warning CPU%u has a 0 size core_cpus_list string", cpu); + } + } + + /* Count the number of distinct cpu lists to get the physical core + * count. + */ + qsort(cpu_top->core_info, cpus, sizeof(struct cpuid_core_info), + __compare_core_cpu_list); + + last_cpu_list = cpu_top->core_info[0].core_cpu_list; + cpu_top->cores = 1; + for (cpu = 1; cpu < cpus; cpu++) { + if (strcmp(cpu_top->core_info[cpu].core_cpu_list, last_cpu_list) != 0 && + cpu_top->core_info[cpu].pkg != -1) { + last_cpu_list = cpu_top->core_info[cpu].core_cpu_list; + cpu_top->cores++; + } } qsort(cpu_top->core_info, cpus, sizeof(struct cpuid_core_info), @@ -203,13 +242,6 @@ int get_cpu_topology(struct cpupower_topology *cpu_top) if (!(cpu_top->core_info[0].pkg == -1)) cpu_top->pkgs++; - /* Intel's cores count is not consecutively numbered, there may - * be a core_id of 3, but none of 2. Assume there always is 0 - * Get amount of cores by counting duplicates in a package - for (cpu = 0; cpu_top->core_info[cpu].pkg = 0 && cpu < cpus; cpu++) { - if (cpu_top->core_info[cpu].core == 0) - cpu_top->cores++; - */ return cpus; } diff --git a/tools/power/cpupower/lib/cpupower.h b/tools/power/cpupower/lib/cpupower.h index e4e4292eacec..2e67a080f203 100644 --- a/tools/power/cpupower/lib/cpupower.h +++ b/tools/power/cpupower/lib/cpupower.h @@ -2,6 +2,8 @@ #ifndef __CPUPOWER_CPUPOWER_H__ #define __CPUPOWER_CPUPOWER_H__ +#define CPULIST_BUFFER 5 + struct cpupower_topology { /* Amount of CPU cores, packages and threads per core in the system */ unsigned int cores; @@ -16,6 +18,7 @@ struct cpuid_core_info { int pkg; int core; int cpu; + char core_cpu_list[CPULIST_BUFFER]; /* flags */ unsigned int is_online:1; diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index f746099b5dac..ad493157f826 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -6,6 +6,7 @@ */ +#include <errno.h> #include <stdio.h> #include <unistd.h> #include <stdlib.h> @@ -91,7 +92,11 @@ int fill_string_with_spaces(char *s, int n) return 0; } -#define MAX_COL_WIDTH 6 +#define MAX_COL_WIDTH 6 +#define TOPOLOGY_DEPTH_PKG 3 +#define TOPOLOGY_DEPTH_CORE 2 +#define TOPOLOGY_DEPTH_CPU 1 + void print_header(int topology_depth) { int unsigned mon; @@ -113,12 +118,19 @@ void print_header(int topology_depth) } printf("\n"); - if (topology_depth > 2) + switch (topology_depth) { + case TOPOLOGY_DEPTH_PKG: printf(" PKG|"); - if (topology_depth > 1) + break; + case TOPOLOGY_DEPTH_CORE: printf("CORE|"); - if (topology_depth > 0) + break; + case TOPOLOGY_DEPTH_CPU: printf(" CPU|"); + break; + default: + return; + } for (mon = 0; mon < avail_monitors; mon++) { if (mon != 0) @@ -152,12 +164,19 @@ void print_results(int topology_depth, int cpu) cpu_top.core_info[cpu].pkg == -1) return; - if (topology_depth > 2) + switch (topology_depth) { + case TOPOLOGY_DEPTH_PKG: printf("%4d|", cpu_top.core_info[cpu].pkg); - if (topology_depth > 1) + break; + case TOPOLOGY_DEPTH_CORE: printf("%4d|", cpu_top.core_info[cpu].core); - if (topology_depth > 0) + break; + case TOPOLOGY_DEPTH_CPU: printf("%4d|", cpu_top.core_info[cpu].cpu); + break; + default: + return; + } for (mon = 0; mon < avail_monitors; mon++) { if (mon != 0) @@ -294,7 +313,10 @@ int fork_it(char **argv) if (!child_pid) { /* child */ - execvp(argv[0], argv); + if (execvp(argv[0], argv) == -1) { + printf("Invalid monitor command %s\n", argv[0]); + exit(errno); + } } else { /* parent */ if (child_pid == -1) { @@ -423,11 +445,13 @@ int cmd_monitor(int argc, char **argv) if (avail_monitors == 0) { printf(_("No HW Cstate monitors found\n")); + cpu_topology_release(cpu_top); return 1; } if (mode == list) { list_monitors(); + cpu_topology_release(cpu_top); exit(EXIT_SUCCESS); } @@ -448,15 +472,15 @@ int cmd_monitor(int argc, char **argv) /* ToDo: Topology parsing needs fixing first to do this more generically */ if (cpu_top.pkgs > 1) - print_header(3); + print_header(TOPOLOGY_DEPTH_PKG); else - print_header(1); + print_header(TOPOLOGY_DEPTH_CPU); for (cpu = 0; cpu < cpu_count; cpu++) { if (cpu_top.pkgs > 1) - print_results(3, cpu); + print_results(TOPOLOGY_DEPTH_PKG, cpu); else - print_results(1, cpu); + print_results(TOPOLOGY_DEPTH_CPU, cpu); } for (num = 0; num < avail_monitors; num++) { diff --git a/tools/power/x86/intel-speed-select/Makefile b/tools/power/x86/intel-speed-select/Makefile index 7221f2f55e8b..8d3a02a20f3d 100644 --- a/tools/power/x86/intel-speed-select/Makefile +++ b/tools/power/x86/intel-speed-select/Makefile @@ -13,7 +13,7 @@ endif # Do not use make's built-in rules # (this improves performance and avoids hard-to-debug behaviour); MAKEFLAGS += -r -override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include -I/usr/include/libnl3 +override CFLAGS += -O2 -Wall -g -D_GNU_SOURCE -I$(OUTPUT)include -I$(shell $(CC) -print-sysroot)/usr/include/libnl3 override LDFLAGS += -lnl-genl-3 -lnl-3 ALL_TARGETS := intel-speed-select diff --git a/tools/power/x86/intel-speed-select/isst-config.c b/tools/power/x86/intel-speed-select/isst-config.c index fadfb02b8611..eaa420ac848d 100644 --- a/tools/power/x86/intel-speed-select/isst-config.c +++ b/tools/power/x86/intel-speed-select/isst-config.c @@ -16,7 +16,7 @@ struct process_cmd_struct { int arg; }; -static const char *version_str = "v1.21"; +static const char *version_str = "v1.22"; static const int supported_api_ver = 3; static struct isst_if_platform_info isst_platform_info; @@ -46,8 +46,9 @@ static int force_online_offline; static int auto_mode; static int fact_enable_fail; static int cgroupv2; +static int max_pkg_id; static int max_die_id; -static int max_punit_id; +static int max_die_id_package_0; /* clos related */ static int current_clos = -1; @@ -557,6 +558,8 @@ void for_each_online_power_domain_in_set(void (*callback)(struct isst_id *, void if (id.pkg < 0 || id.die < 0 || id.punit < 0) continue; + id.die = id.die % (max_die_id_package_0 + 1); + valid_mask[id.pkg][id.die] = 1; if (cpus[id.pkg][id.die][id.punit] == -1) @@ -564,11 +567,11 @@ void for_each_online_power_domain_in_set(void (*callback)(struct isst_id *, void } for (i = 0; i < MAX_PACKAGE_COUNT; i++) { - if (max_die_id == max_punit_id) { + if (max_die_id > max_pkg_id) { for (k = 0; k < MAX_PUNIT_PER_DIE && k < MAX_DIE_PER_PACKAGE; k++) { id.cpu = cpus[i][k][k]; id.pkg = i; - id.die = k; + id.die = get_physical_die_id(id.cpu); id.punit = k; if (isst_is_punit_valid(&id)) callback(&id, arg1, arg2, arg3, arg4); @@ -586,7 +589,10 @@ void for_each_online_power_domain_in_set(void (*callback)(struct isst_id *, void for (k = 0; k < MAX_PUNIT_PER_DIE; k++) { id.cpu = cpus[i][j][k]; id.pkg = i; - id.die = j; + if (id.cpu >= 0) + id.die = get_physical_die_id(id.cpu); + else + id.die = id.pkg; id.punit = k; if (isst_is_punit_valid(&id)) callback(&id, arg1, arg2, arg3, arg4); @@ -788,6 +794,8 @@ static void create_cpu_map(void) cpu_map[i].die_id = die_id; cpu_map[i].core_id = core_id; + if (max_pkg_id < pkg_id) + max_pkg_id = pkg_id; punit_id = 0; @@ -812,8 +820,8 @@ static void create_cpu_map(void) if (max_die_id < die_id) max_die_id = die_id; - if (max_punit_id < cpu_map[i].punit_id) - max_punit_id = cpu_map[i].punit_id; + if (!pkg_id && max_die_id_package_0 < die_id) + max_die_id_package_0 = die_id; debug_printf( "map logical_cpu:%d core: %d die:%d pkg:%d punit:%d punit_cpu:%d punit_core:%d\n", diff --git a/tools/power/x86/intel-speed-select/isst-display.c b/tools/power/x86/intel-speed-select/isst-display.c index 07ebd08f3202..da5a59a4c545 100644 --- a/tools/power/x86/intel-speed-select/isst-display.c +++ b/tools/power/x86/intel-speed-select/isst-display.c @@ -173,7 +173,11 @@ static int print_package_info(struct isst_id *id, FILE *outf) if (out_format_is_json()) { if (api_version() > 1) { - if (id->cpu < 0) + if (id->die < 0 && id->cpu < 0) + snprintf(header, sizeof(header), + "package-%d:die-IO:powerdomain-%d:cpu-None", + id->pkg, id->punit); + else if (id->cpu < 0) snprintf(header, sizeof(header), "package-%d:die-%d:powerdomain-%d:cpu-None", id->pkg, id->die, id->punit); @@ -190,7 +194,10 @@ static int print_package_info(struct isst_id *id, FILE *outf) } snprintf(header, sizeof(header), "package-%d", id->pkg); format_and_print(outf, level++, header, NULL); - snprintf(header, sizeof(header), "die-%d", id->die); + if (id->die < 0) + snprintf(header, sizeof(header), "die-IO"); + else + snprintf(header, sizeof(header), "die-%d", id->die); format_and_print(outf, level++, header, NULL); if (api_version() > 1) { snprintf(header, sizeof(header), "powerdomain-%d", id->punit); diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8d5011a0bf60..26057af6b5a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1056,7 +1056,7 @@ static const struct platform_data turbostat_pdata[] = { * Missing support for * INTEL_ICELAKE * INTEL_ATOM_SILVERMONT_MID - * INTEL_ATOM_AIRMONT_MID + * INTEL_ATOM_SILVERMONT_MID2 * INTEL_ATOM_AIRMONT_NP */ { 0, NULL }, diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 7849405614b1..dc4333d23189 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -7,6 +7,13 @@ #ifndef __SCX_COMMON_BPF_H #define __SCX_COMMON_BPF_H +/* + * The generated kfunc prototypes in vmlinux.h are missing address space + * attributes which cause build failures. For now, suppress the generated + * prototypes. See https://github.com/sched-ext/scx/issues/1111. + */ +#define BPF_NO_KFUNC_PROTOTYPES + #ifdef LSP #define __bpf__ #include "../vmlinux.h" @@ -18,6 +25,7 @@ #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> #include "user_exit_info.h" +#include "enum_defs.autogen.h" #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ @@ -62,21 +70,28 @@ void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; +u32 scx_bpf_nr_node_ids(void) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; +int scx_bpf_cpu_node(s32 cpu) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; +const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak; const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; +const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; +s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; +s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak; s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; +void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; /* * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from @@ -84,6 +99,9 @@ u64 scx_bpf_now(void) __ksym __weak; */ #define BPF_FOR_EACH_ITER (&___it) +#define scx_read_event(e, name) \ + (bpf_core_field_exists((e)->name) ? (e)->name : 0) + static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} @@ -584,6 +602,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __u.__val; \ }) +#define READ_ONCE_ARENA(type, x) \ +({ \ + union { type __val; char __c[1]; } __u = \ + { .__c = { 0 } }; \ + __read_once_size((void *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + +#define WRITE_ONCE_ARENA(type, x, val) \ +({ \ + union { type __val; char __c[1]; } __u = \ + { .__val = (val) }; \ + __write_once_size((void *)&(x), __u.__c, sizeof(x)); \ + __u.__val; \ +}) + /* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index dc18b99e55cd..1dc76bd84296 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -16,6 +16,7 @@ #include <stdlib.h> #include <stdint.h> #include <errno.h> +#include "enum_defs.autogen.h" typedef uint8_t u8; typedef uint16_t u16; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 50e1499ae093..9252e1a00556 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -125,12 +125,107 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, false; \ }) +/** + * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on + * in a compatible way. We will preserve this __COMPAT helper until v6.16. + * + * @enq_flags: enqueue flags from ops.enqueue() + * + * Return: True if SCX_ENQ_CPU_SELECTED is turned on in @enq_flags + */ +static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) +{ +#ifdef HAVE_SCX_ENQ_CPU_SELECTED + /* + * This is the case that a BPF code compiled against vmlinux.h + * where the enum SCX_ENQ_CPU_SELECTED exists. + */ + + /* + * We should temporarily suspend the macro expansion of + * 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED' being + * rewritten to '__SCX_ENQ_CPU_SELECTED' when 'SCX_ENQ_CPU_SELECTED' + * is defined in 'scripts/gen_enums.py'. + */ +#pragma push_macro("SCX_ENQ_CPU_SELECTED") +#undef SCX_ENQ_CPU_SELECTED + u64 flag; + + /* + * When the kernel did not have SCX_ENQ_CPU_SELECTED, + * select_task_rq_scx() has never been skipped. Thus, this case + * should be considered that the CPU has already been selected. + */ + if (!bpf_core_enum_value_exists(enum scx_enq_flags, + SCX_ENQ_CPU_SELECTED)) + return true; + + flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED); + return enq_flags & flag; + + /* + * Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'. + */ +#pragma pop_macro("SCX_ENQ_CPU_SELECTED") +#else + /* + * This is the case that a BPF code compiled against vmlinux.h + * where the enum SCX_ENQ_CPU_SELECTED does NOT exist. + */ + return true; +#endif /* HAVE_SCX_ENQ_CPU_SELECTED */ +} + + #define scx_bpf_now() \ (bpf_ksym_exists(scx_bpf_now) ? \ scx_bpf_now() : \ bpf_ktime_get_ns()) /* + * v6.15: Introduce event counters. + * + * Preserve the following macro until v6.17. + */ +#define __COMPAT_scx_bpf_events(events, size) \ + (bpf_ksym_exists(scx_bpf_events) ? \ + scx_bpf_events(events, size) : ({})) + +/* + * v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle + * cpumasks. + * + * Preserve the following __COMPAT_scx_*_node macros until v6.17. + */ +#define __COMPAT_scx_bpf_nr_node_ids() \ + (bpf_ksym_exists(scx_bpf_nr_node_ids) ? \ + scx_bpf_nr_node_ids() : 1U) + +#define __COMPAT_scx_bpf_cpu_node(cpu) \ + (bpf_ksym_exists(scx_bpf_cpu_node) ? \ + scx_bpf_cpu_node(cpu) : 0) + +#define __COMPAT_scx_bpf_get_idle_cpumask_node(node) \ + (bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ? \ + scx_bpf_get_idle_cpumask_node(node) : \ + scx_bpf_get_idle_cpumask()) \ + +#define __COMPAT_scx_bpf_get_idle_smtmask_node(node) \ + (bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ? \ + scx_bpf_get_idle_smtmask_node(node) : \ + scx_bpf_get_idle_smtmask()) + +#define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) \ + (bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ? \ + scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) : \ + scx_bpf_pick_idle_cpu(cpus_allowed, flags)) + +#define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) \ + (bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ? \ + scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) : \ + scx_bpf_pick_any_cpu(cpus_allowed, flags)) + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index b50280e2ba2b..35c67c5174ac 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -106,8 +106,20 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field return false; } -#define SCX_OPS_SWITCH_PARTIAL \ - __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") +#define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name) + +#define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE) +#define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST) +#define SCX_OPS_ENQ_EXITING SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING) +#define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL) +#define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED) +#define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP) +#define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE) + +#define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name) + +#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE) +#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE) static inline long scx_hotplug_seq(void) { diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h new file mode 100644 index 000000000000..6e6c45f14fe1 --- /dev/null +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -0,0 +1,120 @@ +/* + * WARNING: This file is autogenerated from gen_enum_defs.py [1]. + * + * [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py + */ + +#ifndef __ENUM_DEFS_AUTOGEN_H__ +#define __ENUM_DEFS_AUTOGEN_H__ + +#define HAVE_SCX_DSP_DFL_MAX_BATCH +#define HAVE_SCX_DSP_MAX_LOOPS +#define HAVE_SCX_WATCHDOG_MAX_TIMEOUT +#define HAVE_SCX_EXIT_BT_LEN +#define HAVE_SCX_EXIT_MSG_LEN +#define HAVE_SCX_EXIT_DUMP_DFL_LEN +#define HAVE_SCX_CPUPERF_ONE +#define HAVE_SCX_OPS_TASK_ITER_BATCH +#define HAVE_SCX_CPU_PREEMPT_RT +#define HAVE_SCX_CPU_PREEMPT_DL +#define HAVE_SCX_CPU_PREEMPT_STOP +#define HAVE_SCX_CPU_PREEMPT_UNKNOWN +#define HAVE_SCX_DEQ_SLEEP +#define HAVE_SCX_DEQ_CORE_SCHED_EXEC +#define HAVE_SCX_DSQ_FLAG_BUILTIN +#define HAVE_SCX_DSQ_FLAG_LOCAL_ON +#define HAVE_SCX_DSQ_INVALID +#define HAVE_SCX_DSQ_GLOBAL +#define HAVE_SCX_DSQ_LOCAL +#define HAVE_SCX_DSQ_LOCAL_ON +#define HAVE_SCX_DSQ_LOCAL_CPU_MASK +#define HAVE_SCX_DSQ_ITER_REV +#define HAVE___SCX_DSQ_ITER_HAS_SLICE +#define HAVE___SCX_DSQ_ITER_HAS_VTIME +#define HAVE___SCX_DSQ_ITER_USER_FLAGS +#define HAVE___SCX_DSQ_ITER_ALL_FLAGS +#define HAVE_SCX_DSQ_LNODE_ITER_CURSOR +#define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT +#define HAVE_SCX_ENQ_WAKEUP +#define HAVE_SCX_ENQ_HEAD +#define HAVE_SCX_ENQ_CPU_SELECTED +#define HAVE_SCX_ENQ_PREEMPT +#define HAVE_SCX_ENQ_REENQ +#define HAVE_SCX_ENQ_LAST +#define HAVE___SCX_ENQ_INTERNAL_MASK +#define HAVE_SCX_ENQ_CLEAR_OPSS +#define HAVE_SCX_ENQ_DSQ_PRIQ +#define HAVE_SCX_TASK_DSQ_ON_PRIQ +#define HAVE_SCX_TASK_QUEUED +#define HAVE_SCX_TASK_RESET_RUNNABLE_AT +#define HAVE_SCX_TASK_DEQD_FOR_SLEEP +#define HAVE_SCX_TASK_STATE_SHIFT +#define HAVE_SCX_TASK_STATE_BITS +#define HAVE_SCX_TASK_STATE_MASK +#define HAVE_SCX_TASK_CURSOR +#define HAVE_SCX_ECODE_RSN_HOTPLUG +#define HAVE_SCX_ECODE_ACT_RESTART +#define HAVE_SCX_EXIT_NONE +#define HAVE_SCX_EXIT_DONE +#define HAVE_SCX_EXIT_UNREG +#define HAVE_SCX_EXIT_UNREG_BPF +#define HAVE_SCX_EXIT_UNREG_KERN +#define HAVE_SCX_EXIT_SYSRQ +#define HAVE_SCX_EXIT_ERROR +#define HAVE_SCX_EXIT_ERROR_BPF +#define HAVE_SCX_EXIT_ERROR_STALL +#define HAVE_SCX_KF_UNLOCKED +#define HAVE_SCX_KF_CPU_RELEASE +#define HAVE_SCX_KF_DISPATCH +#define HAVE_SCX_KF_ENQUEUE +#define HAVE_SCX_KF_SELECT_CPU +#define HAVE_SCX_KF_REST +#define HAVE___SCX_KF_RQ_LOCKED +#define HAVE___SCX_KF_TERMINAL +#define HAVE_SCX_KICK_IDLE +#define HAVE_SCX_KICK_PREEMPT +#define HAVE_SCX_KICK_WAIT +#define HAVE_SCX_OPI_BEGIN +#define HAVE_SCX_OPI_NORMAL_BEGIN +#define HAVE_SCX_OPI_NORMAL_END +#define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN +#define HAVE_SCX_OPI_CPU_HOTPLUG_END +#define HAVE_SCX_OPI_END +#define HAVE_SCX_OPS_ENABLING +#define HAVE_SCX_OPS_ENABLED +#define HAVE_SCX_OPS_DISABLING +#define HAVE_SCX_OPS_DISABLED +#define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE +#define HAVE_SCX_OPS_ENQ_LAST +#define HAVE_SCX_OPS_ENQ_EXITING +#define HAVE_SCX_OPS_SWITCH_PARTIAL +#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT +#define HAVE_SCX_OPS_ALL_FLAGS +#define HAVE_SCX_OPSS_NONE +#define HAVE_SCX_OPSS_QUEUEING +#define HAVE_SCX_OPSS_QUEUED +#define HAVE_SCX_OPSS_DISPATCHING +#define HAVE_SCX_OPSS_QSEQ_SHIFT +#define HAVE_SCX_PICK_IDLE_CORE +#define HAVE_SCX_OPS_NAME_LEN +#define HAVE_SCX_SLICE_DFL +#define HAVE_SCX_SLICE_INF +#define HAVE_SCX_RQ_ONLINE +#define HAVE_SCX_RQ_CAN_STOP_TICK +#define HAVE_SCX_RQ_BAL_PENDING +#define HAVE_SCX_RQ_BAL_KEEP +#define HAVE_SCX_RQ_BYPASSING +#define HAVE_SCX_RQ_IN_WAKEUP +#define HAVE_SCX_RQ_IN_BALANCE +#define HAVE_SCX_TASK_NONE +#define HAVE_SCX_TASK_INIT +#define HAVE_SCX_TASK_READY +#define HAVE_SCX_TASK_ENABLED +#define HAVE_SCX_TASK_NR_STATES +#define HAVE_SCX_TG_ONLINE +#define HAVE_SCX_TG_INITED +#define HAVE_SCX_WAKE_FORK +#define HAVE_SCX_WAKE_TTWU +#define HAVE_SCX_WAKE_SYNC + +#endif /* __ENUM_DEFS_AUTOGEN_H__ */ diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 1e9f74525d8f..6ba6e610eeaa 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -10,6 +10,7 @@ #include <unistd.h> #include <inttypes.h> #include <signal.h> +#include <assert.h> #include <libgen.h> #include <bpf/bpf.h> #include <scx/common.h> @@ -60,14 +61,22 @@ restart: skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids <= INT32_MAX); + while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; break; - case 'c': - skel->rodata->central_cpu = strtoul(optarg, NULL, 0); + case 'c': { + u32 central_cpu = strtoul(optarg, NULL, 0); + if (central_cpu >= skel->rodata->nr_cpu_ids) { + fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids); + return -1; + } + skel->rodata->central_cpu = (s32)central_cpu; break; + } case 'v': verbose = true; break; @@ -96,7 +105,7 @@ restart: */ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); - CPU_ZERO(cpuset); + CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset); CPU_SET(skel->rodata->central_cpu, cpuset); SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset), "Failed to affinitize to central CPU %d (max %d)", diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 3a20bb0c014a..26c40ca4f36c 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -231,7 +231,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) } /* if select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && + if (!__COMPAT_is_enq_cpu_selected(enq_flags) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); @@ -763,6 +763,8 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { + struct scx_event_stats events; + bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); @@ -772,6 +774,25 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) if (print_shared_dsq) dump_shared_dsq(); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SLICE_DFL", + scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; } diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 8daac70c2f9d..2694344274bf 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -35,6 +35,7 @@ TARGETS += filesystems/epoll TARGETS += filesystems/fat TARGETS += filesystems/overlayfs TARGETS += filesystems/statmount +TARGETS += filesystems/mount-notify TARGETS += firmware TARGETS += fpu TARGETS += ftrace @@ -113,6 +114,7 @@ endif TARGETS += tmpfs TARGETS += tpm2 TARGETS += tty +TARGETS += ublk TARGETS += uevent TARGETS += user_events TARGETS += vDSO diff --git a/tools/testing/selftests/arm64/fp/kernel-test.c b/tools/testing/selftests/arm64/fp/kernel-test.c index 348e8bef62c7..e3cec3723ffa 100644 --- a/tools/testing/selftests/arm64/fp/kernel-test.c +++ b/tools/testing/selftests/arm64/fp/kernel-test.c @@ -46,7 +46,6 @@ static void handle_kick_signal(int sig, siginfo_t *info, void *context) } static char *drivers[] = { - "crct10dif-arm64", "sha1-ce", "sha224-arm64", "sha224-arm64-neon", diff --git a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c index 303260a6dc65..3bfcd3848432 100644 --- a/tools/testing/selftests/arm64/mte/check_hugetlb_options.c +++ b/tools/testing/selftests/arm64/mte/check_hugetlb_options.c @@ -227,6 +227,8 @@ static int check_child_hugetlb_memory_mapping(int mem_type, int mode, int mappin int main(int argc, char *argv[]) { int err; + void *map_ptr; + unsigned long map_size; err = mte_default_setup(); if (err) @@ -243,6 +245,15 @@ int main(int argc, char *argv[]) return KSFT_FAIL; } + /* Check if MTE supports hugetlb mappings */ + map_size = default_huge_page_size(); + map_ptr = mmap(NULL, map_size, PROT_READ | PROT_MTE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); + if (map_ptr == MAP_FAILED) + ksft_exit_skip("PROT_MTE not supported with MAP_HUGETLB mappings\n"); + else + munmap(map_ptr, map_size); + /* Set test plan */ ksft_set_plan(12); @@ -270,13 +281,13 @@ int main(int argc, char *argv[]) "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap memory\n"); + "Check child hugetlb memory with private mapping, sync error mode and mmap memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap memory\n"); + "Check child hugetlb memory with private mapping, async error mode and mmap memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n"); + "Check child hugetlb memory with private mapping, sync error mode and mmap/mprotect memory\n"); evaluate_test(check_child_hugetlb_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE | MAP_HUGETLB), - "Check child hugetlb memory with private mapping, precise mode and mmap/mprotect memory\n"); + "Check child hugetlb memory with private mapping, async error mode and mmap/mprotect memory\n"); mte_restore_setup(); free_hugetlb(); diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 80844a5fb1fe..29541d486c5e 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -548,6 +548,34 @@ void close_netns(struct nstoken *token) free(token); } +int open_tuntap(const char *dev_name, bool need_mac) +{ + int err = 0; + struct ifreq ifr; + int fd = open("/dev/net/tun", O_RDWR); + + if (!ASSERT_GE(fd, 0, "open(/dev/net/tun)")) + return -1; + + ifr.ifr_flags = IFF_NO_PI | (need_mac ? IFF_TAP : IFF_TUN); + strncpy(ifr.ifr_name, dev_name, IFNAMSIZ - 1); + ifr.ifr_name[IFNAMSIZ - 1] = '\0'; + + err = ioctl(fd, TUNSETIFF, &ifr); + if (!ASSERT_OK(err, "ioctl(TUNSETIFF)")) { + close(fd); + return -1; + } + + err = fcntl(fd, F_SETFL, O_NONBLOCK); + if (!ASSERT_OK(err, "fcntl(O_NONBLOCK)")) { + close(fd); + return -1; + } + + return fd; +} + int get_socket_local_port(int sock_fd) { struct sockaddr_storage addr; diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index ebec8a8d6f81..9235976d0c50 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -8,6 +8,7 @@ typedef __u16 __sum16; #include <linux/if_ether.h> #include <linux/if_packet.h> +#include <linux/if_tun.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/ethtool.h> @@ -85,6 +86,8 @@ int get_socket_local_port(int sock_fd); int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param); +int open_tuntap(const char *dev_name, bool need_mac); + struct nstoken; /** * open_netns() - Switch to specified network namespace by name. diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h b/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h index fb1eb8c67361..ccec0fcdabc1 100644 --- a/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h +++ b/tools/testing/selftests/bpf/prog_tests/lwt_helpers.h @@ -5,7 +5,6 @@ #include <time.h> #include <net/if.h> -#include <linux/if_tun.h> #include <linux/icmp.h> #include "test_progs.h" @@ -37,34 +36,6 @@ static inline int netns_delete(void) return system("ip netns del " NETNS ">/dev/null 2>&1"); } -static int open_tuntap(const char *dev_name, bool need_mac) -{ - int err = 0; - struct ifreq ifr; - int fd = open("/dev/net/tun", O_RDWR); - - if (!ASSERT_GT(fd, 0, "open(/dev/net/tun)")) - return -1; - - ifr.ifr_flags = IFF_NO_PI | (need_mac ? IFF_TAP : IFF_TUN); - strncpy(ifr.ifr_name, dev_name, IFNAMSIZ - 1); - ifr.ifr_name[IFNAMSIZ - 1] = '\0'; - - err = ioctl(fd, TUNSETIFF, &ifr); - if (!ASSERT_OK(err, "ioctl(TUNSETIFF)")) { - close(fd); - return -1; - } - - err = fcntl(fd, F_SETFL, O_NONBLOCK); - if (!ASSERT_OK(err, "fcntl(O_NONBLOCK)")) { - close(fd); - return -1; - } - - return fd; -} - #define ICMP_PAYLOAD_SIZE 100 /* Match an ICMP packet with payload len ICMP_PAYLOAD_SIZE */ diff --git a/tools/testing/selftests/bpf/prog_tests/net_timestamping.c b/tools/testing/selftests/bpf/prog_tests/net_timestamping.c new file mode 100644 index 000000000000..dbfd87499b6b --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/net_timestamping.c @@ -0,0 +1,239 @@ +#include <linux/net_tstamp.h> +#include <sys/time.h> +#include <linux/errqueue.h> +#include "test_progs.h" +#include "network_helpers.h" +#include "net_timestamping.skel.h" + +#define CG_NAME "/net-timestamping-test" +#define NSEC_PER_SEC 1000000000LL + +static const char addr4_str[] = "127.0.0.1"; +static const char addr6_str[] = "::1"; +static struct net_timestamping *skel; +static const int cfg_payload_len = 30; +static struct timespec usr_ts; +static u64 delay_tolerance_nsec = 10000000000; /* 10 seconds */ +int SK_TS_SCHED; +int SK_TS_TXSW; +int SK_TS_ACK; + +static int64_t timespec_to_ns64(struct timespec *ts) +{ + return ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec; +} + +static void validate_key(int tskey, int tstype) +{ + static int expected_tskey = -1; + + if (tstype == SCM_TSTAMP_SCHED) + expected_tskey = cfg_payload_len - 1; + + ASSERT_EQ(expected_tskey, tskey, "tskey mismatch"); + + expected_tskey = tskey; +} + +static void validate_timestamp(struct timespec *cur, struct timespec *prev) +{ + int64_t cur_ns, prev_ns; + + cur_ns = timespec_to_ns64(cur); + prev_ns = timespec_to_ns64(prev); + + ASSERT_LT(cur_ns - prev_ns, delay_tolerance_nsec, "latency"); +} + +static void test_socket_timestamp(struct scm_timestamping *tss, int tstype, + int tskey) +{ + static struct timespec prev_ts; + + validate_key(tskey, tstype); + + switch (tstype) { + case SCM_TSTAMP_SCHED: + validate_timestamp(&tss->ts[0], &usr_ts); + SK_TS_SCHED += 1; + break; + case SCM_TSTAMP_SND: + validate_timestamp(&tss->ts[0], &prev_ts); + SK_TS_TXSW += 1; + break; + case SCM_TSTAMP_ACK: + validate_timestamp(&tss->ts[0], &prev_ts); + SK_TS_ACK += 1; + break; + } + + prev_ts = tss->ts[0]; +} + +static void test_recv_errmsg_cmsg(struct msghdr *msg) +{ + struct sock_extended_err *serr = NULL; + struct scm_timestamping *tss = NULL; + struct cmsghdr *cm; + + for (cm = CMSG_FIRSTHDR(msg); + cm && cm->cmsg_len; + cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_TIMESTAMPING) { + tss = (void *)CMSG_DATA(cm); + } else if ((cm->cmsg_level == SOL_IP && + cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_RECVERR) || + (cm->cmsg_level == SOL_PACKET && + cm->cmsg_type == PACKET_TX_TIMESTAMP)) { + serr = (void *)CMSG_DATA(cm); + ASSERT_EQ(serr->ee_origin, SO_EE_ORIGIN_TIMESTAMPING, + "cmsg type"); + } + + if (serr && tss) + test_socket_timestamp(tss, serr->ee_info, + serr->ee_data); + } +} + +static bool socket_recv_errmsg(int fd) +{ + static char ctrl[1024 /* overprovision*/]; + char data[cfg_payload_len]; + static struct msghdr msg; + struct iovec entry; + int n = 0; + + memset(&msg, 0, sizeof(msg)); + memset(&entry, 0, sizeof(entry)); + memset(ctrl, 0, sizeof(ctrl)); + + entry.iov_base = data; + entry.iov_len = cfg_payload_len; + msg.msg_iov = &entry; + msg.msg_iovlen = 1; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + n = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (n == -1) + ASSERT_EQ(errno, EAGAIN, "recvmsg MSG_ERRQUEUE"); + + if (n >= 0) + test_recv_errmsg_cmsg(&msg); + + return n == -1; +} + +static void test_socket_timestamping(int fd) +{ + while (!socket_recv_errmsg(fd)); + + ASSERT_EQ(SK_TS_SCHED, 1, "SCM_TSTAMP_SCHED"); + ASSERT_EQ(SK_TS_TXSW, 1, "SCM_TSTAMP_SND"); + ASSERT_EQ(SK_TS_ACK, 1, "SCM_TSTAMP_ACK"); + + SK_TS_SCHED = 0; + SK_TS_TXSW = 0; + SK_TS_ACK = 0; +} + +static void test_tcp(int family, bool enable_socket_timestamping) +{ + struct net_timestamping__bss *bss; + char buf[cfg_payload_len]; + int sfd = -1, cfd = -1; + unsigned int sock_opt; + struct netns_obj *ns; + int cg_fd; + int ret; + + cg_fd = test__join_cgroup(CG_NAME); + if (!ASSERT_OK_FD(cg_fd, "join cgroup")) + return; + + ns = netns_new("net_timestamping_ns", true); + if (!ASSERT_OK_PTR(ns, "create ns")) + goto out; + + skel = net_timestamping__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skel")) + goto out; + + if (!ASSERT_OK(net_timestamping__attach(skel), "attach skel")) + goto out; + + skel->links.skops_sockopt = + bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd); + if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup")) + goto out; + + bss = skel->bss; + memset(bss, 0, sizeof(*bss)); + + skel->bss->monitored_pid = getpid(); + + sfd = start_server(family, SOCK_STREAM, + family == AF_INET6 ? addr6_str : addr4_str, 0, 0); + if (!ASSERT_OK_FD(sfd, "start_server")) + goto out; + + cfd = connect_to_fd(sfd, 0); + if (!ASSERT_OK_FD(cfd, "connect_to_fd_server")) + goto out; + + if (enable_socket_timestamping) { + sock_opt = SOF_TIMESTAMPING_SOFTWARE | + SOF_TIMESTAMPING_OPT_ID | + SOF_TIMESTAMPING_TX_SCHED | + SOF_TIMESTAMPING_TX_SOFTWARE | + SOF_TIMESTAMPING_TX_ACK; + ret = setsockopt(cfd, SOL_SOCKET, SO_TIMESTAMPING, + (char *) &sock_opt, sizeof(sock_opt)); + if (!ASSERT_OK(ret, "setsockopt SO_TIMESTAMPING")) + goto out; + + ret = clock_gettime(CLOCK_REALTIME, &usr_ts); + if (!ASSERT_OK(ret, "get user time")) + goto out; + } + + ret = write(cfd, buf, sizeof(buf)); + if (!ASSERT_EQ(ret, sizeof(buf), "send to server")) + goto out; + + if (enable_socket_timestamping) + test_socket_timestamping(cfd); + + ASSERT_EQ(bss->nr_active, 1, "nr_active"); + ASSERT_EQ(bss->nr_snd, 2, "nr_snd"); + ASSERT_EQ(bss->nr_sched, 1, "nr_sched"); + ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw"); + ASSERT_EQ(bss->nr_ack, 1, "nr_ack"); + +out: + if (sfd >= 0) + close(sfd); + if (cfd >= 0) + close(cfd); + net_timestamping__destroy(skel); + netns_free(ns); + close(cg_fd); +} + +void test_net_timestamping(void) +{ + if (test__start_subtest("INET4: bpf timestamping")) + test_tcp(AF_INET, false); + if (test__start_subtest("INET4: bpf and socket timestamping")) + test_tcp(AF_INET, true); + if (test__start_subtest("INET6: bpf timestamping")) + test_tcp(AF_INET6, false); + if (test__start_subtest("INET6: bpf and socket timestamping")) + test_tcp(AF_INET6, true); +} diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 937da9b7532a..b9d9f0a502ce 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -4,12 +4,20 @@ #include "test_xdp_context_test_run.skel.h" #include "test_xdp_meta.skel.h" -#define TX_ADDR "10.0.0.1" -#define RX_ADDR "10.0.0.2" #define RX_NAME "veth0" #define TX_NAME "veth1" #define TX_NETNS "xdp_context_tx" #define RX_NETNS "xdp_context_rx" +#define TAP_NAME "tap0" +#define TAP_NETNS "xdp_context_tuntap" + +#define TEST_PAYLOAD_LEN 32 +static const __u8 test_payload[TEST_PAYLOAD_LEN] = { + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, +}; void test_xdp_context_error(int prog_fd, struct bpf_test_run_opts opts, __u32 data_meta, __u32 data, __u32 data_end, @@ -112,7 +120,59 @@ void test_xdp_context_test_run(void) test_xdp_context_test_run__destroy(skel); } -void test_xdp_context_functional(void) +static int send_test_packet(int ifindex) +{ + int n, sock = -1; + __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; + + /* The ethernet header is not relevant for this test and doesn't need to + * be meaningful. + */ + struct ethhdr eth = { 0 }; + + memcpy(packet, ð, sizeof(eth)); + memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN); + + sock = socket(AF_PACKET, SOCK_RAW, IPPROTO_RAW); + if (!ASSERT_GE(sock, 0, "socket")) + goto err; + + struct sockaddr_ll saddr = { + .sll_family = PF_PACKET, + .sll_ifindex = ifindex, + .sll_halen = ETH_ALEN + }; + n = sendto(sock, packet, sizeof(packet), 0, (struct sockaddr *)&saddr, + sizeof(saddr)); + if (!ASSERT_EQ(n, sizeof(packet), "sendto")) + goto err; + + close(sock); + return 0; + +err: + if (sock >= 0) + close(sock); + return -1; +} + +static void assert_test_result(struct test_xdp_meta *skel) +{ + int err; + __u32 map_key = 0; + __u8 map_value[TEST_PAYLOAD_LEN]; + + err = bpf_map__lookup_elem(skel->maps.test_result, &map_key, + sizeof(map_key), &map_value, + TEST_PAYLOAD_LEN, BPF_ANY); + if (!ASSERT_OK(err, "lookup test_result")) + return; + + ASSERT_MEMEQ(&map_value, &test_payload, TEST_PAYLOAD_LEN, + "test_result map contains test payload"); +} + +void test_xdp_context_veth(void) { LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); @@ -120,7 +180,7 @@ void test_xdp_context_functional(void) struct bpf_program *tc_prog, *xdp_prog; struct test_xdp_meta *skel = NULL; struct nstoken *nstoken = NULL; - int rx_ifindex; + int rx_ifindex, tx_ifindex; int ret; tx_ns = netns_new(TX_NETNS, false); @@ -138,7 +198,6 @@ void test_xdp_context_functional(void) if (!ASSERT_OK_PTR(nstoken, "setns rx_ns")) goto close; - SYS(close, "ip addr add " RX_ADDR "/24 dev " RX_NAME); SYS(close, "ip link set dev " RX_NAME " up"); skel = test_xdp_meta__open_and_load(); @@ -179,9 +238,17 @@ void test_xdp_context_functional(void) if (!ASSERT_OK_PTR(nstoken, "setns tx_ns")) goto close; - SYS(close, "ip addr add " TX_ADDR "/24 dev " TX_NAME); SYS(close, "ip link set dev " TX_NAME " up"); - ASSERT_OK(SYS_NOFAIL("ping -c 1 " RX_ADDR), "ping"); + + tx_ifindex = if_nametoindex(TX_NAME); + if (!ASSERT_GE(tx_ifindex, 0, "if_nametoindex tx")) + goto close; + + ret = send_test_packet(tx_ifindex); + if (!ASSERT_OK(ret, "send_test_packet")) + goto close; + + assert_test_result(skel); close: close_netns(nstoken); @@ -190,3 +257,67 @@ close: netns_free(tx_ns); } +void test_xdp_context_tuntap(void) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_INGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct netns_obj *ns = NULL; + struct test_xdp_meta *skel = NULL; + __u8 packet[sizeof(struct ethhdr) + TEST_PAYLOAD_LEN]; + int tap_fd = -1; + int tap_ifindex; + int ret; + + ns = netns_new(TAP_NETNS, true); + if (!ASSERT_OK_PTR(ns, "create and open ns")) + return; + + tap_fd = open_tuntap(TAP_NAME, true); + if (!ASSERT_GE(tap_fd, 0, "open_tuntap")) + goto close; + + SYS(close, "ip link set dev " TAP_NAME " up"); + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + goto close; + + tap_ifindex = if_nametoindex(TAP_NAME); + if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) + goto close; + + tc_hook.ifindex = tap_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(skel->progs.ing_cls); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(skel->progs.ing_xdp), + 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + /* The ethernet header is not relevant for this test and doesn't need to + * be meaningful. + */ + struct ethhdr eth = { 0 }; + + memcpy(packet, ð, sizeof(eth)); + memcpy(packet + sizeof(eth), test_payload, TEST_PAYLOAD_LEN); + + ret = write(tap_fd, packet, sizeof(packet)); + if (!ASSERT_EQ(ret, sizeof(packet), "write packet")) + goto close; + + assert_test_result(skel); + +close: + if (tap_fd >= 0) + close(tap_fd); + test_xdp_meta__destroy(skel); + netns_free(ns); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c index d22449c69363..164640db3a29 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c @@ -99,10 +99,10 @@ static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp, icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c index 8b072666f9d9..591c703f5032 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c @@ -99,10 +99,10 @@ static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp, icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk->icsk_retransmit_timer.expires; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 59843b430f76..eb6ed1b7b2ef 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -49,6 +49,7 @@ #define TCP_SAVED_SYN 28 #define TCP_CA_NAME_MAX 16 #define TCP_NAGLE_OFF 1 +#define TCP_RTO_MAX_MS 44 #define TCP_ECN_OK 1 #define TCP_ECN_QUEUE_CWR 2 diff --git a/tools/testing/selftests/bpf/progs/net_timestamping.c b/tools/testing/selftests/bpf/progs/net_timestamping.c new file mode 100644 index 000000000000..b4c2f0f2be11 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/net_timestamping.c @@ -0,0 +1,248 @@ +#include "vmlinux.h" +#include "bpf_tracing_net.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_misc.h" +#include "bpf_kfuncs.h" +#include <errno.h> + +__u32 monitored_pid = 0; + +int nr_active; +int nr_snd; +int nr_passive; +int nr_sched; +int nr_txsw; +int nr_ack; + +struct sk_stg { + __u64 sendmsg_ns; /* record ts when sendmsg is called */ +}; + +struct sk_tskey { + u64 cookie; + u32 tskey; +}; + +struct delay_info { + u64 sendmsg_ns; /* record ts when sendmsg is called */ + u32 sched_delay; /* SCHED_CB - sendmsg_ns */ + u32 snd_sw_delay; /* SND_SW_CB - SCHED_CB */ + u32 ack_delay; /* ACK_CB - SND_SW_CB */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct sk_stg); +} sk_stg_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct sk_tskey); + __type(value, struct delay_info); + __uint(max_entries, 1024); +} time_map SEC(".maps"); + +static u64 delay_tolerance_nsec = 10000000000; /* 10 second as an example */ + +extern int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, u64 flags) __ksym; + +static int bpf_test_sockopt(void *ctx, const struct sock *sk, int expected) +{ + int tmp, new = SK_BPF_CB_TX_TIMESTAMPING; + int opt = SK_BPF_CB_FLAGS; + int level = SOL_SOCKET; + + if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)) != expected) + return 1; + + if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) != expected || + (!expected && tmp != new)) + return 1; + + return 0; +} + +static bool bpf_test_access_sockopt(void *ctx, const struct sock *sk) +{ + if (bpf_test_sockopt(ctx, sk, -EOPNOTSUPP)) + return true; + return false; +} + +static bool bpf_test_access_load_hdr_opt(struct bpf_sock_ops *skops) +{ + u8 opt[3] = {0}; + int load_flags = 0; + int ret; + + ret = bpf_load_hdr_opt(skops, opt, sizeof(opt), load_flags); + if (ret != -EOPNOTSUPP) + return true; + + return false; +} + +static bool bpf_test_access_cb_flags_set(struct bpf_sock_ops *skops) +{ + int ret; + + ret = bpf_sock_ops_cb_flags_set(skops, 0); + if (ret != -EOPNOTSUPP) + return true; + + return false; +} + +/* In the timestamping callbacks, we're not allowed to call the following + * BPF CALLs for the safety concern. Return false if expected. + */ +static bool bpf_test_access_bpf_calls(struct bpf_sock_ops *skops, + const struct sock *sk) +{ + if (bpf_test_access_sockopt(skops, sk)) + return true; + + if (bpf_test_access_load_hdr_opt(skops)) + return true; + + if (bpf_test_access_cb_flags_set(skops)) + return true; + + return false; +} + +static bool bpf_test_delay(struct bpf_sock_ops *skops, const struct sock *sk) +{ + struct bpf_sock_ops_kern *skops_kern; + u64 timestamp = bpf_ktime_get_ns(); + struct skb_shared_info *shinfo; + struct delay_info dinfo = {0}; + struct sk_tskey key = {0}; + struct delay_info *val; + struct sk_buff *skb; + struct sk_stg *stg; + u64 prior_ts, delay; + + if (bpf_test_access_bpf_calls(skops, sk)) + return false; + + skops_kern = bpf_cast_to_kern_ctx(skops); + skb = skops_kern->skb; + shinfo = bpf_core_cast(skb->head + skb->end, struct skb_shared_info); + + key.cookie = bpf_get_socket_cookie(skops); + if (!key.cookie) + return false; + + if (skops->op == BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) { + stg = bpf_sk_storage_get(&sk_stg_map, (void *)sk, 0, 0); + if (!stg) + return false; + dinfo.sendmsg_ns = stg->sendmsg_ns; + bpf_sock_ops_enable_tx_tstamp(skops_kern, 0); + key.tskey = shinfo->tskey; + if (!key.tskey) + return false; + bpf_map_update_elem(&time_map, &key, &dinfo, BPF_ANY); + return true; + } + + key.tskey = shinfo->tskey; + if (!key.tskey) + return false; + + val = bpf_map_lookup_elem(&time_map, &key); + if (!val) + return false; + + switch (skops->op) { + case BPF_SOCK_OPS_TSTAMP_SCHED_CB: + val->sched_delay = timestamp - val->sendmsg_ns; + delay = val->sched_delay; + break; + case BPF_SOCK_OPS_TSTAMP_SND_SW_CB: + prior_ts = val->sched_delay + val->sendmsg_ns; + val->snd_sw_delay = timestamp - prior_ts; + delay = val->snd_sw_delay; + break; + case BPF_SOCK_OPS_TSTAMP_ACK_CB: + prior_ts = val->snd_sw_delay + val->sched_delay + val->sendmsg_ns; + val->ack_delay = timestamp - prior_ts; + delay = val->ack_delay; + break; + } + + if (delay >= delay_tolerance_nsec) + return false; + + /* Since it's the last one, remove from the map after latency check */ + if (skops->op == BPF_SOCK_OPS_TSTAMP_ACK_CB) + bpf_map_delete_elem(&time_map, &key); + + return true; +} + +SEC("fentry/tcp_sendmsg_locked") +int BPF_PROG(trace_tcp_sendmsg_locked, struct sock *sk, struct msghdr *msg, + size_t size) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 timestamp = bpf_ktime_get_ns(); + u32 flag = sk->sk_bpf_cb_flags; + struct sk_stg *stg; + + if (pid != monitored_pid || !flag) + return 0; + + stg = bpf_sk_storage_get(&sk_stg_map, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!stg) + return 0; + + stg->sendmsg_ns = timestamp; + nr_snd += 1; + return 0; +} + +SEC("sockops") +int skops_sockopt(struct bpf_sock_ops *skops) +{ + struct bpf_sock *bpf_sk = skops->sk; + const struct sock *sk; + + if (!bpf_sk) + return 1; + + sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk); + if (!sk) + return 1; + + switch (skops->op) { + case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: + nr_active += !bpf_test_sockopt(skops, sk, 0); + break; + case BPF_SOCK_OPS_TSTAMP_SENDMSG_CB: + if (bpf_test_delay(skops, sk)) + nr_snd += 1; + break; + case BPF_SOCK_OPS_TSTAMP_SCHED_CB: + if (bpf_test_delay(skops, sk)) + nr_sched += 1; + break; + case BPF_SOCK_OPS_TSTAMP_SND_SW_CB: + if (bpf_test_delay(skops, sk)) + nr_txsw += 1; + break; + case BPF_SOCK_OPS_TSTAMP_ACK_CB: + if (bpf_test_delay(skops, sk)) + nr_ack += 1; + break; + } + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/setget_sockopt.c b/tools/testing/selftests/bpf/progs/setget_sockopt.c index 6dd4318debbf..0107a24b7522 100644 --- a/tools/testing/selftests/bpf/progs/setget_sockopt.c +++ b/tools/testing/selftests/bpf/progs/setget_sockopt.c @@ -61,6 +61,9 @@ static const struct sockopt_test sol_tcp_tests[] = { { .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, }, { .opt = TCP_BPF_SOCK_OPS_CB_FLAGS, .new = BPF_SOCK_OPS_ALL_CB_FLAGS, .expected = BPF_SOCK_OPS_ALL_CB_FLAGS, }, + { .opt = TCP_BPF_DELACK_MAX, .new = 30000, .expected = 30000, }, + { .opt = TCP_BPF_RTO_MIN, .new = 30000, .expected = 30000, }, + { .opt = TCP_RTO_MAX_MS, .new = 2000, .expected = 2000, }, { .opt = 0, }, }; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index fe2d71ae0e71..fcf6ca14f2ea 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -4,37 +4,50 @@ #include <bpf/bpf_helpers.h> -#define __round_mask(x, y) ((__typeof__(x))((y) - 1)) -#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1) +#define META_SIZE 32 + #define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem +/* Demonstrates how metadata can be passed from an XDP program to a TC program + * using bpf_xdp_adjust_meta. + * For the sake of testing the metadata support in drivers, the XDP program uses + * a fixed-size payload after the Ethernet header as metadata. The TC program + * copies the metadata it receives into a map so it can be checked from + * userspace. + */ + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __uint(value_size, META_SIZE); +} test_result SEC(".maps"); + SEC("tc") int ing_cls(struct __sk_buff *ctx) { - __u8 *data, *data_meta, *data_end; - __u32 diff = 0; + __u8 *data, *data_meta; + __u32 key = 0; data_meta = ctx_ptr(ctx, data_meta); - data_end = ctx_ptr(ctx, data_end); data = ctx_ptr(ctx, data); - if (data + ETH_ALEN > data_end || - data_meta + round_up(ETH_ALEN, 4) > data) + if (data_meta + META_SIZE > data) return TC_ACT_SHOT; - diff |= ((__u32 *)data_meta)[0] ^ ((__u32 *)data)[0]; - diff |= ((__u16 *)data_meta)[2] ^ ((__u16 *)data)[2]; + bpf_map_update_elem(&test_result, &key, data_meta, BPF_ANY); - return diff ? TC_ACT_SHOT : TC_ACT_OK; + return TC_ACT_SHOT; } SEC("xdp") int ing_xdp(struct xdp_md *ctx) { - __u8 *data, *data_meta, *data_end; + __u8 *data, *data_meta, *data_end, *payload; + struct ethhdr *eth; int ret; - ret = bpf_xdp_adjust_meta(ctx, -round_up(ETH_ALEN, 4)); + ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); if (ret < 0) return XDP_DROP; @@ -42,11 +55,21 @@ int ing_xdp(struct xdp_md *ctx) data_end = ctx_ptr(ctx, data_end); data = ctx_ptr(ctx, data); - if (data + ETH_ALEN > data_end || - data_meta + round_up(ETH_ALEN, 4) > data) + eth = (struct ethhdr *)data; + payload = data + sizeof(struct ethhdr); + + if (payload + META_SIZE > data_end || + data_meta + META_SIZE > data) + return XDP_DROP; + + /* The Linux networking stack may send other packets on the test + * interface that interfere with the test. Just drop them. + * The test packets can be recognized by their ethertype of zero. + */ + if (eth->h_proto != 0) return XDP_DROP; - __builtin_memcpy(data_meta, data, ETH_ALEN); + __builtin_memcpy(data_meta, payload, META_SIZE); return XDP_PASS; } diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c b/tools/testing/selftests/bpf/xdp_hw_metadata.c index 6f7b15d6c6ed..3d8de0d4c96a 100644 --- a/tools/testing/selftests/bpf/xdp_hw_metadata.c +++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c @@ -13,6 +13,7 @@ * - UDP 9091 packets trigger TX reply * - TX HW timestamp is requested and reported back upon completion * - TX checksum is requested + * - TX launch time HW offload is requested for transmission */ #include <test_progs.h> @@ -37,6 +38,15 @@ #include <time.h> #include <unistd.h> #include <libgen.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> +#include <linux/pkt_sched.h> +#include <linux/pkt_cls.h> +#include <linux/ethtool.h> +#include <sys/socket.h> +#include <arpa/inet.h> #include "xdp_metadata.h" @@ -64,6 +74,18 @@ int rxq; bool skip_tx; __u64 last_hw_rx_timestamp; __u64 last_xdp_rx_timestamp; +__u64 last_launch_time; +__u64 launch_time_delta_to_hw_rx_timestamp; +int launch_time_queue; + +#define run_command(cmd, ...) \ +({ \ + char command[1024]; \ + memset(command, 0, sizeof(command)); \ + snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \ + fprintf(stderr, "Running: %s\n", command); \ + system(command); \ +}) void test__fail(void) { /* for network_helpers.c */ } @@ -298,6 +320,12 @@ static bool complete_tx(struct xsk *xsk, clockid_t clock_id) if (meta->completion.tx_timestamp) { __u64 ref_tstamp = gettime(clock_id); + if (launch_time_delta_to_hw_rx_timestamp) { + print_tstamp_delta("HW Launch-time", + "HW TX-complete-time", + last_launch_time, + meta->completion.tx_timestamp); + } print_tstamp_delta("HW TX-complete-time", "User TX-complete-time", meta->completion.tx_timestamp, ref_tstamp); print_tstamp_delta("XDP RX-time", "User TX-complete-time", @@ -395,6 +423,17 @@ static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id) xsk, ntohs(udph->check), ntohs(want_csum), meta->request.csum_start, meta->request.csum_offset); + /* Set the value of launch time */ + if (launch_time_delta_to_hw_rx_timestamp) { + meta->flags |= XDP_TXMD_FLAGS_LAUNCH_TIME; + meta->request.launch_time = last_hw_rx_timestamp + + launch_time_delta_to_hw_rx_timestamp; + last_launch_time = meta->request.launch_time; + print_tstamp_delta("HW RX-time", "HW Launch-time", + last_hw_rx_timestamp, + meta->request.launch_time); + } + memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */ tx_desc->options |= XDP_TX_METADATA; tx_desc->len = len; @@ -407,6 +446,7 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t const struct xdp_desc *rx_desc; struct pollfd fds[rxq + 1]; __u64 comp_addr; + __u64 deadline; __u64 addr; __u32 idx = 0; int ret; @@ -477,9 +517,15 @@ peek: if (ret) printf("kick_tx ret=%d\n", ret); - for (int j = 0; j < 500; j++) { + /* wait 1 second + cover launch time */ + deadline = gettime(clock_id) + + NANOSEC_PER_SEC + + launch_time_delta_to_hw_rx_timestamp; + while (true) { if (complete_tx(xsk, clock_id)) break; + if (gettime(clock_id) >= deadline) + break; usleep(10); } } @@ -608,6 +654,10 @@ static void print_usage(void) " -h Display this help and exit\n\n" " -m Enable multi-buffer XDP for larger MTU\n" " -r Don't generate AF_XDP reply (rx metadata only)\n" + " -l Delta of launch time relative to HW RX-time in ns\n" + " default: 0 ns (launch time request is disabled)\n" + " -L Tx Queue to be enabled with launch time offload\n" + " default: 0 (Tx Queue 0)\n" "Generate test packets on the other machine with:\n" " echo -n xdp | nc -u -q1 <dst_ip> 9091\n"; @@ -618,7 +668,7 @@ static void read_args(int argc, char *argv[]) { int opt; - while ((opt = getopt(argc, argv, "chmr")) != -1) { + while ((opt = getopt(argc, argv, "chmrl:L:")) != -1) { switch (opt) { case 'c': bind_flags &= ~XDP_USE_NEED_WAKEUP; @@ -634,6 +684,12 @@ static void read_args(int argc, char *argv[]) case 'r': skip_tx = true; break; + case 'l': + launch_time_delta_to_hw_rx_timestamp = atoll(optarg); + break; + case 'L': + launch_time_queue = atoll(optarg); + break; case '?': if (isprint(optopt)) fprintf(stderr, "Unknown option: -%c\n", optopt); @@ -657,23 +713,118 @@ static void read_args(int argc, char *argv[]) error(-1, errno, "Invalid interface name"); } +void clean_existing_configurations(void) +{ + /* Check and delete root qdisc if exists */ + if (run_command("sudo tc qdisc show dev %s | grep -q 'qdisc mqprio 8001:'", ifname) == 0) + run_command("sudo tc qdisc del dev %s root", ifname); + + /* Check and delete ingress qdisc if exists */ + if (run_command("sudo tc qdisc show dev %s | grep -q 'qdisc ingress ffff:'", ifname) == 0) + run_command("sudo tc qdisc del dev %s ingress", ifname); + + /* Check and delete ethtool filters if any exist */ + if (run_command("sudo ethtool -n %s | grep -q 'Filter:'", ifname) == 0) { + run_command("sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 sudo ethtool -N %s delete >&2", + ifname, ifname); + } +} + +#define MAX_TC 16 + int main(int argc, char *argv[]) { clockid_t clock_id = CLOCK_TAI; + struct bpf_program *prog; int server_fd = -1; + size_t map_len = 0; + size_t que_len = 0; + char *buf = NULL; + char *map = NULL; + char *que = NULL; + char *tmp = NULL; + int tc = 0; int ret; int i; - struct bpf_program *prog; - read_args(argc, argv); rxq = rxq_num(ifname); - printf("rxq: %d\n", rxq); + if (launch_time_queue >= rxq || launch_time_queue < 0) + error(1, 0, "Invalid launch_time_queue."); + + clean_existing_configurations(); + sleep(1); + + /* Enable tx and rx hardware timestamping */ hwtstamp_enable(ifname); + /* Prepare priority to traffic class map for tc-mqprio */ + for (i = 0; i < MAX_TC; i++) { + if (i < rxq) + tc = i; + + if (asprintf(&buf, "%d ", tc) == -1) { + printf("Failed to malloc buf for tc map.\n"); + goto free_mem; + } + + map_len += strlen(buf); + tmp = realloc(map, map_len + 1); + if (!tmp) { + printf("Failed to realloc tc map.\n"); + goto free_mem; + } + map = tmp; + strcat(map, buf); + free(buf); + buf = NULL; + } + + /* Prepare traffic class to hardware queue map for tc-mqprio */ + for (i = 0; i <= tc; i++) { + if (asprintf(&buf, "1@%d ", i) == -1) { + printf("Failed to malloc buf for tc queues.\n"); + goto free_mem; + } + + que_len += strlen(buf); + tmp = realloc(que, que_len + 1); + if (!tmp) { + printf("Failed to realloc tc queues.\n"); + goto free_mem; + } + que = tmp; + strcat(que, buf); + free(buf); + buf = NULL; + } + + /* Add mqprio qdisc */ + run_command("sudo tc qdisc add dev %s handle 8001: parent root mqprio num_tc %d map %squeues %shw 0", + ifname, tc + 1, map, que); + + /* To test launch time, send UDP packet with VLAN priority 1 to port 9091 */ + if (launch_time_delta_to_hw_rx_timestamp) { + /* Enable launch time hardware offload on launch_time_queue */ + run_command("sudo tc qdisc replace dev %s parent 8001:%d etf offload clockid CLOCK_TAI delta 500000", + ifname, launch_time_queue + 1); + sleep(1); + + /* Route incoming packet with VLAN priority 1 into launch_time_queue */ + if (run_command("sudo ethtool -N %s flow-type ether vlan 0x2000 vlan-mask 0x1FFF action %d", + ifname, launch_time_queue)) { + run_command("sudo tc qdisc add dev %s ingress", ifname); + run_command("sudo tc filter add dev %s parent ffff: protocol 802.1Q flower vlan_prio 1 hw_tc %d", + ifname, launch_time_queue); + } + + /* Enable VLAN tag stripping offload */ + run_command("sudo ethtool -K %s rxvlan on", ifname); + } + rx_xsk = malloc(sizeof(struct xsk) * rxq); if (!rx_xsk) error(1, ENOMEM, "malloc"); @@ -733,4 +884,11 @@ int main(int argc, char *argv[]) cleanup(); if (ret) error(1, -ret, "verify_metadata"); + + clean_existing_configurations(); + +free_mem: + free(buf); + free(map); + free(que); } diff --git a/tools/testing/selftests/drivers/net/.gitignore b/tools/testing/selftests/drivers/net/.gitignore new file mode 100644 index 000000000000..ec746f374e85 --- /dev/null +++ b/tools/testing/selftests/drivers/net/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +xdp_helper diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 137470bdee0c..0c95bd944d56 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -1,13 +1,18 @@ # SPDX-License-Identifier: GPL-2.0 +CFLAGS += $(KHDR_INCLUDES) TEST_INCLUDES := $(wildcard lib/py/*.py) \ $(wildcard lib/sh/*.sh) \ ../../net/net_helper.sh \ ../../net/lib.sh \ +TEST_GEN_FILES := xdp_helper + TEST_PROGS := \ netcons_basic.sh \ + netcons_fragmented_msg.sh \ netcons_overflow.sh \ + netcons_sysdata.sh \ ping.py \ queues.py \ stats.py \ diff --git a/tools/testing/selftests/drivers/net/README.rst b/tools/testing/selftests/drivers/net/README.rst index 3b6a29e6564b..eb838ae94844 100644 --- a/tools/testing/selftests/drivers/net/README.rst +++ b/tools/testing/selftests/drivers/net/README.rst @@ -107,7 +107,7 @@ On the target machine, running the tests will use netdevsim by default:: 1..1 # timeout set to 45 # selftests: drivers/net: ping.py - # KTAP version 1 + # TAP version 13 # 1..3 # ok 1 ping.test_v4 # ok 2 ping.test_v6 @@ -128,7 +128,7 @@ Create a config with remote info:: Run the test:: [/root] # ./ksft-net-drv/drivers/net/ping.py - KTAP version 1 + TAP version 13 1..3 ok 1 ping.test_v4 ok 2 ping.test_v6 # SKIP Test requires IPv6 connectivity diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config index a2d8af60876d..f27172ddee0a 100644 --- a/tools/testing/selftests/drivers/net/config +++ b/tools/testing/selftests/drivers/net/config @@ -4,3 +4,4 @@ CONFIG_CONFIGFS_FS=y CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_NETCONSOLE_EXTENDED_LOG=y +CONFIG_XDP_SOCKETS=y diff --git a/tools/testing/selftests/drivers/net/hds.py b/tools/testing/selftests/drivers/net/hds.py index 873f5219e41d..7cc74faed743 100755 --- a/tools/testing/selftests/drivers/net/hds.py +++ b/tools/testing/selftests/drivers/net/hds.py @@ -20,8 +20,7 @@ def _get_hds_mode(cfg, netnl) -> str: def _xdp_onoff(cfg): - test_dir = os.path.dirname(os.path.realpath(__file__)) - prog = test_dir + "/../../net/lib/xdp_dummy.bpf.o" + prog = cfg.rpath("../../net/lib/xdp_dummy.bpf.o") ip("link set dev %s xdp obj %s sec xdp" % (cfg.ifname, prog)) ip("link set dev %s xdp off" % cfg.ifname) diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 21ba64ce1e34..cde5814ff9a7 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -10,11 +10,14 @@ TEST_PROGS = \ ethtool_rmon.sh \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ + irq.py \ loopback.sh \ nic_link_layer.py \ nic_performance.py \ pp_alloc_fail.py \ rss_ctx.py \ + rss_input_xfrm.py \ + tso.py \ # TEST_FILES := \ @@ -32,9 +35,12 @@ TEST_INCLUDES := \ # YNL files, must be before "include ..lib.mk" YNL_GEN_FILES := ncdevmem TEST_GEN_FILES += $(YNL_GEN_FILES) +TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) include ../../../lib.mk # YNL build YNL_GENS := ethtool netdev include ../../../net/ynl.mk + +include ../../../net/bpf.mk diff --git a/tools/testing/selftests/drivers/net/hw/csum.py b/tools/testing/selftests/drivers/net/hw/csum.py index cb40497faee4..701aca1361e0 100755 --- a/tools/testing/selftests/drivers/net/hw/csum.py +++ b/tools/testing/selftests/drivers/net/hw/csum.py @@ -9,15 +9,12 @@ from lib.py import ksft_run, ksft_exit, KsftSkipEx from lib.py import EthtoolFamily, NetDrvEpEnv from lib.py import bkg, cmd, wait_port_listen -def test_receive(cfg, ipv4=False, extra_args=None): +def test_receive(cfg, ipver="6", extra_args=None): """Test local nic checksum receive. Remote host sends crafted packets.""" if not cfg.have_rx_csum: raise KsftSkipEx(f"Test requires rx checksum offload on {cfg.ifname}") - if ipv4: - ip_args = f"-4 -S {cfg.remote_v4} -D {cfg.v4}" - else: - ip_args = f"-6 -S {cfg.remote_v6} -D {cfg.v6}" + ip_args = f"-{ipver} -S {cfg.remote_addr_v[ipver]} -D {cfg.addr_v[ipver]}" rx_cmd = f"{cfg.bin_local} -i {cfg.ifname} -n 100 {ip_args} -r 1 -R {extra_args}" tx_cmd = f"{cfg.bin_remote} -i {cfg.ifname} -n 100 {ip_args} -r 1 -T {extra_args}" @@ -27,17 +24,14 @@ def test_receive(cfg, ipv4=False, extra_args=None): cmd(tx_cmd, host=cfg.remote) -def test_transmit(cfg, ipv4=False, extra_args=None): +def test_transmit(cfg, ipver="6", extra_args=None): """Test local nic checksum transmit. Remote host verifies packets.""" if (not cfg.have_tx_csum_generic and - not (cfg.have_tx_csum_ipv4 and ipv4) and - not (cfg.have_tx_csum_ipv6 and not ipv4)): + not (cfg.have_tx_csum_ipv4 and ipver == "4") and + not (cfg.have_tx_csum_ipv6 and ipver == "6")): raise KsftSkipEx(f"Test requires tx checksum offload on {cfg.ifname}") - if ipv4: - ip_args = f"-4 -S {cfg.v4} -D {cfg.remote_v4}" - else: - ip_args = f"-6 -S {cfg.v6} -D {cfg.remote_v6}" + ip_args = f"-{ipver} -S {cfg.addr_v[ipver]} -D {cfg.remote_addr_v[ipver]}" # Cannot randomize input when calculating zero checksum if extra_args != "-U -Z": @@ -51,26 +45,20 @@ def test_transmit(cfg, ipv4=False, extra_args=None): cmd(tx_cmd) -def test_builder(name, cfg, ipv4=False, tx=False, extra_args=""): +def test_builder(name, cfg, ipver="6", tx=False, extra_args=""): """Construct specific tests from the common template. Most tests follow the same basic pattern, differing only in Direction of the test and optional flags passed to csum.""" def f(cfg): - if ipv4: - cfg.require_v4() - else: - cfg.require_v6() + cfg.require_ipver(ipver) if tx: - test_transmit(cfg, ipv4, extra_args) + test_transmit(cfg, ipver, extra_args) else: - test_receive(cfg, ipv4, extra_args) + test_receive(cfg, ipver, extra_args) - if ipv4: - f.__name__ = "ipv4_" + name - else: - f.__name__ = "ipv6_" + name + f.__name__ = f"ipv{ipver}_" + name return f @@ -100,19 +88,19 @@ def main() -> None: with NetDrvEpEnv(__file__, nsim_test=False) as cfg: check_nic_features(cfg) - cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../net/lib/csum") + cfg.bin_local = cfg.rpath("../../../net/lib/csum") cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) cases = [] - for ipv4 in [True, False]: - cases.append(test_builder("rx_tcp", cfg, ipv4, False, "-t")) - cases.append(test_builder("rx_tcp_invalid", cfg, ipv4, False, "-t -E")) + for ipver in ["4", "6"]: + cases.append(test_builder("rx_tcp", cfg, ipver, False, "-t")) + cases.append(test_builder("rx_tcp_invalid", cfg, ipver, False, "-t -E")) - cases.append(test_builder("rx_udp", cfg, ipv4, False, "")) - cases.append(test_builder("rx_udp_invalid", cfg, ipv4, False, "-E")) + cases.append(test_builder("rx_udp", cfg, ipver, False, "")) + cases.append(test_builder("rx_udp_invalid", cfg, ipver, False, "-E")) - cases.append(test_builder("tx_udp_csum_offload", cfg, ipv4, True, "-U")) - cases.append(test_builder("tx_udp_zero_checksum", cfg, ipv4, True, "-U -Z")) + cases.append(test_builder("tx_udp_csum_offload", cfg, ipver, True, "-U")) + cases.append(test_builder("tx_udp_zero_checksum", cfg, ipver, True, "-U -Z")) ksft_run(cases=cases, args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index 1223f0f5c10c..3947e9157115 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -21,15 +21,15 @@ def require_devmem(cfg): @ksft_disruptive def check_rx(cfg) -> None: - cfg.require_v6() + cfg.require_ipver("6") require_devmem(cfg) port = rand_port() - listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.v6} -p {port}" + listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}" with bkg(listen_cmd) as socat: wait_port_listen(port) - cmd(f"echo -e \"hello\\nworld\"| socat -u - TCP6:[{cfg.v6}]:{port}", host=cfg.remote, shell=True) + cmd(f"echo -e \"hello\\nworld\"| socat -u - TCP6:[{cfg.addr_v['6']}]:{port}", host=cfg.remote, shell=True) ksft_eq(socat.stdout.strip(), "hello\nworld") diff --git a/tools/testing/selftests/drivers/net/hw/irq.py b/tools/testing/selftests/drivers/net/hw/irq.py new file mode 100755 index 000000000000..42ab98370245 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/irq.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_ge, ksft_eq +from lib.py import KsftSkipEx +from lib.py import ksft_disruptive +from lib.py import EthtoolFamily, NetdevFamily +from lib.py import NetDrvEnv +from lib.py import cmd, ip, defer + + +def read_affinity(irq) -> str: + with open(f'/proc/irq/{irq}/smp_affinity', 'r') as fp: + return fp.read().lstrip("0,").strip() + + +def write_affinity(irq, what) -> str: + if what != read_affinity(irq): + with open(f'/proc/irq/{irq}/smp_affinity', 'w') as fp: + fp.write(what) + + +def check_irqs_reported(cfg) -> None: + """ Check that device reports IRQs for NAPI instances """ + napis = cfg.netnl.napi_get({"ifindex": cfg.ifindex}, dump=True) + irqs = sum(['irq' in x for x in napis]) + + ksft_ge(irqs, 1) + ksft_eq(irqs, len(napis)) + + +def _check_reconfig(cfg, reconfig_cb) -> None: + napis = cfg.netnl.napi_get({"ifindex": cfg.ifindex}, dump=True) + for n in reversed(napis): + if 'irq' in n: + break + else: + raise KsftSkipEx(f"Device has no NAPI with IRQ attribute (#napis: {len(napis)}") + + old = read_affinity(n['irq']) + # pick an affinity that's not the current one + new = "3" if old != "3" else "5" + write_affinity(n['irq'], new) + defer(write_affinity, n['irq'], old) + + reconfig_cb(cfg) + + ksft_eq(read_affinity(n['irq']), new, comment="IRQ affinity changed after reconfig") + + +def check_reconfig_queues(cfg) -> None: + def reconfig(cfg) -> None: + channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}}) + if channels['combined-count'] == 0: + rx_type = 'rx' + else: + rx_type = 'combined' + cur_queue_cnt = channels[f'{rx_type}-count'] + max_queue_cnt = channels[f'{rx_type}-max'] + + cmd(f"ethtool -L {cfg.ifname} {rx_type} 1") + cmd(f"ethtool -L {cfg.ifname} {rx_type} {max_queue_cnt}") + cmd(f"ethtool -L {cfg.ifname} {rx_type} {cur_queue_cnt}") + + _check_reconfig(cfg, reconfig) + + +def check_reconfig_xdp(cfg) -> None: + def reconfig(cfg) -> None: + ip(f"link set dev %s xdp obj %s sec xdp" % + (cfg.ifname, cfg.rpath("xdp_dummy.bpf.o"))) + ip(f"link set dev %s xdp off" % cfg.ifname) + + _check_reconfig(cfg, reconfig) + + +@ksft_disruptive +def check_down(cfg) -> None: + def reconfig(cfg) -> None: + ip("link set dev %s down" % cfg.ifname) + ip("link set dev %s up" % cfg.ifname) + + _check_reconfig(cfg, reconfig) + + +def main() -> None: + with NetDrvEnv(__file__, nsim_test=False) as cfg: + cfg.ethnl = EthtoolFamily() + cfg.netnl = NetdevFamily() + + ksft_run([check_irqs_reported, check_reconfig_queues, + check_reconfig_xdp, check_down], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 19a6969643f4..2bf14ac2b8c6 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -50,7 +50,6 @@ #include <linux/memfd.h> #include <linux/dma-buf.h> #include <linux/udmabuf.h> -#include <libmnl/libmnl.h> #include <linux/types.h> #include <linux/netlink.h> #include <linux/genetlink.h> diff --git a/tools/testing/selftests/drivers/net/hw/rss_ctx.py b/tools/testing/selftests/drivers/net/hw/rss_ctx.py index 319aaa004c40..ca60ae325c22 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_ctx.py +++ b/tools/testing/selftests/drivers/net/hw/rss_ctx.py @@ -4,7 +4,8 @@ import datetime import random import re -from lib.py import ksft_run, ksft_pr, ksft_exit, ksft_eq, ksft_ne, ksft_ge, ksft_lt, ksft_true +from lib.py import ksft_run, ksft_pr, ksft_exit +from lib.py import ksft_eq, ksft_ne, ksft_ge, ksft_in, ksft_lt, ksft_true, ksft_raises from lib.py import NetDrvEpEnv from lib.py import EthtoolFamily, NetdevFamily from lib.py import KsftSkipEx, KsftFailEx @@ -58,6 +59,14 @@ def require_ntuple(cfg): raise KsftSkipEx("Ntuple filters not enabled on the device: " + str(features["ntuple-filters"])) +def require_context_cnt(cfg, need_cnt): + # There's no good API to get the context count, so the tests + # which try to add a lot opportunisitically set the count they + # discovered. Careful with test ordering! + if need_cnt and cfg.context_cnt and cfg.context_cnt < need_cnt: + raise KsftSkipEx(f"Test requires at least {need_cnt} contexts, but device only has {cfg.context_cnt}") + + # Get Rx packet counts for all queues, as a simple list of integers # if @prev is specified the prev counts will be subtracted def _get_rx_cnts(cfg, prev=None): @@ -383,7 +392,7 @@ def test_rss_context_dump(cfg): # Sanity-check the results for data in ctxs: - ksft_ne(set(data['indir']), {0}, "indir table is all zero") + ksft_ne(set(data.get('indir', [1])), {0}, "indir table is all zero") ksft_ne(set(data.get('hkey', [1])), {0}, "key is all zero") # More specific checks @@ -456,6 +465,8 @@ def test_rss_context(cfg, ctx_cnt=1, create_with_cfg=None): raise ksft_pr(f"Failed to create context {i + 1}, trying to test what we got") ctx_cnt = i + if cfg.context_cnt is None: + cfg.context_cnt = ctx_cnt break _rss_key_check(cfg, context=ctx_id) @@ -511,8 +522,7 @@ def test_rss_context_out_of_order(cfg, ctx_cnt=4): """ require_ntuple(cfg) - - requested_ctx_cnt = ctx_cnt + require_context_cnt(cfg, 4) # Try to allocate more queues when necessary qcnt = len(_get_rx_cnts(cfg)) @@ -577,9 +587,6 @@ def test_rss_context_out_of_order(cfg, ctx_cnt=4): remove_ctx(-1) check_traffic() - if requested_ctx_cnt != ctx_cnt: - raise KsftSkipEx(f"Tested only {ctx_cnt} contexts, wanted {requested_ctx_cnt}") - def test_rss_context_overlap(cfg, other_ctx=0): """ @@ -588,6 +595,8 @@ def test_rss_context_overlap(cfg, other_ctx=0): """ require_ntuple(cfg) + if other_ctx: + require_context_cnt(cfg, 2) queue_cnt = len(_get_rx_cnts(cfg)) if queue_cnt < 4: @@ -649,6 +658,29 @@ def test_rss_context_overlap2(cfg): test_rss_context_overlap(cfg, True) +def test_flow_add_context_missing(cfg): + """ + Test that we are not allowed to add a rule pointing to an RSS context + which was never created. + """ + + require_ntuple(cfg) + + # Find a context which doesn't exist + for ctx_id in range(1, 100): + try: + get_rss(cfg, context=ctx_id) + except CmdExitFailure: + break + + with ksft_raises(CmdExitFailure) as cm: + flow = f"flow-type tcp{cfg.addr_ipver} dst-ip {cfg.addr} dst-port 1234 context {ctx_id}" + ntuple_id = ethtool_create(cfg, "-N", flow) + ethtool(f"-N {cfg.ifname} delete {ntuple_id}") + if cm.exception: + ksft_in('Invalid argument', cm.exception.cmd.stderr) + + def test_delete_rss_context_busy(cfg): """ Test that deletion returns -EBUSY when an rss context is being used @@ -717,6 +749,7 @@ def test_rss_ntuple_addition(cfg): def main() -> None: with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + cfg.context_cnt = None cfg.ethnl = EthtoolFamily() cfg.netdevnl = NetdevFamily() @@ -726,6 +759,7 @@ def main() -> None: test_rss_context_dump, test_rss_context_queue_reconfigure, test_rss_context_overlap, test_rss_context_overlap2, test_rss_context_out_of_order, test_rss_context4_create_with_cfg, + test_flow_add_context_missing, test_delete_rss_context_busy, test_rss_ntuple_addition], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py new file mode 100755 index 000000000000..53bb08cc29ec --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import multiprocessing +import socket +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge, cmd, fd_read_timeout +from lib.py import NetDrvEpEnv +from lib.py import EthtoolFamily, NetdevFamily +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import rand_port + + +def traffic(cfg, local_port, remote_port, ipver): + af_inet = socket.AF_INET if ipver == "4" else socket.AF_INET6 + sock = socket.socket(af_inet, socket.SOCK_DGRAM) + sock.bind(("", local_port)) + sock.connect((cfg.remote_addr_v[ipver], remote_port)) + tgt = f"{ipver}:[{cfg.addr_v[ipver]}]:{local_port},sourceport={remote_port}" + cmd("echo a | socat - UDP" + tgt, host=cfg.remote) + fd_read_timeout(sock.fileno(), 5) + return sock.getsockopt(socket.SOL_SOCKET, socket.SO_INCOMING_CPU) + + +def test_rss_input_xfrm(cfg, ipver): + """ + Test symmetric input_xfrm. + If symmetric RSS hash is configured, send traffic twice, swapping the + src/dst UDP ports, and verify that the same queue is receiving the traffic + in both cases (IPs are constant). + """ + + if multiprocessing.cpu_count() < 2: + raise KsftSkipEx("Need at least two CPUs to test symmetric RSS hash") + + input_xfrm = cfg.ethnl.rss_get( + {'header': {'dev-name': cfg.ifname}}).get('input_xfrm') + + # Check for symmetric xor/or-xor + if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2): + raise KsftSkipEx("Symmetric RSS hash not requested") + + cpus = set() + successful = 0 + for _ in range(100): + try: + port1 = rand_port(socket.SOCK_DGRAM) + port2 = rand_port(socket.SOCK_DGRAM) + cpu1 = traffic(cfg, port1, port2, ipver) + cpu2 = traffic(cfg, port2, port1, ipver) + cpus.update([cpu1, cpu2]) + ksft_eq( + cpu1, cpu2, comment=f"Received traffic on different cpus with ports ({port1 = }, {port2 = }) while symmetric hash is configured") + + successful += 1 + if successful == 10: + break + except: + continue + else: + raise KsftFailEx("Failed to run traffic") + + ksft_ge(len(cpus), 2, + comment=f"Received traffic on less than two cpus {cpus = }") + + +def test_rss_input_xfrm_ipv4(cfg): + cfg.require_ipver("4") + test_rss_input_xfrm(cfg, "4") + + +def test_rss_input_xfrm_ipv6(cfg): + cfg.require_ipver("6") + test_rss_input_xfrm(cfg, "6") + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + cfg.ethnl = EthtoolFamily() + cfg.netdevnl = NetdevFamily() + + ksft_run([test_rss_input_xfrm_ipv4, test_rss_input_xfrm_ipv6], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/tso.py b/tools/testing/selftests/drivers/net/hw/tso.py new file mode 100755 index 000000000000..e1ecb92f79d9 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/tso.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Run the tools/testing/selftests/net/csum testsuite.""" + +import fcntl +import socket +import struct +import termios +import time + +from lib.py import ksft_pr, ksft_run, ksft_exit, KsftSkipEx, KsftXfailEx +from lib.py import ksft_eq, ksft_ge, ksft_lt +from lib.py import EthtoolFamily, NetdevFamily, NetDrvEpEnv +from lib.py import bkg, cmd, defer, ethtool, ip, rand_port, wait_port_listen + + +def sock_wait_drain(sock, max_wait=1000): + """Wait for all pending write data on the socket to get ACKed.""" + for _ in range(max_wait): + one = b'\0' * 4 + outq = fcntl.ioctl(sock.fileno(), termios.TIOCOUTQ, one) + outq = struct.unpack("I", outq)[0] + if outq == 0: + break + time.sleep(0.01) + ksft_eq(outq, 0) + + +def tcp_sock_get_retrans(sock): + """Get the number of retransmissions for the TCP socket.""" + info = sock.getsockopt(socket.SOL_TCP, socket.TCP_INFO, 512) + return struct.unpack("I", info[100:104])[0] + + +def run_one_stream(cfg, ipver, remote_v4, remote_v6, should_lso): + cfg.require_cmd("socat", remote=True) + + port = rand_port() + listen_cmd = f"socat -{ipver} -t 2 -u TCP-LISTEN:{port},reuseport /dev/null,ignoreeof" + + with bkg(listen_cmd, host=cfg.remote) as nc: + wait_port_listen(port, host=cfg.remote) + + if ipver == "4": + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((remote_v4, port)) + else: + sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) + sock.connect((remote_v6, port)) + + # Small send to make sure the connection is working. + sock.send("ping".encode()) + sock_wait_drain(sock) + + # Send 4MB of data, record the LSO packet count. + qstat_old = cfg.netnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + buf = b"0" * 1024 * 1024 * 4 + sock.send(buf) + sock_wait_drain(sock) + qstat_new = cfg.netnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0] + + # No math behind the 10 here, but try to catch cases where + # TCP falls back to non-LSO. + ksft_lt(tcp_sock_get_retrans(sock), 10) + sock.close() + + # Check that at least 90% of the data was sent as LSO packets. + # System noise may cause false negatives. Also header overheads + # will add up to 5% of extra packes... The check is best effort. + total_lso_wire = len(buf) * 0.90 // cfg.dev["mtu"] + total_lso_super = len(buf) * 0.90 // cfg.dev["tso_max_size"] + if should_lso: + if cfg.have_stat_super_count: + ksft_ge(qstat_new['tx-hw-gso-packets'] - + qstat_old['tx-hw-gso-packets'], + total_lso_super, + comment="Number of LSO super-packets with LSO enabled") + if cfg.have_stat_wire_count: + ksft_ge(qstat_new['tx-hw-gso-wire-packets'] - + qstat_old['tx-hw-gso-wire-packets'], + total_lso_wire, + comment="Number of LSO wire-packets with LSO enabled") + else: + if cfg.have_stat_super_count: + ksft_lt(qstat_new['tx-hw-gso-packets'] - + qstat_old['tx-hw-gso-packets'], + 15, comment="Number of LSO super-packets with LSO disabled") + if cfg.have_stat_wire_count: + ksft_lt(qstat_new['tx-hw-gso-wire-packets'] - + qstat_old['tx-hw-gso-wire-packets'], + 500, comment="Number of LSO wire-packets with LSO disabled") + + +def build_tunnel(cfg, outer_ipver, tun_info): + local_v4 = NetDrvEpEnv.nsim_v4_pfx + "1" + local_v6 = NetDrvEpEnv.nsim_v6_pfx + "1" + remote_v4 = NetDrvEpEnv.nsim_v4_pfx + "2" + remote_v6 = NetDrvEpEnv.nsim_v6_pfx + "2" + + local_addr = cfg.addr_v[outer_ipver] + remote_addr = cfg.remote_addr_v[outer_ipver] + + tun_type = tun_info[0] + tun_arg = tun_info[2] + ip(f"link add {tun_type}-ksft type {tun_type} {tun_arg} local {local_addr} remote {remote_addr} dev {cfg.ifname}") + defer(ip, f"link del {tun_type}-ksft") + ip(f"link set dev {tun_type}-ksft up") + ip(f"addr add {local_v4}/24 dev {tun_type}-ksft") + ip(f"addr add {local_v6}/64 dev {tun_type}-ksft") + + ip(f"link add {tun_type}-ksft type {tun_type} {tun_arg} local {remote_addr} remote {local_addr} dev {cfg.remote_ifname}", + host=cfg.remote) + defer(ip, f"link del {tun_type}-ksft", host=cfg.remote) + ip(f"link set dev {tun_type}-ksft up", host=cfg.remote) + ip(f"addr add {remote_v4}/24 dev {tun_type}-ksft", host=cfg.remote) + ip(f"addr add {remote_v6}/64 dev {tun_type}-ksft", host=cfg.remote) + + return remote_v4, remote_v6 + + +def test_builder(name, cfg, outer_ipver, feature, tun=None, inner_ipver=None): + """Construct specific tests from the common template.""" + def f(cfg): + cfg.require_ipver(outer_ipver) + + if not cfg.have_stat_super_count and \ + not cfg.have_stat_wire_count: + raise KsftSkipEx(f"Device does not support LSO queue stats") + + ipver = outer_ipver + if tun: + remote_v4, remote_v6 = build_tunnel(cfg, ipver, tun) + ipver = inner_ipver + else: + remote_v4 = cfg.remote_addr_v["4"] + remote_v6 = cfg.remote_addr_v["6"] + + tun_partial = tun and tun[1] + # Tunnel which can silently fall back to gso-partial + has_gso_partial = tun and 'tx-gso-partial' in cfg.features + + # For TSO4 via partial we need mangleid + if ipver == "4" and feature in cfg.partial_features: + ksft_pr("Testing with mangleid enabled") + if 'tx-tcp-mangleid-segmentation' not in cfg.features: + ethtool(f"-K {cfg.ifname} tx-tcp-mangleid-segmentation on") + defer(ethtool, f"-K {cfg.ifname} tx-tcp-mangleid-segmentation off") + + # First test without the feature enabled. + ethtool(f"-K {cfg.ifname} {feature} off") + if has_gso_partial: + ethtool(f"-K {cfg.ifname} tx-gso-partial off") + run_one_stream(cfg, ipver, remote_v4, remote_v6, should_lso=False) + + # Now test with the feature enabled. + # For compatible tunnels only - just GSO partial, not specific feature. + if has_gso_partial: + ethtool(f"-K {cfg.ifname} tx-gso-partial on") + run_one_stream(cfg, ipver, remote_v4, remote_v6, + should_lso=tun_partial) + + # Full feature enabled. + if feature in cfg.features: + ethtool(f"-K {cfg.ifname} {feature} on") + run_one_stream(cfg, ipver, remote_v4, remote_v6, should_lso=True) + else: + raise KsftXfailEx(f"Device does not support {feature}") + + f.__name__ = name + ((outer_ipver + "_") if tun else "") + "ipv" + inner_ipver + return f + + +def query_nic_features(cfg) -> None: + """Query and cache the NIC features.""" + cfg.have_stat_super_count = False + cfg.have_stat_wire_count = False + + cfg.features = set() + features = cfg.ethnl.features_get({"header": {"dev-index": cfg.ifindex}}) + for f in features["active"]["bits"]["bit"]: + cfg.features.add(f["name"]) + + # Check which features are supported via GSO partial + cfg.partial_features = set() + if 'tx-gso-partial' in cfg.features: + ethtool(f"-K {cfg.ifname} tx-gso-partial off") + + no_partial = set() + features = cfg.ethnl.features_get({"header": {"dev-index": cfg.ifindex}}) + for f in features["active"]["bits"]["bit"]: + no_partial.add(f["name"]) + cfg.partial_features = cfg.features - no_partial + ethtool(f"-K {cfg.ifname} tx-gso-partial on") + + stats = cfg.netnl.qstats_get({"ifindex": cfg.ifindex}, dump=True) + if stats: + if 'tx-hw-gso-packets' in stats[0]: + ksft_pr("Detected qstat for LSO super-packets") + cfg.have_stat_super_count = True + if 'tx-hw-gso-wire-packets' in stats[0]: + ksft_pr("Detected qstat for LSO wire-packets") + cfg.have_stat_wire_count = True + + +def main() -> None: + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + cfg.ethnl = EthtoolFamily() + cfg.netnl = NetdevFamily() + + query_nic_features(cfg) + + test_info = ( + # name, v4/v6 ethtool_feature tun:(type, partial, args) + ("", "4", "tx-tcp-segmentation", None), + ("", "6", "tx-tcp6-segmentation", None), + ("vxlan", "", "tx-udp_tnl-segmentation", ("vxlan", True, "id 100 dstport 4789 noudpcsum")), + ("vxlan_csum", "", "tx-udp_tnl-csum-segmentation", ("vxlan", False, "id 100 dstport 4789 udpcsum")), + ("gre", "4", "tx-gre-segmentation", ("ipgre", False, "")), + ("gre", "6", "tx-gre-segmentation", ("ip6gre", False, "")), + ) + + cases = [] + for outer_ipver in ["4", "6"]: + for info in test_info: + # Skip if test which only works for a specific IP version + if info[1] and outer_ipver != info[1]: + continue + + cases.append(test_builder(info[0], cfg, outer_ipver, info[2], + tun=info[3], inner_ipver="4")) + if info[3]: + cases.append(test_builder(info[0], cfg, outer_ipver, info[2], + tun=info[3], inner_ipver="6")) + + ksft_run(cases=cases, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/hw/xdp_dummy.bpf.c b/tools/testing/selftests/drivers/net/hw/xdp_dummy.bpf.c new file mode 100644 index 000000000000..d988b2e0cee8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/xdp_dummy.bpf.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define KBUILD_MODNAME "xdp_dummy" +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +SEC("xdp") +int xdp_dummy_prog(struct xdp_md *ctx) +{ + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/drivers/net/lib/py/env.py b/tools/testing/selftests/drivers/net/lib/py/env.py index 987e452d3a45..fd4d674e6c72 100644 --- a/tools/testing/selftests/drivers/net/lib/py/env.py +++ b/tools/testing/selftests/drivers/net/lib/py/env.py @@ -10,42 +10,68 @@ from lib.py import NetNS, NetdevSimDev from .remote import Remote -def _load_env_file(src_path): - env = os.environ.copy() +class NetDrvEnvBase: + """ + Base class for a NIC / host envirnoments + """ + def __init__(self, src_path): + self.src_path = src_path + self.env = self._load_env_file() - src_dir = Path(src_path).parent.resolve() - if not (src_dir / "net.config").exists(): + def rpath(self, path): + """ + Get an absolute path to a file based on a path relative to the directory + containing the test which constructed env. + + For example, if the test.py is in the same directory as + a binary (built from helper.c), the test can use env.rpath("helper") + to get the absolute path to the binary + """ + src_dir = Path(self.src_path).parent.resolve() + return (src_dir / path).as_posix() + + def _load_env_file(self): + env = os.environ.copy() + + src_dir = Path(self.src_path).parent.resolve() + if not (src_dir / "net.config").exists(): + return ksft_setup(env) + + with open((src_dir / "net.config").as_posix(), 'r') as fp: + for line in fp.readlines(): + full_file = line + # Strip comments + pos = line.find("#") + if pos >= 0: + line = line[:pos] + line = line.strip() + if not line: + continue + pair = line.split('=', maxsplit=1) + if len(pair) != 2: + raise Exception("Can't parse configuration line:", full_file) + env[pair[0]] = pair[1] return ksft_setup(env) - with open((src_dir / "net.config").as_posix(), 'r') as fp: - for line in fp.readlines(): - full_file = line - # Strip comments - pos = line.find("#") - if pos >= 0: - line = line[:pos] - line = line.strip() - if not line: - continue - pair = line.split('=', maxsplit=1) - if len(pair) != 2: - raise Exception("Can't parse configuration line:", full_file) - env[pair[0]] = pair[1] - return ksft_setup(env) - - -class NetDrvEnv: + +class NetDrvEnv(NetDrvEnvBase): """ Class for a single NIC / host env, with no remote end """ - def __init__(self, src_path, **kwargs): - self._ns = None + def __init__(self, src_path, nsim_test=None, **kwargs): + super().__init__(src_path) - self.env = _load_env_file(src_path) + self._ns = None if 'NETIF' in self.env: - self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + if nsim_test is True: + raise KsftXfailEx("Test only works on netdevsim") + + self.dev = ip("-d link show dev " + self.env['NETIF'], json=True)[0] else: + if nsim_test is False: + raise KsftXfailEx("Test does not work on netdevsim") + self._ns = NetdevSimDev(**kwargs) self.dev = self._ns.nsims[0].dev self.ifname = self.dev['ifname'] @@ -68,7 +94,7 @@ class NetDrvEnv: self._ns = None -class NetDrvEpEnv: +class NetDrvEpEnv(NetDrvEnvBase): """ Class for an environment with a local device and "remote endpoint" which can be used to send traffic in. @@ -82,8 +108,7 @@ class NetDrvEpEnv: nsim_v6_pfx = "2001:db8::" def __init__(self, src_path, nsim_test=None): - - self.env = _load_env_file(src_path) + super().__init__(src_path) self._stats_settle_time = None @@ -94,17 +119,20 @@ class NetDrvEpEnv: self._ns = None self._ns_peer = None + self.addr_v = { "4": None, "6": None } + self.remote_addr_v = { "4": None, "6": None } + if "NETIF" in self.env: if nsim_test is True: raise KsftXfailEx("Test only works on netdevsim") self._check_env() - self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0] + self.dev = ip("-d link show dev " + self.env['NETIF'], json=True)[0] - self.v4 = self.env.get("LOCAL_V4") - self.v6 = self.env.get("LOCAL_V6") - self.remote_v4 = self.env.get("REMOTE_V4") - self.remote_v6 = self.env.get("REMOTE_V6") + self.addr_v["4"] = self.env.get("LOCAL_V4") + self.addr_v["6"] = self.env.get("LOCAL_V6") + self.remote_addr_v["4"] = self.env.get("REMOTE_V4") + self.remote_addr_v["6"] = self.env.get("REMOTE_V6") kind = self.env["REMOTE_TYPE"] args = self.env["REMOTE_ARGS"] else: @@ -115,26 +143,29 @@ class NetDrvEpEnv: self.dev = self._ns.nsims[0].dev - self.v4 = self.nsim_v4_pfx + "1" - self.v6 = self.nsim_v6_pfx + "1" - self.remote_v4 = self.nsim_v4_pfx + "2" - self.remote_v6 = self.nsim_v6_pfx + "2" + self.addr_v["4"] = self.nsim_v4_pfx + "1" + self.addr_v["6"] = self.nsim_v6_pfx + "1" + self.remote_addr_v["4"] = self.nsim_v4_pfx + "2" + self.remote_addr_v["6"] = self.nsim_v6_pfx + "2" kind = "netns" args = self._netns.name self.remote = Remote(kind, args, src_path) - self.addr = self.v6 if self.v6 else self.v4 - self.remote_addr = self.remote_v6 if self.remote_v6 else self.remote_v4 + self.addr_ipver = "6" if self.addr_v["6"] else "4" + self.addr = self.addr_v[self.addr_ipver] + self.remote_addr = self.remote_addr_v[self.addr_ipver] - self.addr_ipver = "6" if self.v6 else "4" # Bracketed addresses, some commands need IPv6 to be inside [] - self.baddr = f"[{self.v6}]" if self.v6 else self.v4 - self.remote_baddr = f"[{self.remote_v6}]" if self.remote_v6 else self.remote_v4 + self.baddr = f"[{self.addr_v['6']}]" if self.addr_v["6"] else self.addr_v["4"] + self.remote_baddr = f"[{self.remote_addr_v['6']}]" if self.remote_addr_v["6"] else self.remote_addr_v["4"] self.ifname = self.dev['ifname'] self.ifindex = self.dev['ifindex'] + # resolve remote interface name + self.remote_ifname = self.resolve_remote_ifc() + self._required_cmd = {} def create_local(self): @@ -181,6 +212,18 @@ class NetDrvEpEnv: raise Exception("Invalid environment, missing configuration:", missing, "Please see tools/testing/selftests/drivers/net/README.rst") + def resolve_remote_ifc(self): + v4 = v6 = None + if self.remote_addr_v["4"]: + v4 = ip("addr show to " + self.remote_addr_v["4"], json=True, host=self.remote) + if self.remote_addr_v["6"]: + v6 = ip("addr show to " + self.remote_addr_v["6"], json=True, host=self.remote) + if v4 and v6 and v4[0]["ifname"] != v6[0]["ifname"]: + raise Exception("Can't resolve remote interface name, v4 and v6 don't match") + if (v4 and len(v4) > 1) or (v6 and len(v6) > 1): + raise Exception("Can't resolve remote interface name, multiple interfaces match") + return v6[0]["ifname"] if v6 else v4[0]["ifname"] + def __enter__(self): return self @@ -204,13 +247,9 @@ class NetDrvEpEnv: del self.remote self.remote = None - def require_v4(self): - if not self.v4 or not self.remote_v4: - raise KsftSkipEx("Test requires IPv4 connectivity") - - def require_v6(self): - if not self.v6 or not self.remote_v6: - raise KsftSkipEx("Test requires IPv6 connectivity") + def require_ipver(self, ipver): + if not self.addr_v[ipver] or not self.remote_addr_v[ipver]: + raise KsftSkipEx(f"Test requires IPv{ipver} connectivity") def _require_cmd(self, comm, key, host=None): cached = self._required_cmd.get(comm, {}) diff --git a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh index 3acaba41ac7b..3c96b022954d 100644 --- a/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh +++ b/tools/testing/selftests/drivers/net/lib/sh/lib_netcons.sh @@ -110,6 +110,13 @@ function create_dynamic_target() { echo 1 > "${NETCONS_PATH}"/enabled } +# Do not append the release to the header of the message +function disable_release_append() { + echo 0 > "${NETCONS_PATH}"/enabled + echo 0 > "${NETCONS_PATH}"/release + echo 1 > "${NETCONS_PATH}"/enabled +} + function cleanup() { local NSIM_DEV_SYS_DEL="/sys/bus/netdevsim/del_device" @@ -223,3 +230,20 @@ function check_for_dependencies() { exit "${ksft_skip}" fi } + +function check_for_taskset() { + if ! which taskset > /dev/null ; then + echo "SKIP: taskset(1) is not available" >&2 + exit "${ksft_skip}" + fi +} + +# This is necessary if running multiple tests in a row +function pkill_socat() { + PROCESS_NAME="socat UDP-LISTEN:6666,fork ${OUTPUT_FILE}" + # socat runs under timeout(1), kill it if it is still alive + # do not fail if socat doesn't exist anymore + set +e + pkill -f "${PROCESS_NAME}" + set -e +} diff --git a/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh new file mode 100755 index 000000000000..4a71e01a230c --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_fragmented_msg.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# Test netconsole's message fragmentation functionality. +# +# When a message exceeds the maximum packet size, netconsole splits it into +# multiple fragments for transmission. This test verifies: +# - Correct fragmentation of large messages +# - Proper reassembly of fragments at the receiver +# - Preservation of userdata across fragments +# - Behavior with and without kernel release version appending +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# The content of kmsg will be save to the following file +OUTPUT_FILE="/tmp/${TARGET}" + +# set userdata to a long value. In this case, it is "1-2-3-4...50-" +USERDATA_VALUE=$(printf -- '%.2s-' {1..60}) + +# Convert the header string in a regexp, so, we can remove +# the second header as well. +# A header looks like "13,468,514729715,-,ncfrag=0/1135;". If +# release is appended, you might find something like:L +# "6.13.0-04048-g4f561a87745a,13,468,514729715,-,ncfrag=0/1135;" +function header_to_regex() { + # header is everything before ; + local HEADER="${1}" + REGEX=$(echo "${HEADER}" | cut -d'=' -f1) + echo "${REGEX}=[0-9]*\/[0-9]*;" +} + +# We have two headers in the message. Remove both to get the full message, +# and extract the full message. +function extract_msg() { + local MSGFILE="${1}" + # Extract the header, which is the very first thing that arrives in the + # first list. + HEADER=$(sed -n '1p' "${MSGFILE}" | cut -d';' -f1) + HEADER_REGEX=$(header_to_regex "${HEADER}") + + # Remove the two headers from the received message + # This will return the message without any header, similarly to what + # was sent. + sed "s/""${HEADER_REGEX}""//g" "${MSGFILE}" +} + +# Validate the message, which has two messages glued together. +# unwrap them to make sure all the characters were transmitted. +# File will look like the following: +# 13,468,514729715,-,ncfrag=0/1135;<message> +# key=<part of key>-13,468,514729715,-,ncfrag=967/1135;<rest of the key> +function validate_fragmented_result() { + # Discard the netconsole headers, and assemble the full message + RCVMSG=$(extract_msg "${1}") + + # check for the main message + if ! echo "${RCVMSG}" | grep -q "${MSG}"; then + echo "Message body doesn't match." >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + + # check userdata + if ! echo "${RCVMSG}" | grep -q "${USERDATA_VALUE}"; then + echo "message userdata doesn't match" >&2 + echo "msg received=" "${RCVMSG}" >&2 + exit "${ksft_fail}" + fi + # test passed. hooray +} + +# Check for basic system dependency and exit if not found +check_for_dependencies +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target +# Set userdata "key" with the "value" value +set_user_data + + +# TEST 1: Send message and userdata. They will fragment +# ======= +MSG=$(printf -- 'MSG%.3s=' {1..150}) + +# Listen for netconsole port inside the namespace and destination interface +listen_port_and_save_to "${OUTPUT_FILE}" & +# Wait for socat to start and listen to the port. +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +# Send the message +echo "${MSG}: ${TARGET}" > /dev/kmsg +# Wait until socat saves the file to disk +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +# Check if the message was not corrupted +validate_fragmented_result "${OUTPUT_FILE}" + +# TEST 2: Test with smaller message, and without release appended +# ======= +MSG=$(printf -- 'FOOBAR%.3s=' {1..100}) +# Let's disable release and test again. +disable_release_append + +listen_port_and_save_to "${OUTPUT_FILE}" & +wait_local_port_listen "${NAMESPACE}" "${PORT}" udp +echo "${MSG}: ${TARGET}" > /dev/kmsg +busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +validate_fragmented_result "${OUTPUT_FILE}" +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/netcons_sysdata.sh b/tools/testing/selftests/drivers/net/netcons_sysdata.sh new file mode 100755 index 000000000000..a737e377bf08 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netcons_sysdata.sh @@ -0,0 +1,242 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: GPL-2.0 + +# A test that makes sure that sysdata runtime CPU data is properly set +# when a message is sent. +# +# There are 3 different tests, every time sent using a random CPU. +# - Test #1 +# * Only enable cpu_nr sysdata feature. +# - Test #2 +# * Keep cpu_nr sysdata feature enable and enable userdata. +# - Test #3 +# * keep userdata enabled, and disable sysdata cpu_nr feature. +# +# Author: Breno Leitao <leitao@debian.org> + +set -euo pipefail + +SCRIPTDIR=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") + +source "${SCRIPTDIR}"/lib/sh/lib_netcons.sh + +# Enable the sysdata cpu_nr feature +function set_cpu_nr() { + if [[ ! -f "${NETCONS_PATH}/userdata/cpu_nr_enabled" ]] + then + echo "Populate CPU configfs path not available in ${NETCONS_PATH}/userdata/cpu_nr_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" +} + +# Enable the taskname to be appended to sysdata +function set_taskname() { + if [[ ! -f "${NETCONS_PATH}/userdata/taskname_enabled" ]] + then + echo "Not able to enable taskname sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/taskname_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/taskname_enabled" +} + +# Enable the release to be appended to sysdata +function set_release() { + if [[ ! -f "${NETCONS_PATH}/userdata/release_enabled" ]] + then + echo "Not able to enable release sysdata append. Configfs not available in ${NETCONS_PATH}/userdata/release_enabled" >&2 + exit "${ksft_skip}" + fi + + echo 1 > "${NETCONS_PATH}/userdata/release_enabled" +} + +# Disable the sysdata cpu_nr feature +function unset_cpu_nr() { + echo 0 > "${NETCONS_PATH}/userdata/cpu_nr_enabled" +} + +# Once called, taskname=<..> will not be appended anymore +function unset_taskname() { + echo 0 > "${NETCONS_PATH}/userdata/taskname_enabled" +} + +function unset_release() { + echo 0 > "${NETCONS_PATH}/userdata/release_enabled" +} + +# Test if MSG contains sysdata +function validate_sysdata() { + # OUTPUT_FILE will contain something like: + # 6.11.1-0_fbk0_rc13_509_g30d75cea12f7,13,1822,115075213798,-;netconsole selftest: netcons_gtJHM + # userdatakey=userdatavalue + # cpu=X + # taskname=<taskname> + + # Echo is what this test uses to create the message. See runtest() + # function + SENDER="echo" + + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then + echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + # Check if cpu=XX exists in the file and matches the one used + # in taskset(1) + if ! grep -q "cpu=${CPU}\+" "${OUTPUT_FILE}"; then + echo "FAIL: 'cpu=${CPU}' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "taskname=${SENDER}" "${OUTPUT_FILE}"; then + echo "FAIL: 'taskname=echo' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + rm "${OUTPUT_FILE}" + pkill_socat +} + +function validate_release() { + RELEASE=$(uname -r) + + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "release=${RELEASE}" "${OUTPUT_FILE}"; then + echo "FAIL: 'release=${RELEASE}' not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi +} + +# Test if MSG content exists in OUTPUT_FILE but no `cpu=` and `taskname=` +# strings +function validate_no_sysdata() { + if [ ! -f "$OUTPUT_FILE" ]; then + echo "FAIL: File was not generated." >&2 + exit "${ksft_fail}" + fi + + if ! grep -q "${MSG}" "${OUTPUT_FILE}"; then + echo "FAIL: ${MSG} not found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "cpu=" "${OUTPUT_FILE}"; then + echo "FAIL: 'cpu= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "taskname=" "${OUTPUT_FILE}"; then + echo "FAIL: 'taskname= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + if grep -q "release=" "${OUTPUT_FILE}"; then + echo "FAIL: 'release= found in ${OUTPUT_FILE}" >&2 + cat "${OUTPUT_FILE}" >&2 + exit "${ksft_fail}" + fi + + rm "${OUTPUT_FILE}" +} + +# Start socat, send the message and wait for the file to show up in the file +# system +function runtest { + # Listen for netconsole port inside the namespace and destination + # interface + listen_port_and_save_to "${OUTPUT_FILE}" & + # Wait for socat to start and listen to the port. + wait_local_port_listen "${NAMESPACE}" "${PORT}" udp + # Send the message + taskset -c "${CPU}" echo "${MSG}: ${TARGET}" > /dev/kmsg + # Wait until socat saves the file to disk + busywait "${BUSYWAIT_TIMEOUT}" test -s "${OUTPUT_FILE}" +} + +# ========== # +# Start here # +# ========== # + +modprobe netdevsim 2> /dev/null || true +modprobe netconsole 2> /dev/null || true + +# Check for basic system dependency and exit if not found +check_for_dependencies +# This test also depends on taskset(1). Check for it before starting the test +check_for_taskset + +# Set current loglevel to KERN_INFO(6), and default to KERN_NOTICE(5) +echo "6 5" > /proc/sys/kernel/printk +# Remove the namespace, interfaces and netconsole target on exit +trap cleanup EXIT +# Create one namespace and two interfaces +set_network +# Create a dynamic target for netconsole +create_dynamic_target + +#==================================================== +# TEST #1 +# Send message from a random CPU +#==================================================== +# Random CPU in the system +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_1" +MSG="Test #1 from CPU${CPU}" +# Enable the auto population of cpu_nr +set_cpu_nr +# Enable taskname to be appended to sysdata +set_taskname +set_release +runtest +# Make sure the message was received in the dst part +# and exit +validate_release +validate_sysdata + +#==================================================== +# TEST #2 +# This test now adds userdata together with sysdata +# =================================================== +# Get a new random CPU +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_2" +MSG="Test #2 from CPU${CPU}" +set_user_data +runtest +validate_release +validate_sysdata + +# =================================================== +# TEST #3 +# Unset all sysdata, fail if any userdata is set +# =================================================== +CPU=$((RANDOM % $(nproc))) +OUTPUT_FILE="/tmp/${TARGET}_3" +MSG="Test #3 from CPU${CPU}" +unset_cpu_nr +unset_taskname +unset_release +runtest +# At this time, cpu= shouldn't be present in the msg +validate_no_sysdata + +exit "${ksft_pass}" diff --git a/tools/testing/selftests/drivers/net/ping.py b/tools/testing/selftests/drivers/net/ping.py index fc69bfcc37c4..93120e86e102 100755 --- a/tools/testing/selftests/drivers/net/ping.py +++ b/tools/testing/selftests/drivers/net/ping.py @@ -13,20 +13,20 @@ remote_ifname="" no_sleep=False def _test_v4(cfg) -> None: - cfg.require_v4() + cfg.require_ipver("4") - cmd(f"ping -c 1 -W0.5 {cfg.remote_v4}") - cmd(f"ping -c 1 -W0.5 {cfg.v4}", host=cfg.remote) - cmd(f"ping -s 65000 -c 1 -W0.5 {cfg.remote_v4}") - cmd(f"ping -s 65000 -c 1 -W0.5 {cfg.v4}", host=cfg.remote) + cmd("ping -c 1 -W0.5 " + cfg.remote_addr_v["4"]) + cmd("ping -c 1 -W0.5 " + cfg.addr_v["4"], host=cfg.remote) + cmd("ping -s 65000 -c 1 -W0.5 " + cfg.remote_addr_v["4"]) + cmd("ping -s 65000 -c 1 -W0.5 " + cfg.addr_v["4"], host=cfg.remote) def _test_v6(cfg) -> None: - cfg.require_v6() + cfg.require_ipver("6") - cmd(f"ping -c 1 -W5 {cfg.remote_v6}") - cmd(f"ping -c 1 -W5 {cfg.v6}", host=cfg.remote) - cmd(f"ping -s 65000 -c 1 -W0.5 {cfg.remote_v6}") - cmd(f"ping -s 65000 -c 1 -W0.5 {cfg.v6}", host=cfg.remote) + cmd("ping -c 1 -W5 " + cfg.remote_addr_v["6"]) + cmd("ping -c 1 -W5 " + cfg.addr_v["6"], host=cfg.remote) + cmd("ping -s 65000 -c 1 -W0.5 " + cfg.remote_addr_v["6"]) + cmd("ping -s 65000 -c 1 -W0.5 " + cfg.addr_v["6"], host=cfg.remote) def _test_tcp(cfg) -> None: cfg.require_cmd("socat", remote=True) @@ -126,7 +126,7 @@ def get_interface_info(cfg) -> None: global remote_ifname global no_sleep - remote_info = cmd(f"ip -4 -o addr show to {cfg.remote_v4} | awk '{{print $2}}'", shell=True, host=cfg.remote).stdout + remote_info = cmd(f"ip -4 -o addr show to {cfg.remote_addr_v['4']} | awk '{{print $2}}'", shell=True, host=cfg.remote).stdout remote_ifname = remote_info.rstrip('\n') if remote_ifname == "": raise KsftFailEx('Can not get remote interface') diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 8a518905a9f9..cae923f84f69 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -2,13 +2,15 @@ # SPDX-License-Identifier: GPL-2.0 from lib.py import ksft_disruptive, ksft_exit, ksft_run -from lib.py import ksft_eq, ksft_raises, KsftSkipEx +from lib.py import ksft_eq, ksft_not_in, ksft_raises, KsftSkipEx, KsftFailEx from lib.py import EthtoolFamily, NetdevFamily, NlError from lib.py import NetDrvEnv -from lib.py import cmd, defer, ip +from lib.py import bkg, cmd, defer, ip import errno import glob - +import os +import socket +import struct def sys_get_queues(ifname, qtype='rx') -> int: folders = glob.glob(f'/sys/class/net/{ifname}/queues/{qtype}-*') @@ -22,6 +24,40 @@ def nl_get_queues(cfg, nl, qtype='rx'): return None +def check_xsk(cfg, nl, xdp_queue_id=0) -> None: + # Probe for support + xdp = cmd(cfg.rpath("xdp_helper") + ' - -', fail=False) + if xdp.ret == 255: + raise KsftSkipEx('AF_XDP unsupported') + elif xdp.ret > 0: + raise KsftFailEx('unable to create AF_XDP socket') + + with bkg(f'{cfg.rpath("xdp_helper")} {cfg.ifindex} {xdp_queue_id}', + ksft_wait=3): + + rx = tx = False + + queues = nl.queue_get({'ifindex': cfg.ifindex}, dump=True) + if not queues: + raise KsftSkipEx("Netlink reports no queues") + + for q in queues: + if q['id'] == 0: + if q['type'] == 'rx': + rx = True + if q['type'] == 'tx': + tx = True + + ksft_eq(q.get('xsk', None), {}, + comment="xsk attr on queue we configured") + else: + ksft_not_in('xsk', q, + comment="xsk attr on queue we didn't configure") + + ksft_eq(rx, True) + ksft_eq(tx, True) + + def get_queues(cfg, nl) -> None: snl = NetdevFamily(recv_size=4096) @@ -80,7 +116,8 @@ def check_down(cfg, nl) -> None: def main() -> None: with NetDrvEnv(__file__, queue_count=100) as cfg: - ksft_run([get_queues, addremove_queues, check_down], args=(cfg, NetdevFamily())) + ksft_run([get_queues, addremove_queues, check_down, check_xsk], + args=(cfg, NetdevFamily())) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/xdp_helper.c b/tools/testing/selftests/drivers/net/xdp_helper.c new file mode 100644 index 000000000000..aeed25914104 --- /dev/null +++ b/tools/testing/selftests/drivers/net/xdp_helper.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <linux/if_xdp.h> +#include <linux/if_link.h> +#include <net/if.h> +#include <inttypes.h> + +#define UMEM_SZ (1U << 16) +#define NUM_DESC (UMEM_SZ / 2048) + +/* Move this to a common header when reused! */ +static void ksft_ready(void) +{ + const char msg[7] = "ready\n"; + char *env_str; + int fd; + + env_str = getenv("KSFT_READY_FD"); + if (env_str) { + fd = atoi(env_str); + if (!fd) { + fprintf(stderr, "invalid KSFT_READY_FD = '%s'\n", + env_str); + return; + } + } else { + fd = STDOUT_FILENO; + } + + write(fd, msg, sizeof(msg)); + if (fd != STDOUT_FILENO) + close(fd); +} + +static void ksft_wait(void) +{ + char *env_str; + char byte; + int fd; + + env_str = getenv("KSFT_WAIT_FD"); + if (env_str) { + fd = atoi(env_str); + if (!fd) { + fprintf(stderr, "invalid KSFT_WAIT_FD = '%s'\n", + env_str); + return; + } + } else { + /* Not running in KSFT env, wait for input from STDIN instead */ + fd = STDIN_FILENO; + } + + read(fd, &byte, sizeof(byte)); + if (fd != STDIN_FILENO) + close(fd); +} + +/* this is a simple helper program that creates an XDP socket and does the + * minimum necessary to get bind() to succeed. + * + * this test program is not intended to actually process packets, but could be + * extended in the future if that is actually needed. + * + * it is used by queues.py to ensure the xsk netlinux attribute is set + * correctly. + */ +int main(int argc, char **argv) +{ + struct xdp_umem_reg umem_reg = { 0 }; + struct sockaddr_xdp sxdp = { 0 }; + int num_desc = NUM_DESC; + void *umem_area; + int ifindex; + int sock_fd; + int queue; + + if (argc != 3) { + fprintf(stderr, "Usage: %s ifindex queue_id\n", argv[0]); + return 1; + } + + sock_fd = socket(AF_XDP, SOCK_RAW, 0); + if (sock_fd < 0) { + perror("socket creation failed"); + /* if the kernel doesn't support AF_XDP, let the test program + * know with -1. All other error paths return 1. + */ + if (errno == EAFNOSUPPORT) + return -1; + return 1; + } + + /* "Probing mode", just checking if AF_XDP sockets are supported */ + if (!strcmp(argv[1], "-") && !strcmp(argv[2], "-")) { + printf("AF_XDP support detected\n"); + close(sock_fd); + return 0; + } + + ifindex = atoi(argv[1]); + queue = atoi(argv[2]); + + umem_area = mmap(NULL, UMEM_SZ, PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (umem_area == MAP_FAILED) { + perror("mmap failed"); + return 1; + } + + umem_reg.addr = (uintptr_t)umem_area; + umem_reg.len = UMEM_SZ; + umem_reg.chunk_size = 2048; + umem_reg.headroom = 0; + + setsockopt(sock_fd, SOL_XDP, XDP_UMEM_REG, &umem_reg, + sizeof(umem_reg)); + setsockopt(sock_fd, SOL_XDP, XDP_UMEM_FILL_RING, &num_desc, + sizeof(num_desc)); + setsockopt(sock_fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &num_desc, + sizeof(num_desc)); + setsockopt(sock_fd, SOL_XDP, XDP_RX_RING, &num_desc, sizeof(num_desc)); + + sxdp.sxdp_family = AF_XDP; + sxdp.sxdp_ifindex = ifindex; + sxdp.sxdp_queue_id = queue; + sxdp.sxdp_flags = 0; + + if (bind(sock_fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) != 0) { + munmap(umem_area, UMEM_SZ); + perror("bind failed"); + close(sock_fd); + return 1; + } + + ksft_ready(); + ksft_wait(); + + /* parent program will write a byte to stdin when its ready for this + * helper to exit + */ + + close(sock_fd); + return 0; +} diff --git a/tools/testing/selftests/filesystems/mount-notify/.gitignore b/tools/testing/selftests/filesystems/mount-notify/.gitignore new file mode 100644 index 000000000000..82a4846cbc4b --- /dev/null +++ b/tools/testing/selftests/filesystems/mount-notify/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/*_test diff --git a/tools/testing/selftests/filesystems/mount-notify/Makefile b/tools/testing/selftests/filesystems/mount-notify/Makefile new file mode 100644 index 000000000000..10be0227b5ae --- /dev/null +++ b/tools/testing/selftests/filesystems/mount-notify/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) +TEST_GEN_PROGS := mount-notify_test + +include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c new file mode 100644 index 000000000000..4a2d5c454fd1 --- /dev/null +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -0,0 +1,516 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2025 Miklos Szeredi <miklos@szeredi.hu> + +#define _GNU_SOURCE +#include <fcntl.h> +#include <sched.h> +#include <stdio.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <linux/fanotify.h> +#include <unistd.h> +#include <sys/fanotify.h> +#include <sys/syscall.h> + +#include "../../kselftest_harness.h" +#include "../statmount/statmount.h" + +#ifndef FAN_MNT_ATTACH +struct fanotify_event_info_mnt { + struct fanotify_event_info_header hdr; + __u64 mnt_id; +}; +#define FAN_MNT_ATTACH 0x01000000 /* Mount was attached */ +#endif + +#ifndef FAN_MNT_DETACH +#define FAN_MNT_DETACH 0x02000000 /* Mount was detached */ +#endif + +#ifndef FAN_REPORT_MNT +#define FAN_REPORT_MNT 0x00004000 /* Report mount events */ +#endif + +#ifndef FAN_MARK_MNTNS +#define FAN_MARK_MNTNS 0x00000110 +#endif + +static uint64_t get_mnt_id(struct __test_metadata *const _metadata, + const char *path) +{ + struct statx sx; + + ASSERT_EQ(statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx), 0); + ASSERT_TRUE(!!(sx.stx_mask & STATX_MNT_ID_UNIQUE)); + return sx.stx_mnt_id; +} + +static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; + +FIXTURE(fanotify) { + int fan_fd; + char buf[256]; + unsigned int rem; + void *next; + char root_mntpoint[sizeof(root_mntpoint_templ)]; + int orig_root; + int ns_fd; + uint64_t root_id; +}; + +FIXTURE_SETUP(fanotify) +{ + int ret; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + self->ns_fd = open("/proc/self/ns/mnt", O_RDONLY); + ASSERT_GE(self->ns_fd, 0); + + ASSERT_EQ(mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL), 0); + + strcpy(self->root_mntpoint, root_mntpoint_templ); + ASSERT_NE(mkdtemp(self->root_mntpoint), NULL); + + self->orig_root = open("/", O_PATH | O_CLOEXEC); + ASSERT_GE(self->orig_root, 0); + + ASSERT_EQ(mount("tmpfs", self->root_mntpoint, "tmpfs", 0, NULL), 0); + + ASSERT_EQ(chroot(self->root_mntpoint), 0); + + ASSERT_EQ(chdir("/"), 0); + + ASSERT_EQ(mkdir("a", 0700), 0); + + ASSERT_EQ(mkdir("b", 0700), 0); + + self->root_id = get_mnt_id(_metadata, "/"); + ASSERT_NE(self->root_id, 0); + + self->fan_fd = fanotify_init(FAN_REPORT_MNT, 0); + ASSERT_GE(self->fan_fd, 0); + + ret = fanotify_mark(self->fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS, + FAN_MNT_ATTACH | FAN_MNT_DETACH, self->ns_fd, NULL); + ASSERT_EQ(ret, 0); + + self->rem = 0; +} + +FIXTURE_TEARDOWN(fanotify) +{ + ASSERT_EQ(self->rem, 0); + close(self->fan_fd); + + ASSERT_EQ(fchdir(self->orig_root), 0); + + ASSERT_EQ(chroot("."), 0); + + EXPECT_EQ(umount2(self->root_mntpoint, MNT_DETACH), 0); + EXPECT_EQ(chdir(self->root_mntpoint), 0); + EXPECT_EQ(chdir("/"), 0); + EXPECT_EQ(rmdir(self->root_mntpoint), 0); +} + +static uint64_t expect_notify(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + uint64_t *mask) +{ + struct fanotify_event_metadata *meta; + struct fanotify_event_info_mnt *mnt; + unsigned int thislen; + + if (!self->rem) { + ssize_t len = read(self->fan_fd, self->buf, sizeof(self->buf)); + ASSERT_GT(len, 0); + + self->rem = len; + self->next = (void *) self->buf; + } + + meta = self->next; + ASSERT_TRUE(FAN_EVENT_OK(meta, self->rem)); + + thislen = meta->event_len; + self->rem -= thislen; + self->next += thislen; + + *mask = meta->mask; + thislen -= sizeof(*meta); + + mnt = ((void *) meta) + meta->event_len - thislen; + + ASSERT_EQ(thislen, sizeof(*mnt)); + + return mnt->mnt_id; +} + +static void expect_notify_n(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + unsigned int n, uint64_t mask[], uint64_t mnts[]) +{ + unsigned int i; + + for (i = 0; i < n; i++) + mnts[i] = expect_notify(_metadata, self, &mask[i]); +} + +static uint64_t expect_notify_mask(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + uint64_t expect_mask) +{ + uint64_t mntid, mask; + + mntid = expect_notify(_metadata, self, &mask); + ASSERT_EQ(expect_mask, mask); + + return mntid; +} + + +static void expect_notify_mask_n(struct __test_metadata *const _metadata, + FIXTURE_DATA(fanotify) *self, + uint64_t mask, unsigned int n, uint64_t mnts[]) +{ + unsigned int i; + + for (i = 0; i < n; i++) + mnts[i] = expect_notify_mask(_metadata, self, mask); +} + +static void verify_mount_ids(struct __test_metadata *const _metadata, + const uint64_t list1[], const uint64_t list2[], + size_t num) +{ + unsigned int i, j; + + // Check that neither list has any duplicates + for (i = 0; i < num; i++) { + for (j = 0; j < num; j++) { + if (i != j) { + ASSERT_NE(list1[i], list1[j]); + ASSERT_NE(list2[i], list2[j]); + } + } + } + // Check that all list1 memebers can be found in list2. Together with + // the above it means that the list1 and list2 represent the same sets. + for (i = 0; i < num; i++) { + for (j = 0; j < num; j++) { + if (list1[i] == list2[j]) + break; + } + ASSERT_NE(j, num); + } +} + +static void check_mounted(struct __test_metadata *const _metadata, + const uint64_t mnts[], size_t num) +{ + ssize_t ret; + uint64_t *list; + + list = malloc((num + 1) * sizeof(list[0])); + ASSERT_NE(list, NULL); + + ret = listmount(LSMT_ROOT, 0, 0, list, num + 1, 0); + ASSERT_EQ(ret, num); + + verify_mount_ids(_metadata, mnts, list, num); + + free(list); +} + +static void setup_mount_tree(struct __test_metadata *const _metadata, + int log2_num) +{ + int ret, i; + + ret = mount("", "/", NULL, MS_SHARED, NULL); + ASSERT_EQ(ret, 0); + + for (i = 0; i < log2_num; i++) { + ret = mount("/", "/", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + } +} + +TEST_F(fanotify, bind) +{ + int ret; + uint64_t mnts[2] = { self->root_id }; + + ret = mount("/", "/", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + ASSERT_NE(mnts[0], mnts[1]); + + check_mounted(_metadata, mnts, 2); + + // Cleanup + uint64_t detach_id; + ret = umount("/"); + ASSERT_EQ(ret, 0); + + detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH); + ASSERT_EQ(detach_id, mnts[1]); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, move) +{ + int ret; + uint64_t mnts[2] = { self->root_id }; + uint64_t move_id; + + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + ASSERT_NE(mnts[0], mnts[1]); + + check_mounted(_metadata, mnts, 2); + + ret = move_mount(AT_FDCWD, "/a", AT_FDCWD, "/b", 0); + ASSERT_EQ(ret, 0); + + move_id = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH); + ASSERT_EQ(move_id, mnts[1]); + + // Cleanup + ret = umount("/b"); + ASSERT_EQ(ret, 0); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, propagate) +{ + const unsigned int log2_num = 4; + const unsigned int num = (1 << log2_num); + uint64_t mnts[num]; + + setup_mount_tree(_metadata, log2_num); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, num - 1, mnts + 1); + + mnts[0] = self->root_id; + check_mounted(_metadata, mnts, num); + + // Cleanup + int ret; + uint64_t mnts2[num]; + ret = umount2("/", MNT_DETACH); + ASSERT_EQ(ret, 0); + + ret = mount("", "/", NULL, MS_PRIVATE, NULL); + ASSERT_EQ(ret, 0); + + mnts2[0] = self->root_id; + expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, num - 1, mnts2 + 1); + verify_mount_ids(_metadata, mnts, mnts2, num); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, fsmount) +{ + int ret, fs, mnt; + uint64_t mnts[2] = { self->root_id }; + + fs = fsopen("tmpfs", 0); + ASSERT_GE(fs, 0); + + ret = fsconfig(fs, FSCONFIG_CMD_CREATE, 0, 0, 0); + ASSERT_EQ(ret, 0); + + mnt = fsmount(fs, 0, 0); + ASSERT_GE(mnt, 0); + + close(fs); + + ret = move_mount(mnt, "", AT_FDCWD, "/a", MOVE_MOUNT_F_EMPTY_PATH); + ASSERT_EQ(ret, 0); + + close(mnt); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + ASSERT_NE(mnts[0], mnts[1]); + + check_mounted(_metadata, mnts, 2); + + // Cleanup + uint64_t detach_id; + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + detach_id = expect_notify_mask(_metadata, self, FAN_MNT_DETACH); + ASSERT_EQ(detach_id, mnts[1]); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, reparent) +{ + uint64_t mnts[6] = { self->root_id }; + uint64_t dmnts[3]; + uint64_t masks[3]; + unsigned int i; + int ret; + + // Create setup with a[1] -> b[2] propagation + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("", "/a", NULL, MS_SHARED, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("/a", "/b", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("", "/b", NULL, MS_SLAVE, NULL); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1); + + check_mounted(_metadata, mnts, 3); + + // Mount on a[3], which is propagated to b[4] + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 3); + + check_mounted(_metadata, mnts, 5); + + // Mount on b[5], not propagated + ret = mount("/", "/b", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[5] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + + check_mounted(_metadata, mnts, 6); + + // Umount a[3], which is propagated to b[4], but not b[5] + // This will result in b[5] "falling" on b[2] + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + expect_notify_n(_metadata, self, 3, masks, dmnts); + verify_mount_ids(_metadata, mnts + 3, dmnts, 3); + + for (i = 0; i < 3; i++) { + if (dmnts[i] == mnts[5]) { + ASSERT_EQ(masks[i], FAN_MNT_ATTACH | FAN_MNT_DETACH); + } else { + ASSERT_EQ(masks[i], FAN_MNT_DETACH); + } + } + + mnts[3] = mnts[5]; + check_mounted(_metadata, mnts, 4); + + // Cleanup + ret = umount("/b"); + ASSERT_EQ(ret, 0); + + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + ret = umount("/b"); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 3, dmnts); + verify_mount_ids(_metadata, mnts + 1, dmnts, 3); + + check_mounted(_metadata, mnts, 1); +} + +TEST_F(fanotify, rmdir) +{ + uint64_t mnts[3] = { self->root_id }; + int ret; + + ret = mount("/", "/a", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + ret = mount("/", "/a/b", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH, 2, mnts + 1); + + check_mounted(_metadata, mnts, 3); + + ret = chdir("/a"); + ASSERT_EQ(ret, 0); + + ret = fork(); + ASSERT_GE(ret, 0); + + if (ret == 0) { + chdir("/"); + unshare(CLONE_NEWNS); + mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); + umount2("/a", MNT_DETACH); + // This triggers a detach in the other namespace + rmdir("/a"); + exit(0); + } + wait(NULL); + + expect_notify_mask_n(_metadata, self, FAN_MNT_DETACH, 2, mnts + 1); + check_mounted(_metadata, mnts, 1); + + // Cleanup + ret = chdir("/"); + ASSERT_EQ(ret, 0); +} + +TEST_F(fanotify, pivot_root) +{ + uint64_t mnts[3] = { self->root_id }; + uint64_t mnts2[3]; + int ret; + + ret = mount("tmpfs", "/a", "tmpfs", 0, NULL); + ASSERT_EQ(ret, 0); + + mnts[2] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + + ret = mkdir("/a/new", 0700); + ASSERT_EQ(ret, 0); + + ret = mkdir("/a/old", 0700); + ASSERT_EQ(ret, 0); + + ret = mount("/a", "/a/new", NULL, MS_BIND, NULL); + ASSERT_EQ(ret, 0); + + mnts[1] = expect_notify_mask(_metadata, self, FAN_MNT_ATTACH); + check_mounted(_metadata, mnts, 3); + + ret = syscall(SYS_pivot_root, "/a/new", "/a/new/old"); + ASSERT_EQ(ret, 0); + + expect_notify_mask_n(_metadata, self, FAN_MNT_ATTACH | FAN_MNT_DETACH, 2, mnts2); + verify_mount_ids(_metadata, mnts, mnts2, 2); + check_mounted(_metadata, mnts, 3); + + // Cleanup + ret = syscall(SYS_pivot_root, "/old", "/old/a/new"); + ASSERT_EQ(ret, 0); + + ret = umount("/a/new"); + ASSERT_EQ(ret, 0); + + ret = umount("/a"); + ASSERT_EQ(ret, 0); + + check_mounted(_metadata, mnts, 1); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c index 457cf76f3c5f..a3d8015897e9 100644 --- a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -3,6 +3,8 @@ #define _GNU_SOURCE #include <fcntl.h> +#include <linux/auto_dev-ioctl.h> +#include <linux/errno.h> #include <sched.h> #include <stdio.h> #include <string.h> @@ -146,4 +148,16 @@ TEST_F(iterate_mount_namespaces, iterate_backward) } } +TEST_F(iterate_mount_namespaces, nfs_valid_ioctl) +{ + ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_OPENMOUNT, NULL), 0); + ASSERT_EQ(errno, ENOTTY); + + ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_CLOSEMOUNT, NULL), 0); + ASSERT_EQ(errno, ENOTTY); + + ASSERT_NE(ioctl(self->fd_mnt_ns[0], AUTOFS_DEV_IOCTL_READY, NULL), 0); + ASSERT_EQ(errno, ENOTTY); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/overlayfs/Makefile b/tools/testing/selftests/filesystems/overlayfs/Makefile index e8d1adb021af..6c661232b3b5 100644 --- a/tools/testing/selftests/filesystems/overlayfs/Makefile +++ b/tools/testing/selftests/filesystems/overlayfs/Makefile @@ -1,7 +1,14 @@ # SPDX-License-Identifier: GPL-2.0 -TEST_GEN_PROGS := dev_in_maps set_layers_via_fds +CFLAGS += -Wall +CFLAGS += $(KHDR_INCLUDES) +LDLIBS += -lcap -CFLAGS := -Wall -Werror +LOCAL_HDRS += wrappers.h log.h + +TEST_GEN_PROGS := dev_in_maps +TEST_GEN_PROGS += set_layers_via_fds include ../../lib.mk + +$(OUTPUT)/set_layers_via_fds: ../utils.c diff --git a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c index 1d0ae785a667..5074e64e74a8 100644 --- a/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c +++ b/tools/testing/selftests/filesystems/overlayfs/set_layers_via_fds.c @@ -6,26 +6,40 @@ #include <sched.h> #include <stdio.h> #include <string.h> +#include <sys/socket.h> #include <sys/stat.h> +#include <sys/sysmacros.h> #include <sys/mount.h> #include <unistd.h> #include "../../kselftest_harness.h" +#include "../../pidfd/pidfd.h" #include "log.h" +#include "../utils.h" #include "wrappers.h" FIXTURE(set_layers_via_fds) { + int pidfd; }; FIXTURE_SETUP(set_layers_via_fds) { - ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0); + self->pidfd = -EBADF; + EXPECT_EQ(mkdir("/set_layers_via_fds", 0755), 0); + EXPECT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0); } FIXTURE_TEARDOWN(set_layers_via_fds) { + if (self->pidfd >= 0) { + EXPECT_EQ(sys_pidfd_send_signal(self->pidfd, SIGKILL, NULL, 0), 0); + EXPECT_EQ(close(self->pidfd), 0); + } umount2("/set_layers_via_fds", 0); - ASSERT_EQ(rmdir("/set_layers_via_fds"), 0); + EXPECT_EQ(rmdir("/set_layers_via_fds"), 0); + + umount2("/set_layers_via_fds_tmpfs", 0); + EXPECT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0); } TEST_F(set_layers_via_fds, set_layers_via_fds) @@ -214,4 +228,493 @@ TEST_F(set_layers_via_fds, set_500_layers_via_fds) ASSERT_EQ(close(fd_overlay), 0); } +TEST_F(set_layers_via_fds, set_override_creds) +{ + int fd_context, fd_tmpfs, fd_overlay; + int layer_fds[] = { [0 ... 3] = -EBADF }; + pid_t pid; + int pidfd; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + + layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY); + ASSERT_GE(layer_fds[3], 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have succeeded"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have succeeded"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have succeeded"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + ASSERT_GE(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + +TEST_F(set_layers_via_fds, set_override_creds_invalid) +{ + int fd_context, fd_tmpfs, fd_overlay, ret; + int layer_fds[] = { [0 ... 3] = -EBADF }; + pid_t pid; + int fd_userns1, fd_userns2; + int ipc_sockets[2]; + char c; + const unsigned int predictable_fd_context_nr = 123; + + fd_userns1 = get_userns_fd(0, 0, 10000); + ASSERT_GE(fd_userns1, 0); + + fd_userns2 = get_userns_fd(0, 1234, 10000); + ASSERT_GE(fd_userns2, 0); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + ASSERT_GE(ret, 0); + + pid = create_child(&self->pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (close(ipc_sockets[0])) { + TH_LOG("close should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (!switch_userns(fd_userns2, 0, 0, false)) { + TH_LOG("switch_userns should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (read_nointr(ipc_sockets[1], &c, 1) != 1) { + TH_LOG("read_nointr should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (close(ipc_sockets[1])) { + TH_LOG("close should have succeeded"); + _exit(EXIT_FAILURE); + } + + if (!sys_fsconfig(predictable_fd_context_nr, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) { + TH_LOG("sys_fsconfig should have failed"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + ASSERT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(switch_userns(fd_userns1, 0, 0, false), true); + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + + layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY); + ASSERT_GE(layer_fds[3], 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + ASSERT_EQ(dup3(fd_context, predictable_fd_context_nr, 0), predictable_fd_context_nr); + ASSERT_EQ(close(fd_context), 0); + fd_context = predictable_fd_context_nr; + ASSERT_EQ(write_nointr(ipc_sockets[0], "1", 1), 1); + ASSERT_EQ(close(ipc_sockets[0]), 0); + + ASSERT_EQ(wait_for_pid(pid), 0); + ASSERT_EQ(close(self->pidfd), 0); + self->pidfd = -EBADF; + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) + ASSERT_EQ(close(layer_fds[i]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); + ASSERT_EQ(close(fd_userns1), 0); + ASSERT_EQ(close(fd_userns2), 0); +} + +TEST_F(set_layers_via_fds, set_override_creds_nomknod) +{ + int fd_context, fd_tmpfs, fd_overlay; + int layer_fds[] = { [0 ... 3] = -EBADF }; + pid_t pid; + int pidfd; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + + layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY); + ASSERT_GE(layer_fds[3], 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0); + + pid = create_child(&pidfd, 0); + ASSERT_GE(pid, 0); + if (pid == 0) { + if (!cap_down(CAP_MKNOD)) + _exit(EXIT_FAILURE); + + if (!cap_down(CAP_SYS_ADMIN)) + _exit(EXIT_FAILURE); + + if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); + } + ASSERT_EQ(sys_waitid(P_PID, pid, NULL, WEXITED), 0); + ASSERT_GE(close(pidfd), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(mknodat(fd_overlay, "dev-zero", S_IFCHR | 0644, makedev(1, 5)), -1); + ASSERT_EQ(errno, EPERM); + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + +TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds) +{ + int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower; + int layer_fds[500] = { [0 ... 499] = -EBADF }; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + char path[100]; + + sprintf(path, "l%d", i); + ASSERT_EQ(mkdirat(fd_tmpfs, path, 0755), 0); + layer_fds[i] = openat(fd_tmpfs, path, O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[i], 0); + } + + ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0); + fd_work = openat(fd_tmpfs, "w", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_work, 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + fd_upper = openat(fd_tmpfs, "u", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_upper, 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "l501", 0755), 0); + fd_lower = openat(fd_tmpfs, "l501", O_DIRECTORY | O_PATH); + ASSERT_GE(fd_lower, 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, fd_work), 0); + ASSERT_EQ(close(fd_work), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, fd_upper), 0); + ASSERT_EQ(close(fd_upper), 0); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[i]), 0); + ASSERT_EQ(close(layer_fds[i]), 0); + } + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, fd_lower), 0); + ASSERT_EQ(close(fd_lower), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); +} + +TEST_F(set_layers_via_fds, set_layers_via_detached_mount_fds) +{ + int fd_context, fd_tmpfs, fd_overlay, fd_tmp; + int layer_fds[] = { [0 ... 8] = -EBADF }; + bool layers_found[] = { [0 ... 8] = false }; + size_t len = 0; + char *line = NULL; + FILE *f_mountinfo; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0); + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + ASSERT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u/upper", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "u/work", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l3", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "l4", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d1", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d2", 0755), 0); + ASSERT_EQ(mkdirat(fd_tmpfs, "d3", 0755), 0); + + ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/set_layers_via_fds_tmpfs", MOVE_MOUNT_F_EMPTY_PATH), 0); + + fd_tmp = open_tree(fd_tmpfs, "u", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tmp, 0); + + layer_fds[0] = openat(fd_tmp, "upper", O_CLOEXEC | O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[0], 0); + + layer_fds[1] = openat(fd_tmp, "work", O_CLOEXEC | O_DIRECTORY | O_PATH); + ASSERT_GE(layer_fds[1], 0); + + layer_fds[2] = open_tree(fd_tmpfs, "l1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[2], 0); + + layer_fds[3] = open_tree(fd_tmpfs, "l2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[3], 0); + + layer_fds[4] = open_tree(fd_tmpfs, "l3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[4], 0); + + layer_fds[5] = open_tree(fd_tmpfs, "l4", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[5], 0); + + layer_fds[6] = open_tree(fd_tmpfs, "d1", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[6], 0); + + layer_fds[7] = open_tree(fd_tmpfs, "d2", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[7], 0); + + layer_fds[8] = open_tree(fd_tmpfs, "d3", OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC); + ASSERT_GE(layer_fds[8], 0); + + ASSERT_EQ(close(fd_tmpfs), 0); + + fd_context = sys_fsopen("overlay", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir", NULL, layer_fds[0]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir", NULL, layer_fds[1]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[4]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[5]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[6]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[7]), 0); + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "datadir+", NULL, layer_fds[8]), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_overlay = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_overlay, 0); + + ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0); + + f_mountinfo = fopen("/proc/self/mountinfo", "r"); + ASSERT_NE(f_mountinfo, NULL); + + while (getline(&line, &len, f_mountinfo) != -1) { + char *haystack = line; + + if (strstr(haystack, "workdir=/tmp/w")) + layers_found[0] = true; + if (strstr(haystack, "upperdir=/tmp/u")) + layers_found[1] = true; + if (strstr(haystack, "lowerdir+=/tmp/l1")) + layers_found[2] = true; + if (strstr(haystack, "lowerdir+=/tmp/l2")) + layers_found[3] = true; + if (strstr(haystack, "lowerdir+=/tmp/l3")) + layers_found[4] = true; + if (strstr(haystack, "lowerdir+=/tmp/l4")) + layers_found[5] = true; + if (strstr(haystack, "datadir+=/tmp/d1")) + layers_found[6] = true; + if (strstr(haystack, "datadir+=/tmp/d2")) + layers_found[7] = true; + if (strstr(haystack, "datadir+=/tmp/d3")) + layers_found[8] = true; + } + free(line); + + for (int i = 0; i < ARRAY_SIZE(layer_fds); i++) { + ASSERT_EQ(layers_found[i], true); + ASSERT_EQ(close(layer_fds[i]), 0); + } + + ASSERT_EQ(close(fd_context), 0); + ASSERT_EQ(close(fd_overlay), 0); + ASSERT_EQ(fclose(f_mountinfo), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/overlayfs/wrappers.h b/tools/testing/selftests/filesystems/overlayfs/wrappers.h index 071b95fd2ac0..c38bc48e0cfa 100644 --- a/tools/testing/selftests/filesystems/overlayfs/wrappers.h +++ b/tools/testing/selftests/filesystems/overlayfs/wrappers.h @@ -44,4 +44,21 @@ static inline int sys_move_mount(int from_dfd, const char *from_pathname, to_pathname, flags); } +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef AT_RECURSIVE +#define AT_RECURSIVE 0x8000 +#endif + +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + #endif diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index f4294bab9d73..a7a5289ddae9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -25,7 +25,7 @@ static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, return syscall(__NR_statmount, &req, buf, bufsize, flags); } -static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, +static inline ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t last_mnt_id, uint64_t list[], size_t num, unsigned int flags) { diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 46d289611ce8..f048042e53e9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -26,13 +26,12 @@ static const char *const known_fs[] = { "hfsplus", "hostfs", "hpfs", "hugetlbfs", "ibmasmfs", "iomem", "ipathfs", "iso9660", "jffs2", "jfs", "minix", "mqueue", "msdos", "nfs", "nfs4", "nfsd", "nilfs2", "nsfs", "ntfs", "ntfs3", "ocfs2", - "ocfs2_dlmfs", "ocxlflash", "omfs", "openpromfs", "overlay", "pipefs", - "proc", "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", - "resctrl", "romfs", "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem", - "securityfs", "selinuxfs", "smackfs", "smb3", "sockfs", "spufs", - "squashfs", "sysfs", "sysv", "tmpfs", "tracefs", "ubifs", "udf", - "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", - "zonefs", NULL }; + "ocfs2_dlmfs", "omfs", "openpromfs", "overlay", "pipefs", "proc", + "pstore", "pvfs2", "qnx4", "qnx6", "ramfs", "resctrl", "romfs", + "rootfs", "rpc_pipefs", "s390_hypfs", "secretmem", "securityfs", + "selinuxfs", "smackfs", "smb3", "sockfs", "spufs", "squashfs", "sysfs", + "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", + "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) { diff --git a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c new file mode 100644 index 000000000000..e553c89c5b19 --- /dev/null +++ b/tools/testing/selftests/filesystems/utils.c @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <fcntl.h> +#include <sys/types.h> +#include <dirent.h> +#include <grp.h> +#include <linux/limits.h> +#include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/eventfd.h> +#include <sys/fsuid.h> +#include <sys/prctl.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/xattr.h> + +#include "utils.h" + +#define MAX_USERNS_LEVEL 32 + +#define syserror(format, ...) \ + ({ \ + fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \ + (-errno); \ + }) + +#define syserror_set(__ret__, format, ...) \ + ({ \ + typeof(__ret__) __internal_ret__ = (__ret__); \ + errno = labs(__ret__); \ + fprintf(stderr, "%m - " format "\n", ##__VA_ARGS__); \ + __internal_ret__; \ + }) + +#define STRLITERALLEN(x) (sizeof(""x"") - 1) + +#define INTTYPE_TO_STRLEN(type) \ + (2 + (sizeof(type) <= 1 \ + ? 3 \ + : sizeof(type) <= 2 \ + ? 5 \ + : sizeof(type) <= 4 \ + ? 10 \ + : sizeof(type) <= 8 ? 20 : sizeof(int[-2 * (sizeof(type) > 8)]))) + +#define list_for_each(__iterator, __list) \ + for (__iterator = (__list)->next; __iterator != __list; __iterator = __iterator->next) + +typedef enum idmap_type_t { + ID_TYPE_UID, + ID_TYPE_GID +} idmap_type_t; + +struct id_map { + idmap_type_t map_type; + __u32 nsid; + __u32 hostid; + __u32 range; +}; + +struct list { + void *elem; + struct list *next; + struct list *prev; +}; + +struct userns_hierarchy { + int fd_userns; + int fd_event; + unsigned int level; + struct list id_map; +}; + +static inline void list_init(struct list *list) +{ + list->elem = NULL; + list->next = list->prev = list; +} + +static inline int list_empty(const struct list *list) +{ + return list == list->next; +} + +static inline void __list_add(struct list *new, struct list *prev, struct list *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +static inline void list_add_tail(struct list *head, struct list *list) +{ + __list_add(list, head->prev, head); +} + +static inline void list_del(struct list *list) +{ + struct list *next, *prev; + + next = list->next; + prev = list->prev; + next->prev = prev; + prev->next = next; +} + +static ssize_t read_nointr(int fd, void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = read(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +static ssize_t write_nointr(int fd, const void *buf, size_t count) +{ + ssize_t ret; + + do { + ret = write(fd, buf, count); + } while (ret < 0 && errno == EINTR); + + return ret; +} + +#define __STACK_SIZE (8 * 1024 * 1024) +static pid_t do_clone(int (*fn)(void *), void *arg, int flags) +{ + void *stack; + + stack = malloc(__STACK_SIZE); + if (!stack) + return -ENOMEM; + +#ifdef __ia64__ + return __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#else + return clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL); +#endif +} + +static int get_userns_fd_cb(void *data) +{ + for (;;) + pause(); + _exit(0); +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static int write_id_mapping(idmap_type_t map_type, pid_t pid, const char *buf, size_t buf_size) +{ + int fd = -EBADF, setgroups_fd = -EBADF; + int fret = -1; + int ret; + char path[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) + + STRLITERALLEN("/setgroups") + 1]; + + if (geteuid() != 0 && map_type == ID_TYPE_GID) { + ret = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid); + if (ret < 0 || ret >= sizeof(path)) + goto out; + + setgroups_fd = open(path, O_WRONLY | O_CLOEXEC); + if (setgroups_fd < 0 && errno != ENOENT) { + syserror("Failed to open \"%s\"", path); + goto out; + } + + if (setgroups_fd >= 0) { + ret = write_nointr(setgroups_fd, "deny\n", STRLITERALLEN("deny\n")); + if (ret != STRLITERALLEN("deny\n")) { + syserror("Failed to write \"deny\" to \"/proc/%d/setgroups\"", pid); + goto out; + } + } + } + + ret = snprintf(path, sizeof(path), "/proc/%d/%cid_map", pid, map_type == ID_TYPE_UID ? 'u' : 'g'); + if (ret < 0 || ret >= sizeof(path)) + goto out; + + fd = open(path, O_WRONLY | O_CLOEXEC); + if (fd < 0) { + syserror("Failed to open \"%s\"", path); + goto out; + } + + ret = write_nointr(fd, buf, buf_size); + if (ret != buf_size) { + syserror("Failed to write %cid mapping to \"%s\"", + map_type == ID_TYPE_UID ? 'u' : 'g', path); + goto out; + } + + fret = 0; +out: + close(fd); + close(setgroups_fd); + + return fret; +} + +static int map_ids_from_idmap(struct list *idmap, pid_t pid) +{ + int fill, left; + char mapbuf[4096] = {}; + bool had_entry = false; + idmap_type_t map_type, u_or_g; + + if (list_empty(idmap)) + return 0; + + for (map_type = ID_TYPE_UID, u_or_g = 'u'; + map_type <= ID_TYPE_GID; map_type++, u_or_g = 'g') { + char *pos = mapbuf; + int ret; + struct list *iterator; + + + list_for_each(iterator, idmap) { + struct id_map *map = iterator->elem; + if (map->map_type != map_type) + continue; + + had_entry = true; + + left = 4096 - (pos - mapbuf); + fill = snprintf(pos, left, "%u %u %u\n", map->nsid, map->hostid, map->range); + /* + * The kernel only takes <= 4k for writes to + * /proc/<pid>/{g,u}id_map + */ + if (fill <= 0 || fill >= left) + return syserror_set(-E2BIG, "Too many %cid mappings defined", u_or_g); + + pos += fill; + } + if (!had_entry) + continue; + + ret = write_id_mapping(map_type, pid, mapbuf, pos - mapbuf); + if (ret < 0) + return syserror("Failed to write mapping: %s", mapbuf); + + memset(mapbuf, 0, sizeof(mapbuf)); + } + + return 0; +} + +static int get_userns_fd_from_idmap(struct list *idmap) +{ + int ret; + pid_t pid; + char path_ns[STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(pid_t) + + STRLITERALLEN("/ns/user") + 1]; + + pid = do_clone(get_userns_fd_cb, NULL, CLONE_NEWUSER | CLONE_NEWNS); + if (pid < 0) + return -errno; + + ret = map_ids_from_idmap(idmap, pid); + if (ret < 0) + return ret; + + ret = snprintf(path_ns, sizeof(path_ns), "/proc/%d/ns/user", pid); + if (ret < 0 || (size_t)ret >= sizeof(path_ns)) + ret = -EIO; + else + ret = open(path_ns, O_RDONLY | O_CLOEXEC | O_NOCTTY); + + (void)kill(pid, SIGKILL); + (void)wait_for_pid(pid); + return ret; +} + +int get_userns_fd(unsigned long nsid, unsigned long hostid, unsigned long range) +{ + struct list head, uid_mapl, gid_mapl; + struct id_map uid_map = { + .map_type = ID_TYPE_UID, + .nsid = nsid, + .hostid = hostid, + .range = range, + }; + struct id_map gid_map = { + .map_type = ID_TYPE_GID, + .nsid = nsid, + .hostid = hostid, + .range = range, + }; + + list_init(&head); + uid_mapl.elem = &uid_map; + gid_mapl.elem = &gid_map; + list_add_tail(&head, &uid_mapl); + list_add_tail(&head, &gid_mapl); + + return get_userns_fd_from_idmap(&head); +} + +bool switch_ids(uid_t uid, gid_t gid) +{ + if (setgroups(0, NULL)) + return syserror("failure: setgroups"); + + if (setresgid(gid, gid, gid)) + return syserror("failure: setresgid"); + + if (setresuid(uid, uid, uid)) + return syserror("failure: setresuid"); + + /* Ensure we can access proc files from processes we can ptrace. */ + if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0)) + return syserror("failure: make dumpable"); + + return true; +} + +static int create_userns_hierarchy(struct userns_hierarchy *h); + +static int userns_fd_cb(void *data) +{ + struct userns_hierarchy *h = data; + char c; + int ret; + + ret = read_nointr(h->fd_event, &c, 1); + if (ret < 0) + return syserror("failure: read from socketpair"); + + /* Only switch ids if someone actually wrote a mapping for us. */ + if (c == '1') { + if (!switch_ids(0, 0)) + return syserror("failure: switch ids to 0"); + } + + ret = write_nointr(h->fd_event, "1", 1); + if (ret < 0) + return syserror("failure: write to socketpair"); + + ret = create_userns_hierarchy(++h); + if (ret < 0) + return syserror("failure: userns level %d", h->level); + + return 0; +} + +static int create_userns_hierarchy(struct userns_hierarchy *h) +{ + int fret = -1; + char c; + int fd_socket[2]; + int fd_userns = -EBADF, ret = -1; + ssize_t bytes; + pid_t pid; + char path[256]; + + if (h->level == MAX_USERNS_LEVEL) + return 0; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, fd_socket); + if (ret < 0) + return syserror("failure: create socketpair"); + + /* Note the CLONE_FILES | CLONE_VM when mucking with fds and memory. */ + h->fd_event = fd_socket[1]; + pid = do_clone(userns_fd_cb, h, CLONE_NEWUSER | CLONE_FILES | CLONE_VM); + if (pid < 0) { + syserror("failure: userns level %d", h->level); + goto out_close; + } + + ret = map_ids_from_idmap(&h->id_map, pid); + if (ret < 0) { + kill(pid, SIGKILL); + syserror("failure: writing id mapping for userns level %d for %d", h->level, pid); + goto out_wait; + } + + if (!list_empty(&h->id_map)) + bytes = write_nointr(fd_socket[0], "1", 1); /* Inform the child we wrote a mapping. */ + else + bytes = write_nointr(fd_socket[0], "0", 1); /* Inform the child we didn't write a mapping. */ + if (bytes < 0) { + kill(pid, SIGKILL); + syserror("failure: write to socketpair"); + goto out_wait; + } + + /* Wait for child to set*id() and become dumpable. */ + bytes = read_nointr(fd_socket[0], &c, 1); + if (bytes < 0) { + kill(pid, SIGKILL); + syserror("failure: read from socketpair"); + goto out_wait; + } + + snprintf(path, sizeof(path), "/proc/%d/ns/user", pid); + fd_userns = open(path, O_RDONLY | O_CLOEXEC); + if (fd_userns < 0) { + kill(pid, SIGKILL); + syserror("failure: open userns level %d for %d", h->level, pid); + goto out_wait; + } + + fret = 0; + +out_wait: + if (!wait_for_pid(pid) && !fret) { + h->fd_userns = fd_userns; + fd_userns = -EBADF; + } + +out_close: + if (fd_userns >= 0) + close(fd_userns); + close(fd_socket[0]); + close(fd_socket[1]); + return fret; +} + +/* caps_down - lower all effective caps */ +int caps_down(void) +{ + bool fret = false; + cap_t caps = NULL; + int ret = -1; + + caps = cap_get_proc(); + if (!caps) + goto out; + + ret = cap_clear_flag(caps, CAP_EFFECTIVE); + if (ret) + goto out; + + ret = cap_set_proc(caps); + if (ret) + goto out; + + fret = true; + +out: + cap_free(caps); + return fret; +} + +/* cap_down - lower an effective cap */ +int cap_down(cap_value_t down) +{ + bool fret = false; + cap_t caps = NULL; + cap_value_t cap = down; + int ret = -1; + + caps = cap_get_proc(); + if (!caps) + goto out; + + ret = cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap, 0); + if (ret) + goto out; + + ret = cap_set_proc(caps); + if (ret) + goto out; + + fret = true; + +out: + cap_free(caps); + return fret; +} diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h new file mode 100644 index 000000000000..7f1df2a3e94c --- /dev/null +++ b/tools/testing/selftests/filesystems/utils.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __IDMAP_UTILS_H +#define __IDMAP_UTILS_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <errno.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/capability.h> +#include <sys/fsuid.h> +#include <sys/types.h> +#include <unistd.h> + +extern int get_userns_fd(unsigned long nsid, unsigned long hostid, + unsigned long range); + +extern int caps_down(void); +extern int cap_down(cap_value_t down); + +extern bool switch_ids(uid_t uid, gid_t gid); + +static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) +{ + if (setns(fd, CLONE_NEWUSER)) + return false; + + if (!switch_ids(uid, gid)) + return false; + + if (drop_caps && !caps_down()) + return false; + + return true; +} + +#endif /* __IDMAP_UTILS_H */ diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index cdf91b0ca40f..c3b6d2604b1e 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -444,10 +444,6 @@ static inline __noreturn __printf(1, 2) void ksft_exit_skip(const char *msg, ... static inline int ksft_min_kernel_version(unsigned int min_major, unsigned int min_minor) { -#ifdef NOLIBC - ksft_print_msg("NOLIBC: Can't check kernel version: Function not implemented\n"); - return 0; -#else unsigned int major, minor; struct utsname info; @@ -455,7 +451,6 @@ static inline int ksft_min_kernel_version(unsigned int min_major, ksft_exit_fail_msg("Can't parse kernel version\n"); return major > min_major || (major == min_major && minor >= min_minor); -#endif } #endif /* __KSELFTEST_H */ diff --git a/tools/testing/selftests/kselftest/module.sh b/tools/testing/selftests/kselftest/module.sh index fb4733faff12..51fb65159932 100755 --- a/tools/testing/selftests/kselftest/module.sh +++ b/tools/testing/selftests/kselftest/module.sh @@ -11,7 +11,7 @@ # SPDX-License-Identifier: GPL-2.0+ # $(dirname $0)/../kselftest/module.sh "description" module_name # -# Example: tools/testing/selftests/lib/printf.sh +# Example: tools/testing/selftests/lib/bitmap.sh desc="" # Output prefix. module="" # Filename (without the .ko). diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 4277b983cace..f773f8f99249 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -69,6 +69,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush TEST_GEN_PROGS_x86 += x86/kvm_clock_test TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test +TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index 3c7defd34f56..447e619cf856 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -239,7 +239,7 @@ static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args) case ITERATION_MARK_IDLE: mark_vcpu_memory_idle(vm, vcpu_args); break; - }; + } vcpu_last_completed_iteration[vcpu_idx] = current_iteration; } diff --git a/tools/testing/selftests/kvm/arm64/get-reg-list.c b/tools/testing/selftests/kvm/arm64/get-reg-list.c index d43fb3f49050..d01798b6b3b4 100644 --- a/tools/testing/selftests/kvm/arm64/get-reg-list.c +++ b/tools/testing/selftests/kvm/arm64/get-reg-list.c @@ -332,6 +332,7 @@ static __u64 base_regs[] = { KVM_REG_ARM_FW_FEAT_BMAP_REG(0), /* KVM_REG_ARM_STD_BMAP */ KVM_REG_ARM_FW_FEAT_BMAP_REG(1), /* KVM_REG_ARM_STD_HYP_BMAP */ KVM_REG_ARM_FW_FEAT_BMAP_REG(2), /* KVM_REG_ARM_VENDOR_HYP_BMAP */ + KVM_REG_ARM_FW_FEAT_BMAP_REG(3), /* KVM_REG_ARM_VENDOR_HYP_BMAP_2 */ ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */ ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */ ARM64_SYS_REG(3, 3, 14, 0, 2), diff --git a/tools/testing/selftests/kvm/arm64/hypercalls.c b/tools/testing/selftests/kvm/arm64/hypercalls.c index ec54ec7726e9..44cfcf8a7f46 100644 --- a/tools/testing/selftests/kvm/arm64/hypercalls.c +++ b/tools/testing/selftests/kvm/arm64/hypercalls.c @@ -21,22 +21,31 @@ #define KVM_REG_ARM_STD_BMAP_BIT_MAX 0 #define KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX 0 #define KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX 1 +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_MAX 1 + +#define KVM_REG_ARM_STD_BMAP_RESET_VAL FW_REG_ULIMIT_VAL(KVM_REG_ARM_STD_BMAP_BIT_MAX) +#define KVM_REG_ARM_STD_HYP_BMAP_RESET_VAL FW_REG_ULIMIT_VAL(KVM_REG_ARM_STD_HYP_BMAP_BIT_MAX) +#define KVM_REG_ARM_VENDOR_HYP_BMAP_RESET_VAL FW_REG_ULIMIT_VAL(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_MAX) +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2_RESET_VAL 0 struct kvm_fw_reg_info { uint64_t reg; /* Register definition */ uint64_t max_feat_bit; /* Bit that represents the upper limit of the feature-map */ + uint64_t reset_val; /* Reset value for the register */ }; #define FW_REG_INFO(r) \ { \ .reg = r, \ .max_feat_bit = r##_BIT_MAX, \ + .reset_val = r##_RESET_VAL \ } static const struct kvm_fw_reg_info fw_reg_info[] = { FW_REG_INFO(KVM_REG_ARM_STD_BMAP), FW_REG_INFO(KVM_REG_ARM_STD_HYP_BMAP), FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP), + FW_REG_INFO(KVM_REG_ARM_VENDOR_HYP_BMAP_2), }; enum test_stage { @@ -171,22 +180,39 @@ static void test_fw_regs_before_vm_start(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(fw_reg_info); i++) { const struct kvm_fw_reg_info *reg_info = &fw_reg_info[i]; + uint64_t set_val; - /* First 'read' should be an upper limit of the features supported */ + /* First 'read' should be the reset value for the reg */ val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), - "Expected all the features to be set for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", - reg_info->reg, FW_REG_ULIMIT_VAL(reg_info->max_feat_bit), val); + TEST_ASSERT(val == reg_info->reset_val, + "Unexpected reset value for reg: 0x%lx; expected: 0x%lx; read: 0x%lx", + reg_info->reg, reg_info->reset_val, val); + + if (reg_info->reset_val) + set_val = 0; + else + set_val = FW_REG_ULIMIT_VAL(reg_info->max_feat_bit); - /* Test a 'write' by disabling all the features of the register map */ - ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); + ret = __vcpu_set_reg(vcpu, reg_info->reg, set_val); TEST_ASSERT(ret == 0, - "Failed to clear all the features of reg: 0x%lx; ret: %d", - reg_info->reg, errno); + "Failed to %s all the features of reg: 0x%lx; ret: %d", + (set_val ? "set" : "clear"), reg_info->reg, errno); val = vcpu_get_reg(vcpu, reg_info->reg); - TEST_ASSERT(val == 0, - "Expected all the features to be cleared for reg: 0x%lx", reg_info->reg); + TEST_ASSERT(val == set_val, + "Expected all the features to be %s for reg: 0x%lx", + (set_val ? "set" : "cleared"), reg_info->reg); + + /* + * If the reg has been set, clear it as test_fw_regs_after_vm_start() + * expects it to be cleared. + */ + if (set_val) { + ret = __vcpu_set_reg(vcpu, reg_info->reg, 0); + TEST_ASSERT(ret == 0, + "Failed to clear all the features of reg: 0x%lx; ret: %d", + reg_info->reg, errno); + } /* * Test enabling a feature that's not supported. diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c index 217541fe6536..322b9d3b0125 100644 --- a/tools/testing/selftests/kvm/arm64/set_id_regs.c +++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c @@ -146,6 +146,9 @@ static const struct reg_ftr_bits ftr_id_aa64pfr1_el1[] = { static const struct reg_ftr_bits ftr_id_aa64mmfr0_el1[] = { REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, ECV, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, EXS, 0), + REG_FTR_BITS(FTR_EXACT, ID_AA64MMFR0_EL1, TGRAN4_2, 1), + REG_FTR_BITS(FTR_EXACT, ID_AA64MMFR0_EL1, TGRAN64_2, 1), + REG_FTR_BITS(FTR_EXACT, ID_AA64MMFR0_EL1, TGRAN16_2, 1), S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN4, 0), S_REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN64, 0), REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64MMFR0_EL1, TGRAN16, 0), @@ -230,6 +233,9 @@ static void guest_code(void) GUEST_REG_SYNC(SYS_ID_AA64MMFR2_EL1); GUEST_REG_SYNC(SYS_ID_AA64ZFR0_EL1); GUEST_REG_SYNC(SYS_CTR_EL0); + GUEST_REG_SYNC(SYS_MIDR_EL1); + GUEST_REG_SYNC(SYS_REVIDR_EL1); + GUEST_REG_SYNC(SYS_AIDR_EL1); GUEST_DONE(); } @@ -609,18 +615,31 @@ static void test_ctr(struct kvm_vcpu *vcpu) test_reg_vals[encoding_to_range_idx(SYS_CTR_EL0)] = ctr; } -static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +static void test_id_reg(struct kvm_vcpu *vcpu, u32 id) { u64 val; + val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(id)); + val++; + vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(id), val); + test_reg_vals[encoding_to_range_idx(id)] = val; +} + +static void test_vcpu_ftr_id_regs(struct kvm_vcpu *vcpu) +{ test_clidr(vcpu); test_ctr(vcpu); - val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1)); - val++; - vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_MPIDR_EL1), val); + test_id_reg(vcpu, SYS_MPIDR_EL1); + ksft_test_result_pass("%s\n", __func__); +} + +static void test_vcpu_non_ftr_id_regs(struct kvm_vcpu *vcpu) +{ + test_id_reg(vcpu, SYS_MIDR_EL1); + test_id_reg(vcpu, SYS_REVIDR_EL1); + test_id_reg(vcpu, SYS_AIDR_EL1); - test_reg_vals[encoding_to_range_idx(SYS_MPIDR_EL1)] = val; ksft_test_result_pass("%s\n", __func__); } @@ -647,6 +666,9 @@ static void test_reset_preserves_id_regs(struct kvm_vcpu *vcpu) test_assert_id_reg_unchanged(vcpu, SYS_MPIDR_EL1); test_assert_id_reg_unchanged(vcpu, SYS_CLIDR_EL1); test_assert_id_reg_unchanged(vcpu, SYS_CTR_EL0); + test_assert_id_reg_unchanged(vcpu, SYS_MIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_REVIDR_EL1); + test_assert_id_reg_unchanged(vcpu, SYS_AIDR_EL1); ksft_test_result_pass("%s\n", __func__); } @@ -660,8 +682,11 @@ int main(void) int test_cnt; TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_WRITABLE_IMP_ID_REGS)); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm = vm_create(1); + vm_enable_cap(vm, KVM_CAP_ARM_WRITABLE_IMP_ID_REGS, 0); + vcpu = vm_vcpu_add(vm, 0, guest_code); /* Check for AARCH64 only system */ val = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1)); @@ -675,13 +700,14 @@ int main(void) ARRAY_SIZE(ftr_id_aa64isar2_el1) + ARRAY_SIZE(ftr_id_aa64pfr0_el1) + ARRAY_SIZE(ftr_id_aa64pfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr0_el1) + ARRAY_SIZE(ftr_id_aa64mmfr1_el1) + ARRAY_SIZE(ftr_id_aa64mmfr2_el1) + - ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 2 + + ARRAY_SIZE(ftr_id_aa64zfr0_el1) - ARRAY_SIZE(test_regs) + 3 + MPAM_IDREG_TEST; ksft_set_plan(test_cnt); test_vm_ftr_id_regs(vcpu, aarch64_only); test_vcpu_ftr_id_regs(vcpu); + test_vcpu_non_ftr_id_regs(vcpu); test_user_set_mpam_reg(vcpu); test_guest_reg_read(vcpu); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index aacf80f57439..23593d9eeba9 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -31,15 +31,18 @@ /* Default guest test virtual memory offset */ #define DEFAULT_GUEST_TEST_MEM 0xc0000000 -/* How many pages to dirty for each guest loop */ -#define TEST_PAGES_PER_LOOP 1024 - /* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */ #define TEST_HOST_LOOP_N 32UL /* Interval for each host loop (ms) */ #define TEST_HOST_LOOP_INTERVAL 10UL +/* + * Ensure the vCPU is able to perform a reasonable number of writes in each + * iteration to provide a lower bound on coverage. + */ +#define TEST_MIN_WRITES_PER_ITERATION 0x100 + /* Dirty bitmaps are always little endian, so we need to swap on big endian */ #if defined(__s390x__) # define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) @@ -75,6 +78,8 @@ static uint64_t host_page_size; static uint64_t guest_page_size; static uint64_t guest_num_pages; static uint64_t iteration; +static uint64_t nr_writes; +static bool vcpu_stop; /* * Guest physical memory offset of the testing memory slot. @@ -96,7 +101,9 @@ static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM; static void guest_code(void) { uint64_t addr; - int i; + +#ifdef __s390x__ + uint64_t i; /* * On s390x, all pages of a 1M segment are initially marked as dirty @@ -107,16 +114,19 @@ static void guest_code(void) for (i = 0; i < guest_num_pages; i++) { addr = guest_test_virt_mem + i * guest_page_size; vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); + nr_writes++; } +#endif while (true) { - for (i = 0; i < TEST_PAGES_PER_LOOP; i++) { + while (!READ_ONCE(vcpu_stop)) { addr = guest_test_virt_mem; addr += (guest_random_u64(&guest_rng) % guest_num_pages) * guest_page_size; addr = align_down(addr, host_page_size); vcpu_arch_put_guest(*(uint64_t *)addr, READ_ONCE(iteration)); + nr_writes++; } GUEST_SYNC(1); @@ -133,25 +143,18 @@ static uint64_t host_num_pages; /* For statistics only */ static uint64_t host_dirty_count; static uint64_t host_clear_count; -static uint64_t host_track_next_count; /* Whether dirty ring reset is requested, or finished */ static sem_t sem_vcpu_stop; static sem_t sem_vcpu_cont; -/* - * This is only set by main thread, and only cleared by vcpu thread. It is - * used to request vcpu thread to stop at the next GUEST_SYNC, since GUEST_SYNC - * is the only place that we'll guarantee both "dirty bit" and "dirty data" - * will match. E.g., SIG_IPI won't guarantee that if the vcpu is interrupted - * after setting dirty bit but before the data is written. - */ -static atomic_t vcpu_sync_stop_requested; + /* * This is updated by the vcpu thread to tell the host whether it's a * ring-full event. It should only be read until a sem_wait() of * sem_vcpu_stop and before vcpu continues to run. */ static bool dirty_ring_vcpu_ring_full; + /* * This is only used for verifying the dirty pages. Dirty ring has a very * tricky case when the ring just got full, kvm will do userspace exit due to @@ -166,7 +169,51 @@ static bool dirty_ring_vcpu_ring_full; * dirty gfn we've collected, so that if a mismatch of data found later in the * verifying process, we let it pass. */ -static uint64_t dirty_ring_last_page; +static uint64_t dirty_ring_last_page = -1ULL; + +/* + * In addition to the above, it is possible (especially if this + * test is run nested) for the above scenario to repeat multiple times: + * + * The following can happen: + * + * - L1 vCPU: Memory write is logged to PML but not committed. + * + * - L1 test thread: Ignores the write because its last dirty ring entry + * Resets the dirty ring which: + * - Resets the A/D bits in EPT + * - Issues tlb flush (invept), which is intercepted by L0 + * + * - L0: frees the whole nested ept mmu root as the response to invept, + * and thus ensures that when memory write is retried, it will fault again + * + * - L1 vCPU: Same memory write is logged to the PML but not committed again. + * + * - L1 test thread: Ignores the write because its last dirty ring entry (again) + * Resets the dirty ring which: + * - Resets the A/D bits in EPT (again) + * - Issues tlb flush (again) which is intercepted by L0 + * + * ... + * + * N times + * + * - L1 vCPU: Memory write is logged in the PML and then committed. + * Lots of other memory writes are logged and committed. + * ... + * + * - L1 test thread: Sees the memory write along with other memory writes + * in the dirty ring, and since the write is usually not + * the last entry in the dirty-ring and has a very outdated + * iteration, the test fails. + * + * + * Note that this is only possible when the write was the last log entry + * write during iteration N-1, thus remember last iteration last log entry + * and also don't fail when it is reported in the next iteration, together with + * an outdated iteration count. + */ +static uint64_t dirty_ring_prev_iteration_last_page; enum log_mode_t { /* Only use KVM_GET_DIRTY_LOG for logging */ @@ -191,24 +238,6 @@ static enum log_mode_t host_log_mode; static pthread_t vcpu_thread; static uint32_t test_dirty_ring_count = TEST_DIRTY_RING_COUNT; -static void vcpu_kick(void) -{ - pthread_kill(vcpu_thread, SIG_IPI); -} - -/* - * In our test we do signal tricks, let's use a better version of - * sem_wait to avoid signal interrupts - */ -static void sem_wait_until(sem_t *sem) -{ - int ret; - - do - ret = sem_wait(sem); - while (ret == -1 && errno == EINTR); -} - static bool clear_log_supported(void) { return kvm_has_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); @@ -243,21 +272,16 @@ static void clear_log_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, /* Should only be called after a GUEST_SYNC */ static void vcpu_handle_sync_stop(void) { - if (atomic_read(&vcpu_sync_stop_requested)) { - /* It means main thread is sleeping waiting */ - atomic_set(&vcpu_sync_stop_requested, false); + if (READ_ONCE(vcpu_stop)) { sem_post(&sem_vcpu_stop); - sem_wait_until(&sem_vcpu_cont); + sem_wait(&sem_vcpu_cont); } } -static void default_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) +static void default_after_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; - TEST_ASSERT(ret == 0 || (ret == -1 && err == EINTR), - "vcpu run failed: errno=%d", err); - TEST_ASSERT(get_ucall(vcpu, NULL) == UCALL_SYNC, "Invalid guest sync status: exit_reason=%s", exit_reason_str(run->exit_reason)); @@ -324,7 +348,6 @@ static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, "%u != %u", cur->slot, slot); TEST_ASSERT(cur->offset < num_pages, "Offset overflow: " "0x%llx >= 0x%x", cur->offset, num_pages); - //pr_info("fetch 0x%x page %llu\n", *fetch_index, cur->offset); __set_bit_le(cur->offset, bitmap); dirty_ring_last_page = cur->offset; dirty_gfn_set_collected(cur); @@ -335,36 +358,11 @@ static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns, return count; } -static void dirty_ring_wait_vcpu(void) -{ - /* This makes sure that hardware PML cache flushed */ - vcpu_kick(); - sem_wait_until(&sem_vcpu_stop); -} - -static void dirty_ring_continue_vcpu(void) -{ - pr_info("Notifying vcpu to continue\n"); - sem_post(&sem_vcpu_cont); -} - static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, void *bitmap, uint32_t num_pages, uint32_t *ring_buf_idx) { - uint32_t count = 0, cleared; - bool continued_vcpu = false; - - dirty_ring_wait_vcpu(); - - if (!dirty_ring_vcpu_ring_full) { - /* - * This is not a ring-full event, it's safe to allow - * vcpu to continue - */ - dirty_ring_continue_vcpu(); - continued_vcpu = true; - } + uint32_t count, cleared; /* Only have one vcpu */ count = dirty_ring_collect_one(vcpu_map_dirty_ring(vcpu), @@ -379,35 +377,18 @@ static void dirty_ring_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, */ TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch " "with collected (%u)", cleared, count); - - if (!continued_vcpu) { - TEST_ASSERT(dirty_ring_vcpu_ring_full, - "Didn't continue vcpu even without ring full"); - dirty_ring_continue_vcpu(); - } - - pr_info("Iteration %ld collected %u pages\n", iteration, count); } -static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) +static void dirty_ring_after_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; /* A ucall-sync or ring-full event is allowed */ if (get_ucall(vcpu, NULL) == UCALL_SYNC) { - /* We should allow this to continue */ - ; - } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL || - (ret == -1 && err == EINTR)) { - /* Update the flag first before pause */ - WRITE_ONCE(dirty_ring_vcpu_ring_full, - run->exit_reason == KVM_EXIT_DIRTY_RING_FULL); - sem_post(&sem_vcpu_stop); - pr_info("vcpu stops because %s...\n", - dirty_ring_vcpu_ring_full ? - "dirty ring is full" : "vcpu is kicked out"); - sem_wait_until(&sem_vcpu_cont); - pr_info("vcpu continues now.\n"); + vcpu_handle_sync_stop(); + } else if (run->exit_reason == KVM_EXIT_DIRTY_RING_FULL) { + WRITE_ONCE(dirty_ring_vcpu_ring_full, true); + vcpu_handle_sync_stop(); } else { TEST_ASSERT(false, "Invalid guest sync status: " "exit_reason=%s", @@ -426,7 +407,7 @@ struct log_mode { void *bitmap, uint32_t num_pages, uint32_t *ring_buf_idx); /* Hook to call when after each vcpu run */ - void (*after_vcpu_run)(struct kvm_vcpu *vcpu, int ret, int err); + void (*after_vcpu_run)(struct kvm_vcpu *vcpu); } log_modes[LOG_MODE_NUM] = { { .name = "dirty-log", @@ -449,15 +430,6 @@ struct log_mode { }, }; -/* - * We use this bitmap to track some pages that should have its dirty - * bit set in the _next_ iteration. For example, if we detected the - * page value changed to current iteration but at the same time the - * page bit is cleared in the latest bitmap, then the system must - * report that write in the next get dirty log call. - */ -static unsigned long *host_bmap_track; - static void log_modes_dump(void) { int i; @@ -497,170 +469,109 @@ static void log_mode_collect_dirty_pages(struct kvm_vcpu *vcpu, int slot, mode->collect_dirty_pages(vcpu, slot, bitmap, num_pages, ring_buf_idx); } -static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu, int ret, int err) +static void log_mode_after_vcpu_run(struct kvm_vcpu *vcpu) { struct log_mode *mode = &log_modes[host_log_mode]; if (mode->after_vcpu_run) - mode->after_vcpu_run(vcpu, ret, err); + mode->after_vcpu_run(vcpu); } static void *vcpu_worker(void *data) { - int ret; struct kvm_vcpu *vcpu = data; - uint64_t pages_count = 0; - struct kvm_signal_mask *sigmask = alloca(offsetof(struct kvm_signal_mask, sigset) - + sizeof(sigset_t)); - sigset_t *sigset = (sigset_t *) &sigmask->sigset; - /* - * SIG_IPI is unblocked atomically while in KVM_RUN. It causes the - * ioctl to return with -EINTR, but it is still pending and we need - * to accept it with the sigwait. - */ - sigmask->len = 8; - pthread_sigmask(0, NULL, sigset); - sigdelset(sigset, SIG_IPI); - vcpu_ioctl(vcpu, KVM_SET_SIGNAL_MASK, sigmask); - - sigemptyset(sigset); - sigaddset(sigset, SIG_IPI); + sem_wait(&sem_vcpu_cont); while (!READ_ONCE(host_quit)) { - /* Clear any existing kick signals */ - pages_count += TEST_PAGES_PER_LOOP; /* Let the guest dirty the random pages */ - ret = __vcpu_run(vcpu); - if (ret == -1 && errno == EINTR) { - int sig = -1; - sigwait(sigset, &sig); - assert(sig == SIG_IPI); - } - log_mode_after_vcpu_run(vcpu, ret, errno); + vcpu_run(vcpu); + log_mode_after_vcpu_run(vcpu); } - pr_info("Dirtied %"PRIu64" pages\n", pages_count); - return NULL; } -static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap) +static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long **bmap) { + uint64_t page, nr_dirty_pages = 0, nr_clean_pages = 0; uint64_t step = vm_num_host_pages(mode, 1); - uint64_t page; - uint64_t *value_ptr; - uint64_t min_iter = 0; for (page = 0; page < host_num_pages; page += step) { - value_ptr = host_test_mem + page * host_page_size; - - /* If this is a special page that we were tracking... */ - if (__test_and_clear_bit_le(page, host_bmap_track)) { - host_track_next_count++; - TEST_ASSERT(test_bit_le(page, bmap), - "Page %"PRIu64" should have its dirty bit " - "set in this iteration but it is missing", - page); - } + uint64_t val = *(uint64_t *)(host_test_mem + page * host_page_size); + bool bmap0_dirty = __test_and_clear_bit_le(page, bmap[0]); - if (__test_and_clear_bit_le(page, bmap)) { - bool matched; - - host_dirty_count++; + /* + * Ensure both bitmaps are cleared, as a page can be written + * multiple times per iteration, i.e. can show up in both + * bitmaps, and the dirty ring is additive, i.e. doesn't purge + * bitmap entries from previous collections. + */ + if (__test_and_clear_bit_le(page, bmap[1]) || bmap0_dirty) { + nr_dirty_pages++; /* - * If the bit is set, the value written onto - * the corresponding page should be either the - * previous iteration number or the current one. + * If the page is dirty, the value written to memory + * should be the current iteration number. */ - matched = (*value_ptr == iteration || - *value_ptr == iteration - 1); - - if (host_log_mode == LOG_MODE_DIRTY_RING && !matched) { - if (*value_ptr == iteration - 2 && min_iter <= iteration - 2) { - /* - * Short answer: this case is special - * only for dirty ring test where the - * page is the last page before a kvm - * dirty ring full in iteration N-2. - * - * Long answer: Assuming ring size R, - * one possible condition is: - * - * main thr vcpu thr - * -------- -------- - * iter=1 - * write 1 to page 0~(R-1) - * full, vmexit - * collect 0~(R-1) - * kick vcpu - * write 1 to (R-1)~(2R-2) - * full, vmexit - * iter=2 - * collect (R-1)~(2R-2) - * kick vcpu - * write 1 to (2R-2) - * (NOTE!!! "1" cached in cpu reg) - * write 2 to (2R-1)~(3R-3) - * full, vmexit - * iter=3 - * collect (2R-2)~(3R-3) - * (here if we read value on page - * "2R-2" is 1, while iter=3!!!) - * - * This however can only happen once per iteration. - */ - min_iter = iteration - 1; + if (val == iteration) + continue; + + if (host_log_mode == LOG_MODE_DIRTY_RING) { + /* + * The last page in the ring from previous + * iteration can be written with the value + * from the previous iteration, as the value to + * be written may be cached in a CPU register. + */ + if (page == dirty_ring_prev_iteration_last_page && + val == iteration - 1) continue; - } else if (page == dirty_ring_last_page) { - /* - * Please refer to comments in - * dirty_ring_last_page. - */ + + /* + * Any value from a previous iteration is legal + * for the last entry, as the write may not yet + * have retired, i.e. the page may hold whatever + * it had before this iteration started. + */ + if (page == dirty_ring_last_page && + val < iteration) continue; - } + } else if (!val && iteration == 1 && bmap0_dirty) { + /* + * When testing get+clear, the dirty bitmap + * starts with all bits set, and so the first + * iteration can observe a "dirty" page that + * was never written, but only in the first + * bitmap (collecting the bitmap also clears + * all dirty pages). + */ + continue; } - TEST_ASSERT(matched, - "Set page %"PRIu64" value %"PRIu64 - " incorrect (iteration=%"PRIu64")", - page, *value_ptr, iteration); + TEST_FAIL("Dirty page %lu value (%lu) != iteration (%lu) " + "(last = %lu, prev_last = %lu)", + page, val, iteration, dirty_ring_last_page, + dirty_ring_prev_iteration_last_page); } else { - host_clear_count++; + nr_clean_pages++; /* * If cleared, the value written can be any - * value smaller or equals to the iteration - * number. Note that the value can be exactly - * (iteration-1) if that write can happen - * like this: - * - * (1) increase loop count to "iteration-1" - * (2) write to page P happens (with value - * "iteration-1") - * (3) get dirty log for "iteration-1"; we'll - * see that page P bit is set (dirtied), - * and not set the bit in host_bmap_track - * (4) increase loop count to "iteration" - * (which is current iteration) - * (5) get dirty log for current iteration, - * we'll see that page P is cleared, with - * value "iteration-1". + * value smaller than the iteration number. */ - TEST_ASSERT(*value_ptr <= iteration, - "Clear page %"PRIu64" value %"PRIu64 - " incorrect (iteration=%"PRIu64")", - page, *value_ptr, iteration); - if (*value_ptr == iteration) { - /* - * This page is _just_ modified; it - * should report its dirtyness in the - * next run - */ - __set_bit_le(page, host_bmap_track); - } + TEST_ASSERT(val < iteration, + "Clear page %lu value (%lu) >= iteration (%lu) " + "(last = %lu, prev_last = %lu)", + page, val, iteration, dirty_ring_last_page, + dirty_ring_prev_iteration_last_page); } } + + pr_info("Iteration %2ld: dirty: %-6lu clean: %-6lu writes: %-6lu\n", + iteration, nr_dirty_pages, nr_clean_pages, nr_writes); + + host_dirty_count += nr_dirty_pages; + host_clear_count += nr_clean_pages; } static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, @@ -688,7 +599,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct test_params *p = arg; struct kvm_vcpu *vcpu; struct kvm_vm *vm; - unsigned long *bmap; + unsigned long *bmap[2]; uint32_t ring_buf_idx = 0; int sem_val; @@ -731,12 +642,21 @@ static void run_test(enum vm_guest_mode mode, void *arg) #ifdef __s390x__ /* Align to 1M (segment size) */ guest_test_phys_mem = align_down(guest_test_phys_mem, 1 << 20); + + /* + * The workaround in guest_code() to write all pages prior to the first + * iteration isn't compatible with the dirty ring, as the dirty ring + * support relies on the vCPU to actually stop when vcpu_stop is set so + * that the vCPU doesn't hang waiting for the dirty ring to be emptied. + */ + TEST_ASSERT(host_log_mode != LOG_MODE_DIRTY_RING, + "Test needs to be updated to support s390 dirty ring"); #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - bmap = bitmap_zalloc(host_num_pages); - host_bmap_track = bitmap_zalloc(host_num_pages); + bmap[0] = bitmap_zalloc(host_num_pages); + bmap[1] = bitmap_zalloc(host_num_pages); /* Add an extra memory slot for testing dirty logging */ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, @@ -757,14 +677,9 @@ static void run_test(enum vm_guest_mode mode, void *arg) sync_global_to_guest(vm, guest_test_virt_mem); sync_global_to_guest(vm, guest_num_pages); - /* Start the iterations */ - iteration = 1; - sync_global_to_guest(vm, iteration); - WRITE_ONCE(host_quit, false); host_dirty_count = 0; host_clear_count = 0; - host_track_next_count = 0; - WRITE_ONCE(dirty_ring_vcpu_ring_full, false); + WRITE_ONCE(host_quit, false); /* * Ensure the previous iteration didn't leave a dangling semaphore, i.e. @@ -776,21 +691,95 @@ static void run_test(enum vm_guest_mode mode, void *arg) sem_getvalue(&sem_vcpu_cont, &sem_val); TEST_ASSERT_EQ(sem_val, 0); + TEST_ASSERT_EQ(vcpu_stop, false); + pthread_create(&vcpu_thread, NULL, vcpu_worker, vcpu); - while (iteration < p->iterations) { - /* Give the vcpu thread some time to dirty some pages */ - usleep(p->interval * 1000); - log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, - bmap, host_num_pages, - &ring_buf_idx); + for (iteration = 1; iteration <= p->iterations; iteration++) { + unsigned long i; + + sync_global_to_guest(vm, iteration); + + WRITE_ONCE(nr_writes, 0); + sync_global_to_guest(vm, nr_writes); + + dirty_ring_prev_iteration_last_page = dirty_ring_last_page; + WRITE_ONCE(dirty_ring_vcpu_ring_full, false); + + sem_post(&sem_vcpu_cont); + + /* + * Let the vCPU run beyond the configured interval until it has + * performed the minimum number of writes. This verifies the + * guest is making forward progress, e.g. isn't stuck because + * of a KVM bug, and puts a firm floor on test coverage. + */ + for (i = 0; i < p->interval || nr_writes < TEST_MIN_WRITES_PER_ITERATION; i++) { + /* + * Sleep in 1ms chunks to keep the interval math simple + * and so that the test doesn't run too far beyond the + * specified interval. + */ + usleep(1000); + + sync_global_from_guest(vm, nr_writes); + + /* + * Reap dirty pages while the guest is running so that + * dirty ring full events are resolved, i.e. so that a + * larger interval doesn't always end up with a vCPU + * that's effectively blocked. Collecting while the + * guest is running also verifies KVM doesn't lose any + * state. + * + * For bitmap modes, KVM overwrites the entire bitmap, + * i.e. collecting the bitmaps is destructive. Collect + * the bitmap only on the first pass, otherwise this + * test would lose track of dirty pages. + */ + if (i && host_log_mode != LOG_MODE_DIRTY_RING) + continue; + + /* + * For the dirty ring, empty the ring on subsequent + * passes only if the ring was filled at least once, + * to verify KVM's handling of a full ring (emptying + * the ring on every pass would make it unlikely the + * vCPU would ever fill the fing). + */ + if (i && !READ_ONCE(dirty_ring_vcpu_ring_full)) + continue; + + log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, + bmap[0], host_num_pages, + &ring_buf_idx); + } + + /* + * Stop the vCPU prior to collecting and verifying the dirty + * log. If the vCPU is allowed to run during collection, then + * pages that are written during this iteration may be missed, + * i.e. collected in the next iteration. And if the vCPU is + * writing memory during verification, pages that this thread + * sees as clean may be written with this iteration's value. + */ + WRITE_ONCE(vcpu_stop, true); + sync_global_to_guest(vm, vcpu_stop); + sem_wait(&sem_vcpu_stop); /* - * See vcpu_sync_stop_requested definition for details on why - * we need to stop vcpu when verify data. + * Clear vcpu_stop after the vCPU thread has acknowledge the + * stop request and is waiting, i.e. is definitely not running! */ - atomic_set(&vcpu_sync_stop_requested, true); - sem_wait_until(&sem_vcpu_stop); + WRITE_ONCE(vcpu_stop, false); + sync_global_to_guest(vm, vcpu_stop); + + /* + * Sync the number of writes performed before verification, the + * info will be printed along with the dirty/clean page counts. + */ + sync_global_from_guest(vm, nr_writes); + /* * NOTE: for dirty ring, it's possible that we didn't stop at * GUEST_SYNC but instead we stopped because ring is full; @@ -798,32 +787,22 @@ static void run_test(enum vm_guest_mode mode, void *arg) * the flush of the last page, and since we handle the last * page specially verification will succeed anyway. */ - assert(host_log_mode == LOG_MODE_DIRTY_RING || - atomic_read(&vcpu_sync_stop_requested) == false); + log_mode_collect_dirty_pages(vcpu, TEST_MEM_SLOT_INDEX, + bmap[1], host_num_pages, + &ring_buf_idx); vm_dirty_log_verify(mode, bmap); - - /* - * Set host_quit before sem_vcpu_cont in the final iteration to - * ensure that the vCPU worker doesn't resume the guest. As - * above, the dirty ring test may stop and wait even when not - * explicitly request to do so, i.e. would hang waiting for a - * "continue" if it's allowed to resume the guest. - */ - if (++iteration == p->iterations) - WRITE_ONCE(host_quit, true); - - sem_post(&sem_vcpu_cont); - sync_global_to_guest(vm, iteration); } + WRITE_ONCE(host_quit, true); + sem_post(&sem_vcpu_cont); + pthread_join(vcpu_thread, NULL); - pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), " - "track_next (%"PRIu64")\n", host_dirty_count, host_clear_count, - host_track_next_count); + pr_info("Total bits checked: dirty (%lu), clear (%lu)\n", + host_dirty_count, host_clear_count); - free(bmap); - free(host_bmap_track); + free(bmap[0]); + free(bmap[1]); kvm_vm_free(vm); } @@ -857,7 +836,6 @@ int main(int argc, char *argv[]) .interval = TEST_HOST_LOOP_INTERVAL, }; int opt, i; - sigset_t sigset; sem_init(&sem_vcpu_stop, 0, 0); sem_init(&sem_vcpu_cont, 0, 0); @@ -908,19 +886,12 @@ int main(int argc, char *argv[]) } } - TEST_ASSERT(p.iterations > 2, "Iterations must be greater than two"); + TEST_ASSERT(p.iterations > 0, "Iterations must be greater than zero"); TEST_ASSERT(p.interval > 0, "Interval must be greater than zero"); pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n", p.iterations, p.interval); - srandom(time(0)); - - /* Ensure that vCPU threads start with SIG_IPI blocked. */ - sigemptyset(&sigset); - sigaddset(&sigset, SIG_IPI); - pthread_sigmask(SIG_BLOCK, &sigset, NULL); - if (host_log_mode_option == LOG_MODE_ALL) { /* Run each log mode */ for (i = 0; i < LOG_MODE_NUM; i++) { diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 4c4e5a847f67..373912464fb4 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -46,6 +46,12 @@ struct userspace_mem_region { struct hlist_node slot_node; }; +struct kvm_binary_stats { + int fd; + struct kvm_stats_header header; + struct kvm_stats_desc *desc; +}; + struct kvm_vcpu { struct list_head list; uint32_t id; @@ -55,6 +61,7 @@ struct kvm_vcpu { #ifdef __x86_64__ struct kvm_cpuid2 *cpuid; #endif + struct kvm_binary_stats stats; struct kvm_dirty_gfn *dirty_gfns; uint32_t fetch_index; uint32_t dirty_gfns_count; @@ -99,10 +106,7 @@ struct kvm_vm { struct kvm_vm_arch arch; - /* Cache of information for binary stats interface */ - int stats_fd; - struct kvm_stats_header stats_header; - struct kvm_stats_desc *stats_desc; + struct kvm_binary_stats stats; /* * KVM region slots. These are the default memslots used by page @@ -531,16 +535,19 @@ void read_stat_data(int stats_fd, struct kvm_stats_header *header, struct kvm_stats_desc *desc, uint64_t *data, size_t max_elements); -void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, - size_t max_elements); +void kvm_get_stat(struct kvm_binary_stats *stats, const char *name, + uint64_t *data, size_t max_elements); -static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) -{ - uint64_t data; +#define __get_stat(stats, stat) \ +({ \ + uint64_t data; \ + \ + kvm_get_stat(stats, #stat, &data, 1); \ + data; \ +}) - __vm_get_stat(vm, stat_name, &data, 1); - return data; -} +#define vm_get_stat(vm, stat) __get_stat(&(vm)->stats, stat) +#define vcpu_get_stat(vcpu, stat) __get_stat(&(vcpu)->stats, stat) void vm_create_irqchip(struct kvm_vm *vm); @@ -963,6 +970,8 @@ static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); +void kvm_set_files_rlimit(uint32_t nr_vcpus); + void kvm_pin_this_task_to_pcpu(uint32_t pcpu); void kvm_print_vcpu_pinning_help(void); void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 3e473058849f..77d13d7920cb 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -22,7 +22,7 @@ #define msecs_to_usecs(msec) ((msec) * 1000ULL) -static inline int _no_printf(const char *format, ...) { return 0; } +static inline __printf(1, 2) int _no_printf(const char *format, ...) { return 0; } #ifdef DEBUG #define pr_debug(...) printf(__VA_ARGS__) diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index d60da8966772..32ab6ca7ec32 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -183,6 +183,9 @@ struct kvm_x86_cpu_feature { * Extended Leafs, a.k.a. AMD defined */ #define X86_FEATURE_SVM KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 2) +#define X86_FEATURE_PERFCTR_CORE KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 23) +#define X86_FEATURE_PERFCTR_NB KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 24) +#define X86_FEATURE_PERFCTR_LLC KVM_X86_CPU_FEATURE(0x80000001, 0, ECX, 28) #define X86_FEATURE_NX KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 20) #define X86_FEATURE_GBPAGES KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 26) #define X86_FEATURE_RDTSCP KVM_X86_CPU_FEATURE(0x80000001, 0, EDX, 27) @@ -197,8 +200,11 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_PAUSEFILTER KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 10) #define X86_FEATURE_PFTHRESHOLD KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 12) #define X86_FEATURE_VGIF KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 16) +#define X86_FEATURE_IDLE_HLT KVM_X86_CPU_FEATURE(0x8000000A, 0, EDX, 30) #define X86_FEATURE_SEV KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 1) #define X86_FEATURE_SEV_ES KVM_X86_CPU_FEATURE(0x8000001F, 0, EAX, 3) +#define X86_FEATURE_PERFMON_V2 KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 0) +#define X86_FEATURE_LBR_PMC_FREEZE KVM_X86_CPU_FEATURE(0x80000022, 0, EAX, 2) /* * KVM defined paravirt features. @@ -285,6 +291,8 @@ struct kvm_x86_cpu_property { #define X86_PROPERTY_GUEST_MAX_PHY_ADDR KVM_X86_CPU_PROPERTY(0x80000008, 0, EAX, 16, 23) #define X86_PROPERTY_SEV_C_BIT KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 0, 5) #define X86_PROPERTY_PHYS_ADDR_REDUCTION KVM_X86_CPU_PROPERTY(0x8000001F, 0, EBX, 6, 11) +#define X86_PROPERTY_NR_PERFCTR_CORE KVM_X86_CPU_PROPERTY(0x80000022, 0, EBX, 0, 3) +#define X86_PROPERTY_NR_PERFCTR_NB KVM_X86_CPU_PROPERTY(0x80000022, 0, EBX, 10, 15) #define X86_PROPERTY_MAX_CENTAUR_LEAF KVM_X86_CPU_PROPERTY(0xC0000000, 0, EAX, 0, 31) @@ -1244,7 +1252,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, uint64_t ign_error_code; \ uint8_t vector; \ \ - asm volatile(KVM_ASM_SAFE(insn) \ + asm volatile(KVM_ASM_SAFE_FEP(insn) \ : KVM_ASM_SAFE_OUTPUTS(vector, ign_error_code) \ : inputs \ : KVM_ASM_SAFE_CLOBBERS); \ @@ -1339,6 +1347,46 @@ static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size, GUEST_ASSERT(!ret); } +/* + * Execute HLT in an STI interrupt shadow to ensure that a pending IRQ that's + * intended to be a wake event arrives *after* HLT is executed. Modern CPUs, + * except for a few oddballs that KVM is unlikely to run on, block IRQs for one + * instruction after STI, *if* RFLAGS.IF=0 before STI. Note, Intel CPUs may + * block other events beyond regular IRQs, e.g. may block NMIs and SMIs too. + */ +static inline void safe_halt(void) +{ + asm volatile("sti; hlt"); +} + +/* + * Enable interrupts and ensure that interrupts are evaluated upon return from + * this function, i.e. execute a nop to consume the STi interrupt shadow. + */ +static inline void sti_nop(void) +{ + asm volatile ("sti; nop"); +} + +/* + * Enable interrupts for one instruction (nop), to allow the CPU to process all + * interrupts that are already pending. + */ +static inline void sti_nop_cli(void) +{ + asm volatile ("sti; nop; cli"); +} + +static inline void sti(void) +{ + asm volatile("sti"); +} + +static inline void cli(void) +{ + asm volatile ("cli"); +} + void __vm_xsave_require_permission(uint64_t xfeature, const char *name); #define vm_xsave_require_permission(xfeature) \ diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index c78f34699f73..c5310736ed06 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -10,7 +10,6 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <sys/resource.h> #include "test_util.h" @@ -39,36 +38,11 @@ int main(int argc, char *argv[]) { int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); - /* - * Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds + - * an arbitrary number for everything else. - */ - int nr_fds_wanted = kvm_max_vcpus + 100; - struct rlimit rl; pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); - /* - * Check that we're allowed to open nr_fds_wanted file descriptors and - * try raising the limits if needed. - */ - TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); - - if (rl.rlim_cur < nr_fds_wanted) { - rl.rlim_cur = nr_fds_wanted; - if (rl.rlim_max < nr_fds_wanted) { - int old_rlim_max = rl.rlim_max; - rl.rlim_max = nr_fds_wanted; - - int r = setrlimit(RLIMIT_NOFILE, &rl); - __TEST_REQUIRE(r >= 0, - "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)", - old_rlim_max, nr_fds_wanted); - } else { - TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); - } - } + kvm_set_files_rlimit(kvm_max_vcpus); /* * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 33fefeb3ca44..279ad8946040 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -12,6 +12,7 @@ #include <assert.h> #include <sched.h> #include <sys/mman.h> +#include <sys/resource.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> @@ -196,6 +197,11 @@ static void vm_open(struct kvm_vm *vm) vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type); TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd)); + + if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) + vm->stats.fd = vm_get_stats_fd(vm); + else + vm->stats.fd = -1; } const char *vm_guest_mode_string(uint32_t i) @@ -406,6 +412,38 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, return vm_adjust_num_guest_pages(mode, nr_pages); } +void kvm_set_files_rlimit(uint32_t nr_vcpus) +{ + /* + * Each vCPU will open two file descriptors: the vCPU itself and the + * vCPU's binary stats file descriptor. Add an arbitrary amount of + * buffer for all other files a test may open. + */ + int nr_fds_wanted = nr_vcpus * 2 + 100; + struct rlimit rl; + + /* + * Check that we're allowed to open nr_fds_wanted file descriptors and + * try raising the limits if needed. + */ + TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); + + if (rl.rlim_cur < nr_fds_wanted) { + rl.rlim_cur = nr_fds_wanted; + if (rl.rlim_max < nr_fds_wanted) { + int old_rlim_max = rl.rlim_max; + + rl.rlim_max = nr_fds_wanted; + __TEST_REQUIRE(setrlimit(RLIMIT_NOFILE, &rl) >= 0, + "RLIMIT_NOFILE hard limit is too low (%d, wanted %d)", + old_rlim_max, nr_fds_wanted); + } else { + TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); + } + } + +} + struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages) { @@ -415,6 +453,8 @@ struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, struct kvm_vm *vm; int i; + kvm_set_files_rlimit(nr_runnable_vcpus); + pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__, vm_guest_mode_string(shape.mode), shape.type, nr_pages); @@ -657,6 +697,23 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) return NULL; } +static void kvm_stats_release(struct kvm_binary_stats *stats) +{ + int ret; + + if (stats->fd < 0) + return; + + if (stats->desc) { + free(stats->desc); + stats->desc = NULL; + } + + ret = close(stats->fd); + TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + stats->fd = -1; +} + __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) { @@ -690,6 +747,8 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) ret = close(vcpu->fd); TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_stats_release(&vcpu->stats); + list_del(&vcpu->list); vcpu_arch_free(vcpu); @@ -709,6 +768,9 @@ void kvm_vm_release(struct kvm_vm *vmp) ret = close(vmp->kvm_fd); TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + + /* Free cached stats metadata and close FD */ + kvm_stats_release(&vmp->stats); } static void __vm_mem_region_delete(struct kvm_vm *vm, @@ -748,12 +810,6 @@ void kvm_vm_free(struct kvm_vm *vmp) if (vmp == NULL) return; - /* Free cached stats metadata and close FD */ - if (vmp->stats_fd) { - free(vmp->stats_desc); - close(vmp->stats_fd); - } - /* Free userspace_mem_regions. */ hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node) __vm_mem_region_delete(vmp, region); @@ -1286,6 +1342,11 @@ struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id) TEST_ASSERT(vcpu->run != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED)); + if (kvm_has_cap(KVM_CAP_BINARY_STATS_FD)) + vcpu->stats.fd = vcpu_get_stats_fd(vcpu); + else + vcpu->stats.fd = -1; + /* Add to linked-list of VCPUs. */ list_add(&vcpu->list, &vm->vcpus); @@ -2198,46 +2259,31 @@ void read_stat_data(int stats_fd, struct kvm_stats_header *header, desc->name, size, ret); } -/* - * Read the data of the named stat - * - * Input Args: - * vm - the VM for which the stat should be read - * stat_name - the name of the stat to read - * max_elements - the maximum number of 8-byte values to read into data - * - * Output Args: - * data - the buffer into which stat data should be read - * - * Read the data values of a specified stat from the binary stats interface. - */ -void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data, - size_t max_elements) +void kvm_get_stat(struct kvm_binary_stats *stats, const char *name, + uint64_t *data, size_t max_elements) { struct kvm_stats_desc *desc; size_t size_desc; int i; - if (!vm->stats_fd) { - vm->stats_fd = vm_get_stats_fd(vm); - read_stats_header(vm->stats_fd, &vm->stats_header); - vm->stats_desc = read_stats_descriptors(vm->stats_fd, - &vm->stats_header); + if (!stats->desc) { + read_stats_header(stats->fd, &stats->header); + stats->desc = read_stats_descriptors(stats->fd, &stats->header); } - size_desc = get_stats_descriptor_size(&vm->stats_header); + size_desc = get_stats_descriptor_size(&stats->header); - for (i = 0; i < vm->stats_header.num_desc; ++i) { - desc = (void *)vm->stats_desc + (i * size_desc); + for (i = 0; i < stats->header.num_desc; ++i) { + desc = (void *)stats->desc + (i * size_desc); - if (strcmp(desc->name, stat_name)) + if (strcmp(desc->name, name)) continue; - read_stat_data(vm->stats_fd, &vm->stats_header, desc, - data, max_elements); - - break; + read_stat_data(stats->fd, &stats->header, desc, data, max_elements); + return; } + + TEST_FAIL("Unable to find stat '%s'", name); } __weak void kvm_arch_vm_post_create(struct kvm_vm *vm) diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index 7c9de8414462..5bde176cedd5 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -114,7 +114,7 @@ struct uffd_desc *uffd_setup_demand_paging(int uffd_mode, useconds_t delay, PER_PAGE_DEBUG("Userfaultfd %s mode, faults resolved with %s\n", is_minor ? "MINOR" : "MISSING", - is_minor ? "UFFDIO_CONINUE" : "UFFDIO_COPY"); + is_minor ? "UFFDIO_CONTINUE" : "UFFDIO_COPY"); uffd_desc = malloc(sizeof(struct uffd_desc)); TEST_ASSERT(uffd_desc, "Failed to malloc uffd descriptor"); diff --git a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c index f45c0ecc902d..03406de4989d 100644 --- a/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c +++ b/tools/testing/selftests/kvm/riscv/sbi_pmu_test.c @@ -39,7 +39,13 @@ static bool illegal_handler_invoked; #define SBI_PMU_TEST_SNAPSHOT BIT(2) #define SBI_PMU_TEST_OVERFLOW BIT(3) -static int disabled_tests; +#define SBI_PMU_OVERFLOW_IRQNUM_DEFAULT 5 +struct test_args { + int disabled_tests; + int overflow_irqnum; +}; + +static struct test_args targs; unsigned long pmu_csr_read_num(int csr_num) { @@ -118,8 +124,8 @@ static void stop_counter(unsigned long counter, unsigned long stop_flags) ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, counter, 1, stop_flags, 0, 0, 0); - __GUEST_ASSERT(ret.error == 0, "Unable to stop counter %ld error %ld\n", - counter, ret.error); + __GUEST_ASSERT(ret.error == 0 || ret.error == SBI_ERR_ALREADY_STOPPED, + "Unable to stop counter %ld error %ld\n", counter, ret.error); } static void guest_illegal_exception_handler(struct ex_regs *regs) @@ -137,7 +143,6 @@ static void guest_irq_handler(struct ex_regs *regs) unsigned int irq_num = regs->cause & ~CAUSE_IRQ_FLAG; struct riscv_pmu_snapshot_data *snapshot_data = snapshot_gva; unsigned long overflown_mask; - unsigned long counter_val = 0; /* Validate that we are in the correct irq handler */ GUEST_ASSERT_EQ(irq_num, IRQ_PMU_OVF); @@ -151,10 +156,6 @@ static void guest_irq_handler(struct ex_regs *regs) GUEST_ASSERT(overflown_mask & 0x01); WRITE_ONCE(vcpu_shared_irq_count, vcpu_shared_irq_count+1); - - counter_val = READ_ONCE(snapshot_data->ctr_values[0]); - /* Now start the counter to mimick the real driver behavior */ - start_counter(counter_in_use, SBI_PMU_START_FLAG_SET_INIT_VALUE, counter_val); } static unsigned long get_counter_index(unsigned long cbase, unsigned long cmask, @@ -479,7 +480,7 @@ static void test_pmu_events_snaphost(void) static void test_pmu_events_overflow(void) { - int num_counters = 0; + int num_counters = 0, i = 0; /* Verify presence of SBI PMU and minimum requrired SBI version */ verify_sbi_requirement_assert(); @@ -496,11 +497,15 @@ static void test_pmu_events_overflow(void) * Qemu supports overflow for cycle/instruction. * This test may fail on any platform that do not support overflow for these two events. */ - test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES); - GUEST_ASSERT_EQ(vcpu_shared_irq_count, 1); + for (i = 0; i < targs.overflow_irqnum; i++) + test_pmu_event_overflow(SBI_PMU_HW_CPU_CYCLES); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, targs.overflow_irqnum); + + vcpu_shared_irq_count = 0; - test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS); - GUEST_ASSERT_EQ(vcpu_shared_irq_count, 2); + for (i = 0; i < targs.overflow_irqnum; i++) + test_pmu_event_overflow(SBI_PMU_HW_INSTRUCTIONS); + GUEST_ASSERT_EQ(vcpu_shared_irq_count, targs.overflow_irqnum); GUEST_DONE(); } @@ -609,7 +614,11 @@ static void test_vm_events_overflow(void *guest_code) vcpu_init_vector_tables(vcpu); /* Initialize guest timer frequency. */ timer_freq = vcpu_get_reg(vcpu, RISCV_TIMER_REG(frequency)); + + /* Export the shared variables to the guest */ sync_global_to_guest(vm, timer_freq); + sync_global_to_guest(vm, vcpu_shared_irq_count); + sync_global_to_guest(vm, targs); run_vcpu(vcpu); @@ -618,28 +627,38 @@ static void test_vm_events_overflow(void *guest_code) static void test_print_help(char *name) { - pr_info("Usage: %s [-h] [-d <test name>]\n", name); - pr_info("\t-d: Test to disable. Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("Usage: %s [-h] [-t <test name>] [-n <number of LCOFI interrupt for overflow test>]\n", + name); + pr_info("\t-t: Test to run (default all). Available tests are 'basic', 'events', 'snapshot', 'overflow'\n"); + pr_info("\t-n: Number of LCOFI interrupt to trigger for each event in overflow test (default: %d)\n", + SBI_PMU_OVERFLOW_IRQNUM_DEFAULT); pr_info("\t-h: print this help screen\n"); } static bool parse_args(int argc, char *argv[]) { int opt; + int temp_disabled_tests = SBI_PMU_TEST_BASIC | SBI_PMU_TEST_EVENTS | SBI_PMU_TEST_SNAPSHOT | + SBI_PMU_TEST_OVERFLOW; + int overflow_interrupts = 0; - while ((opt = getopt(argc, argv, "hd:")) != -1) { + while ((opt = getopt(argc, argv, "ht:n:")) != -1) { switch (opt) { - case 'd': + case 't': if (!strncmp("basic", optarg, 5)) - disabled_tests |= SBI_PMU_TEST_BASIC; + temp_disabled_tests &= ~SBI_PMU_TEST_BASIC; else if (!strncmp("events", optarg, 6)) - disabled_tests |= SBI_PMU_TEST_EVENTS; + temp_disabled_tests &= ~SBI_PMU_TEST_EVENTS; else if (!strncmp("snapshot", optarg, 8)) - disabled_tests |= SBI_PMU_TEST_SNAPSHOT; + temp_disabled_tests &= ~SBI_PMU_TEST_SNAPSHOT; else if (!strncmp("overflow", optarg, 8)) - disabled_tests |= SBI_PMU_TEST_OVERFLOW; + temp_disabled_tests &= ~SBI_PMU_TEST_OVERFLOW; else goto done; + targs.disabled_tests = temp_disabled_tests; + break; + case 'n': + overflow_interrupts = atoi_positive("Number of LCOFI", optarg); break; case 'h': default: @@ -647,6 +666,15 @@ static bool parse_args(int argc, char *argv[]) } } + if (overflow_interrupts > 0) { + if (targs.disabled_tests & SBI_PMU_TEST_OVERFLOW) { + pr_info("-n option is only available for overflow test\n"); + goto done; + } else { + targs.overflow_irqnum = overflow_interrupts; + } + } + return true; done: test_print_help(argv[0]); @@ -655,25 +683,28 @@ done: int main(int argc, char *argv[]) { + targs.disabled_tests = 0; + targs.overflow_irqnum = SBI_PMU_OVERFLOW_IRQNUM_DEFAULT; + if (!parse_args(argc, argv)) exit(KSFT_SKIP); - if (!(disabled_tests & SBI_PMU_TEST_BASIC)) { + if (!(targs.disabled_tests & SBI_PMU_TEST_BASIC)) { test_vm_basic_test(test_pmu_basic_sanity); pr_info("SBI PMU basic test : PASS\n"); } - if (!(disabled_tests & SBI_PMU_TEST_EVENTS)) { + if (!(targs.disabled_tests & SBI_PMU_TEST_EVENTS)) { test_vm_events_test(test_pmu_events); pr_info("SBI PMU event verification test : PASS\n"); } - if (!(disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { + if (!(targs.disabled_tests & SBI_PMU_TEST_SNAPSHOT)) { test_vm_events_snapshot_test(test_pmu_events_snaphost); pr_info("SBI PMU event verification with snapshot test : PASS\n"); } - if (!(disabled_tests & SBI_PMU_TEST_OVERFLOW)) { + if (!(targs.disabled_tests & SBI_PMU_TEST_OVERFLOW)) { test_vm_events_overflow(test_pmu_events_overflow); pr_info("SBI PMU event verification with overflow test : PASS\n"); } diff --git a/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c index 2929c067c207..b0d2b04a7ff2 100644 --- a/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c +++ b/tools/testing/selftests/kvm/x86/dirty_log_page_splitting_test.c @@ -41,9 +41,9 @@ struct kvm_page_stats { static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage) { - stats->pages_4k = vm_get_stat(vm, "pages_4k"); - stats->pages_2m = vm_get_stat(vm, "pages_2m"); - stats->pages_1g = vm_get_stat(vm, "pages_1g"); + stats->pages_4k = vm_get_stat(vm, pages_4k); + stats->pages_2m = vm_get_stat(vm, pages_2m); + stats->pages_1g = vm_get_stat(vm, pages_1g); stats->hugepages = stats->pages_2m + stats->pages_1g; pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n", diff --git a/tools/testing/selftests/kvm/x86/hyperv_ipi.c b/tools/testing/selftests/kvm/x86/hyperv_ipi.c index 22c0c124582f..2b5b4bc6ef7e 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86/hyperv_ipi.c @@ -63,8 +63,10 @@ static void receiver_code(void *hcall_page, vm_vaddr_t pgs_gpa) /* Signal sender vCPU we're ready */ ipis_rcvd[vcpu_id] = (u64)-1; - for (;;) - asm volatile("sti; hlt; cli"); + for (;;) { + safe_halt(); + cli(); + } } static void guest_ipi_handler(struct ex_regs *regs) diff --git a/tools/testing/selftests/kvm/x86/nested_emulation_test.c b/tools/testing/selftests/kvm/x86/nested_emulation_test.c new file mode 100644 index 000000000000..abc824dba04f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_emulation_test.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" +#include "svm_util.h" + +enum { + SVM_F, + VMX_F, + NR_VIRTUALIZATION_FLAVORS, +}; + +struct emulated_instruction { + const char name[32]; + uint8_t opcode[15]; + uint32_t exit_reason[NR_VIRTUALIZATION_FLAVORS]; +}; + +static struct emulated_instruction instructions[] = { + { + .name = "pause", + .opcode = { 0xf3, 0x90 }, + .exit_reason = { SVM_EXIT_PAUSE, + EXIT_REASON_PAUSE_INSTRUCTION, } + }, + { + .name = "hlt", + .opcode = { 0xf4 }, + .exit_reason = { SVM_EXIT_HLT, + EXIT_REASON_HLT, } + }, +}; + +static uint8_t kvm_fep[] = { 0x0f, 0x0b, 0x6b, 0x76, 0x6d }; /* ud2 ; .ascii "kvm" */ +static uint8_t l2_guest_code[sizeof(kvm_fep) + 15]; +static uint8_t *l2_instruction = &l2_guest_code[sizeof(kvm_fep)]; + +static uint32_t get_instruction_length(struct emulated_instruction *insn) +{ + uint32_t i; + + for (i = 0; i < ARRAY_SIZE(insn->opcode) && insn->opcode[i]; i++) + ; + + return i; +} + +static void guest_code(void *test_data) +{ + int f = this_cpu_has(X86_FEATURE_SVM) ? SVM_F : VMX_F; + int i; + + memcpy(l2_guest_code, kvm_fep, sizeof(kvm_fep)); + + if (f == SVM_F) { + struct svm_test_data *svm = test_data; + struct vmcb *vmcb = svm->vmcb; + + generic_svm_setup(svm, NULL, NULL); + vmcb->save.idtr.limit = 0; + vmcb->save.rip = (u64)l2_guest_code; + + vmcb->control.intercept |= BIT_ULL(INTERCEPT_SHUTDOWN) | + BIT_ULL(INTERCEPT_PAUSE) | + BIT_ULL(INTERCEPT_HLT); + vmcb->control.intercept_exceptions = 0; + } else { + GUEST_ASSERT(prepare_for_vmx_operation(test_data)); + GUEST_ASSERT(load_vmcs(test_data)); + + prepare_vmcs(test_data, NULL, NULL); + GUEST_ASSERT(!vmwrite(GUEST_IDTR_LIMIT, 0)); + GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code)); + GUEST_ASSERT(!vmwrite(EXCEPTION_BITMAP, 0)); + + vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) | + CPU_BASED_PAUSE_EXITING | + CPU_BASED_HLT_EXITING); + } + + for (i = 0; i < ARRAY_SIZE(instructions); i++) { + struct emulated_instruction *insn = &instructions[i]; + uint32_t insn_len = get_instruction_length(insn); + uint32_t exit_insn_len; + u32 exit_reason; + + /* + * Copy the target instruction to the L2 code stream, and fill + * the remaining bytes with INT3s so that a missed intercept + * results in a consistent failure mode (SHUTDOWN). + */ + memcpy(l2_instruction, insn->opcode, insn_len); + memset(l2_instruction + insn_len, 0xcc, sizeof(insn->opcode) - insn_len); + + if (f == SVM_F) { + struct svm_test_data *svm = test_data; + struct vmcb *vmcb = svm->vmcb; + + run_guest(vmcb, svm->vmcb_gpa); + exit_reason = vmcb->control.exit_code; + exit_insn_len = vmcb->control.next_rip - vmcb->save.rip; + GUEST_ASSERT_EQ(vmcb->save.rip, (u64)l2_instruction); + } else { + GUEST_ASSERT_EQ(i ? vmresume() : vmlaunch(), 0); + exit_reason = vmreadz(VM_EXIT_REASON); + exit_insn_len = vmreadz(VM_EXIT_INSTRUCTION_LEN); + GUEST_ASSERT_EQ(vmreadz(GUEST_RIP), (u64)l2_instruction); + } + + __GUEST_ASSERT(exit_reason == insn->exit_reason[f], + "Wanted exit_reason '0x%x' for '%s', got '0x%x'", + insn->exit_reason[f], insn->name, exit_reason); + + __GUEST_ASSERT(exit_insn_len == insn_len, + "Wanted insn_len '%u' for '%s', got '%u'", + insn_len, insn->name, exit_insn_len); + } + + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t nested_test_data_gva; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(is_forced_emulation_enabled); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul); + + if (kvm_cpu_has(X86_FEATURE_SVM)) + vcpu_alloc_svm(vm, &nested_test_data_gva); + else + vcpu_alloc_vmx(vm, &nested_test_data_gva); + + vcpu_args_set(vcpu, 1, nested_test_data_gva); + + vcpu_run(vcpu); + TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c index e7efb2b35f8b..c0d84827f736 100644 --- a/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86/nx_huge_pages_test.c @@ -73,7 +73,7 @@ static void check_2m_page_count(struct kvm_vm *vm, int expected_pages_2m) { int actual_pages_2m; - actual_pages_2m = vm_get_stat(vm, "pages_2m"); + actual_pages_2m = vm_get_stat(vm, pages_2m); TEST_ASSERT(actual_pages_2m == expected_pages_2m, "Unexpected 2m page count. Expected %d, got %d", @@ -84,7 +84,7 @@ static void check_split_count(struct kvm_vm *vm, int expected_splits) { int actual_splits; - actual_splits = vm_get_stat(vm, "nx_lpage_splits"); + actual_splits = vm_get_stat(vm, nx_lpage_splits); TEST_ASSERT(actual_splits == expected_splits, "Unexpected NX huge page split count. Expected %d, got %d", diff --git a/tools/testing/selftests/kvm/x86/pmu_counters_test.c b/tools/testing/selftests/kvm/x86/pmu_counters_test.c index 698cb36989db..8aaaf25b6111 100644 --- a/tools/testing/selftests/kvm/x86/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86/pmu_counters_test.c @@ -17,7 +17,7 @@ * Number of instructions in each loop. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, * 1 LOOP. */ -#define NUM_INSNS_PER_LOOP 3 +#define NUM_INSNS_PER_LOOP 4 /* * Number of "extra" instructions that will be counted, i.e. the number of @@ -29,10 +29,59 @@ /* Total number of instructions retired within the measured section. */ #define NUM_INSNS_RETIRED (NUM_LOOPS * NUM_INSNS_PER_LOOP + NUM_EXTRA_INSNS) +/* Track which architectural events are supported by hardware. */ +static uint32_t hardware_pmu_arch_events; static uint8_t kvm_pmu_version; static bool kvm_has_perf_caps; +#define X86_PMU_FEATURE_NULL \ +({ \ + struct kvm_x86_pmu_feature feature = {}; \ + \ + feature; \ +}) + +static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event) +{ + return !(*(u64 *)&event); +} + +struct kvm_intel_pmu_event { + struct kvm_x86_pmu_feature gp_event; + struct kvm_x86_pmu_feature fixed_event; +}; + +/* + * Wrap the array to appease the compiler, as the macros used to construct each + * kvm_x86_pmu_feature use syntax that's only valid in function scope, and the + * compiler often thinks the feature definitions aren't compile-time constants. + */ +static struct kvm_intel_pmu_event intel_event_to_feature(uint8_t idx) +{ + const struct kvm_intel_pmu_event __intel_event_to_feature[] = { + [INTEL_ARCH_CPU_CYCLES_INDEX] = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED }, + [INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX] = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED }, + /* + * Note, the fixed counter for reference cycles is NOT the same as the + * general purpose architectural event. The fixed counter explicitly + * counts at the same frequency as the TSC, whereas the GP event counts + * at a fixed, but uarch specific, frequency. Bundle them here for + * simplicity. + */ + [INTEL_ARCH_REFERENCE_CYCLES_INDEX] = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED }, + [INTEL_ARCH_LLC_REFERENCES_INDEX] = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_LLC_MISSES_INDEX] = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_BRANCHES_RETIRED_INDEX] = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL }, + [INTEL_ARCH_TOPDOWN_SLOTS_INDEX] = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED }, + }; + + kvm_static_assert(ARRAY_SIZE(__intel_event_to_feature) == NR_INTEL_ARCH_EVENTS); + + return __intel_event_to_feature[idx]; +} + static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code, uint8_t pmu_version, @@ -42,6 +91,7 @@ static struct kvm_vm *pmu_vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, vm = vm_create_with_one_vcpu(vcpu, guest_code); sync_global_to_guest(vm, kvm_pmu_version); + sync_global_to_guest(vm, hardware_pmu_arch_events); /* * Set PERF_CAPABILITIES before PMU version as KVM disallows enabling @@ -98,14 +148,12 @@ static uint8_t guest_get_pmu_version(void) * Sanity check that in all cases, the event doesn't count when it's disabled, * and that KVM correctly emulates the write of an arbitrary value. */ -static void guest_assert_event_count(uint8_t idx, - struct kvm_x86_pmu_feature event, - uint32_t pmc, uint32_t pmc_msr) +static void guest_assert_event_count(uint8_t idx, uint32_t pmc, uint32_t pmc_msr) { uint64_t count; count = _rdpmc(pmc); - if (!this_pmu_has(event)) + if (!(hardware_pmu_arch_events & BIT(idx))) goto sanity_checks; switch (idx) { @@ -126,7 +174,9 @@ static void guest_assert_event_count(uint8_t idx, GUEST_ASSERT_NE(count, 0); break; case INTEL_ARCH_TOPDOWN_SLOTS_INDEX: - GUEST_ASSERT(count >= NUM_INSNS_RETIRED); + __GUEST_ASSERT(count >= NUM_INSNS_RETIRED, + "Expected top-down slots >= %u, got count = %lu", + NUM_INSNS_RETIRED, count); break; default: break; @@ -162,75 +212,42 @@ do { \ "1:\n\t" \ clflush "\n\t" \ "mfence\n\t" \ + "mov %[m], %%eax\n\t" \ FEP "loop 1b\n\t" \ FEP "mov %%edi, %%ecx\n\t" \ FEP "xor %%eax, %%eax\n\t" \ FEP "xor %%edx, %%edx\n\t" \ "wrmsr\n\t" \ :: "a"((uint32_t)_value), "d"(_value >> 32), \ - "c"(_msr), "D"(_msr) \ + "c"(_msr), "D"(_msr), [m]"m"(kvm_pmu_version) \ ); \ } while (0) -#define GUEST_TEST_EVENT(_idx, _event, _pmc, _pmc_msr, _ctrl_msr, _value, FEP) \ +#define GUEST_TEST_EVENT(_idx, _pmc, _pmc_msr, _ctrl_msr, _value, FEP) \ do { \ - wrmsr(pmc_msr, 0); \ + wrmsr(_pmc_msr, 0); \ \ if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) \ - GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt .", FEP); \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflushopt %[m]", FEP); \ else if (this_cpu_has(X86_FEATURE_CLFLUSH)) \ - GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush .", FEP); \ + GUEST_MEASURE_EVENT(_ctrl_msr, _value, "clflush %[m]", FEP); \ else \ GUEST_MEASURE_EVENT(_ctrl_msr, _value, "nop", FEP); \ \ - guest_assert_event_count(_idx, _event, _pmc, _pmc_msr); \ + guest_assert_event_count(_idx, _pmc, _pmc_msr); \ } while (0) -static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event, - uint32_t pmc, uint32_t pmc_msr, +static void __guest_test_arch_event(uint8_t idx, uint32_t pmc, uint32_t pmc_msr, uint32_t ctrl_msr, uint64_t ctrl_msr_value) { - GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, ""); + GUEST_TEST_EVENT(idx, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, ""); if (is_forced_emulation_enabled) - GUEST_TEST_EVENT(idx, event, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP); -} - -#define X86_PMU_FEATURE_NULL \ -({ \ - struct kvm_x86_pmu_feature feature = {}; \ - \ - feature; \ -}) - -static bool pmu_is_null_feature(struct kvm_x86_pmu_feature event) -{ - return !(*(u64 *)&event); + GUEST_TEST_EVENT(idx, pmc, pmc_msr, ctrl_msr, ctrl_msr_value, KVM_FEP); } static void guest_test_arch_event(uint8_t idx) { - const struct { - struct kvm_x86_pmu_feature gp_event; - struct kvm_x86_pmu_feature fixed_event; - } intel_event_to_feature[] = { - [INTEL_ARCH_CPU_CYCLES_INDEX] = { X86_PMU_FEATURE_CPU_CYCLES, X86_PMU_FEATURE_CPU_CYCLES_FIXED }, - [INTEL_ARCH_INSTRUCTIONS_RETIRED_INDEX] = { X86_PMU_FEATURE_INSNS_RETIRED, X86_PMU_FEATURE_INSNS_RETIRED_FIXED }, - /* - * Note, the fixed counter for reference cycles is NOT the same - * as the general purpose architectural event. The fixed counter - * explicitly counts at the same frequency as the TSC, whereas - * the GP event counts at a fixed, but uarch specific, frequency. - * Bundle them here for simplicity. - */ - [INTEL_ARCH_REFERENCE_CYCLES_INDEX] = { X86_PMU_FEATURE_REFERENCE_CYCLES, X86_PMU_FEATURE_REFERENCE_TSC_CYCLES_FIXED }, - [INTEL_ARCH_LLC_REFERENCES_INDEX] = { X86_PMU_FEATURE_LLC_REFERENCES, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_LLC_MISSES_INDEX] = { X86_PMU_FEATURE_LLC_MISSES, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_BRANCHES_RETIRED_INDEX] = { X86_PMU_FEATURE_BRANCH_INSNS_RETIRED, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_BRANCHES_MISPREDICTED_INDEX] = { X86_PMU_FEATURE_BRANCHES_MISPREDICTED, X86_PMU_FEATURE_NULL }, - [INTEL_ARCH_TOPDOWN_SLOTS_INDEX] = { X86_PMU_FEATURE_TOPDOWN_SLOTS, X86_PMU_FEATURE_TOPDOWN_SLOTS_FIXED }, - }; - uint32_t nr_gp_counters = this_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); uint32_t pmu_version = guest_get_pmu_version(); /* PERF_GLOBAL_CTRL exists only for Architectural PMU Version 2+. */ @@ -248,7 +265,7 @@ static void guest_test_arch_event(uint8_t idx) else base_pmc_msr = MSR_IA32_PERFCTR0; - gp_event = intel_event_to_feature[idx].gp_event; + gp_event = intel_event_to_feature(idx).gp_event; GUEST_ASSERT_EQ(idx, gp_event.f.bit); GUEST_ASSERT(nr_gp_counters); @@ -262,14 +279,14 @@ static void guest_test_arch_event(uint8_t idx) if (guest_has_perf_global_ctrl) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, BIT_ULL(i)); - __guest_test_arch_event(idx, gp_event, i, base_pmc_msr + i, + __guest_test_arch_event(idx, i, base_pmc_msr + i, MSR_P6_EVNTSEL0 + i, eventsel); } if (!guest_has_perf_global_ctrl) return; - fixed_event = intel_event_to_feature[idx].fixed_event; + fixed_event = intel_event_to_feature(idx).fixed_event; if (pmu_is_null_feature(fixed_event) || !this_pmu_has(fixed_event)) return; @@ -277,7 +294,7 @@ static void guest_test_arch_event(uint8_t idx) wrmsr(MSR_CORE_PERF_FIXED_CTR_CTRL, FIXED_PMC_CTRL(i, FIXED_PMC_KERNEL)); - __guest_test_arch_event(idx, fixed_event, i | INTEL_RDPMC_FIXED, + __guest_test_arch_event(idx, i | INTEL_RDPMC_FIXED, MSR_CORE_PERF_FIXED_CTR0 + i, MSR_CORE_PERF_GLOBAL_CTRL, FIXED_PMC_GLOBAL_CTRL_ENABLE(i)); @@ -331,9 +348,9 @@ __GUEST_ASSERT(expect_gp ? vector == GP_VECTOR : !vector, \ expect_gp ? "#GP" : "no fault", msr, vector) \ #define GUEST_ASSERT_PMC_VALUE(insn, msr, val, expected) \ - __GUEST_ASSERT(val == expected_val, \ + __GUEST_ASSERT(val == expected, \ "Expected " #insn "(0x%x) to yield 0x%lx, got 0x%lx", \ - msr, expected_val, val); + msr, expected, val); static void guest_test_rdpmc(uint32_t rdpmc_idx, bool expect_success, uint64_t expected_val) @@ -545,7 +562,6 @@ static void test_fixed_counters(uint8_t pmu_version, uint64_t perf_capabilities, static void test_intel_counters(void) { - uint8_t nr_arch_events = kvm_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH); uint8_t nr_fixed_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_FIXED_COUNTERS); uint8_t nr_gp_counters = kvm_cpu_property(X86_PROPERTY_PMU_NR_GP_COUNTERS); uint8_t pmu_version = kvm_cpu_property(X86_PROPERTY_PMU_VERSION); @@ -567,18 +583,26 @@ static void test_intel_counters(void) /* * Detect the existence of events that aren't supported by selftests. - * This will (obviously) fail any time the kernel adds support for a - * new event, but it's worth paying that price to keep the test fresh. + * This will (obviously) fail any time hardware adds support for a new + * event, but it's worth paying that price to keep the test fresh. */ - TEST_ASSERT(nr_arch_events <= NR_INTEL_ARCH_EVENTS, + TEST_ASSERT(this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH) <= NR_INTEL_ARCH_EVENTS, "New architectural event(s) detected; please update this test (length = %u, mask = %x)", - nr_arch_events, kvm_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK)); + this_cpu_property(X86_PROPERTY_PMU_EBX_BIT_VECTOR_LENGTH), + this_cpu_property(X86_PROPERTY_PMU_EVENTS_MASK)); /* - * Force iterating over known arch events regardless of whether or not - * KVM/hardware supports a given event. + * Iterate over known arch events irrespective of KVM/hardware support + * to verify that KVM doesn't reject programming of events just because + * the *architectural* encoding is unsupported. Track which events are + * supported in hardware; the guest side will validate supported events + * count correctly, even if *enumeration* of the event is unsupported + * by KVM and/or isn't exposed to the guest. */ - nr_arch_events = max_t(typeof(nr_arch_events), nr_arch_events, NR_INTEL_ARCH_EVENTS); + for (i = 0; i < NR_INTEL_ARCH_EVENTS; i++) { + if (this_pmu_has(intel_event_to_feature(i).gp_event)) + hardware_pmu_arch_events |= BIT(i); + } for (v = 0; v <= max_pmu_version; v++) { for (i = 0; i < ARRAY_SIZE(perf_caps); i++) { @@ -594,8 +618,8 @@ static void test_intel_counters(void) * vector length. */ if (v == pmu_version) { - for (k = 1; k < (BIT(nr_arch_events) - 1); k++) - test_arch_events(v, perf_caps[i], nr_arch_events, k); + for (k = 1; k < (BIT(NR_INTEL_ARCH_EVENTS) - 1); k++) + test_arch_events(v, perf_caps[i], NR_INTEL_ARCH_EVENTS, k); } /* * Test single bits for all PMU version and lengths up @@ -604,11 +628,11 @@ static void test_intel_counters(void) * host length). Explicitly test a mask of '0' and all * ones i.e. all events being available and unavailable. */ - for (j = 0; j <= nr_arch_events + 1; j++) { + for (j = 0; j <= NR_INTEL_ARCH_EVENTS + 1; j++) { test_arch_events(v, perf_caps[i], j, 0); test_arch_events(v, perf_caps[i], j, 0xff); - for (k = 0; k < nr_arch_events; k++) + for (k = 0; k < NR_INTEL_ARCH_EVENTS; k++) test_arch_events(v, perf_caps[i], j, BIT(k)); } diff --git a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c index 916e04248fbb..917b6066cfc1 100644 --- a/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c +++ b/tools/testing/selftests/kvm/x86/svm_int_ctl_test.c @@ -42,10 +42,7 @@ static void l2_guest_code(struct svm_test_data *svm) x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | INTR_IRQ_NUMBER); - __asm__ __volatile__( - "sti\n" - "nop\n" - ); + sti_nop(); GUEST_ASSERT(vintr_irq_called); GUEST_ASSERT(intr_irq_called); diff --git a/tools/testing/selftests/kvm/x86/ucna_injection_test.c b/tools/testing/selftests/kvm/x86/ucna_injection_test.c index 57f157c06b39..1e5e564523b3 100644 --- a/tools/testing/selftests/kvm/x86/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86/ucna_injection_test.c @@ -86,7 +86,7 @@ static void ucna_injection_guest_code(void) wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); /* Enables interrupt in guest. */ - asm volatile("sti"); + sti(); /* Let user space inject the first UCNA */ GUEST_SYNC(SYNC_FIRST_UCNA); diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c index a76078a08ff8..35cb9de54a82 100644 --- a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -106,7 +106,8 @@ static void halter_guest_code(struct test_data_page *data) data->halter_tpr = xapic_read_reg(APIC_TASKPRI); data->halter_ppr = xapic_read_reg(APIC_PROCPRI); data->hlt_count++; - asm volatile("sti; hlt; cli"); + safe_halt(); + cli(); data->wake_count++; } } @@ -465,6 +466,19 @@ int main(int argc, char *argv[]) cancel_join_vcpu_thread(threads[0], params[0].vcpu); cancel_join_vcpu_thread(threads[1], params[1].vcpu); + /* + * If the host support Idle HLT, i.e. KVM *might* be using Idle HLT, + * then the number of HLT exits may be less than the number of HLTs + * that were executed, as Idle HLT elides the exit if the vCPU has an + * unmasked, pending IRQ (or NMI). + */ + if (this_cpu_has(X86_FEATURE_IDLE_HLT)) + TEST_ASSERT(data->hlt_count >= vcpu_get_stat(params[0].vcpu, halt_exits), + "HLT insns = %lu, HLT exits = %lu", + data->hlt_count, vcpu_get_stat(params[0].vcpu, halt_exits)); + else + TEST_ASSERT_EQ(data->hlt_count, vcpu_get_stat(params[0].vcpu, halt_exits)); + fprintf(stderr, "Test successful after running for %d seconds.\n" "Sending vCPU sent %lu IPIs to halting vCPU\n" diff --git a/tools/testing/selftests/kvm/x86/xapic_state_test.c b/tools/testing/selftests/kvm/x86/xapic_state_test.c index 88bcca188799..fdebff1165c7 100644 --- a/tools/testing/selftests/kvm/x86/xapic_state_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_state_test.c @@ -18,7 +18,7 @@ struct xapic_vcpu { static void xapic_guest_code(void) { - asm volatile("cli"); + cli(); xapic_enable(); @@ -38,7 +38,7 @@ static void xapic_guest_code(void) static void x2apic_guest_code(void) { - asm volatile("cli"); + cli(); x2apic_enable(); diff --git a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c index a59b3c799bb2..287829f850f7 100644 --- a/tools/testing/selftests/kvm/x86/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86/xen_shinfo_test.c @@ -191,10 +191,7 @@ static void guest_code(void) struct vcpu_runstate_info *rs = (void *)RUNSTATE_VADDR; int i; - __asm__ __volatile__( - "sti\n" - "nop\n" - ); + sti_nop(); /* Trigger an interrupt injection */ GUEST_SYNC(TEST_INJECT_VECTOR); diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index d6edcfcb5be8..530390033929 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -228,4 +228,7 @@ $(OUTPUT)/%:%.S $(LINK.S) $^ $(LDLIBS) -o $@ endif -.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir +headers: + $(Q)$(MAKE) -C $(top_srcdir) headers + +.PHONY: run_tests all clean install emit_tests gen_mods_dir clean_mods_dir headers diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile index c52fe3ad8e98..f876bf4744e1 100644 --- a/tools/testing/selftests/lib/Makefile +++ b/tools/testing/selftests/lib/Makefile @@ -4,5 +4,5 @@ # No binaries, but make sure arg-less "make" doesn't trigger "run_tests" all: -TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh scanf.sh +TEST_PROGS := bitmap.sh include ../lib.mk diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config index dc15aba8d0a3..81a1f64a22e8 100644 --- a/tools/testing/selftests/lib/config +++ b/tools/testing/selftests/lib/config @@ -1,5 +1,2 @@ -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m -CONFIG_PRIME_NUMBERS=m CONFIG_TEST_BITOPS=m diff --git a/tools/testing/selftests/lib/prime_numbers.sh b/tools/testing/selftests/lib/prime_numbers.sh deleted file mode 100755 index 370b79a9cb2e..000000000000 --- a/tools/testing/selftests/lib/prime_numbers.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Checks fast/slow prime_number generation for inconsistencies -$(dirname $0)/../kselftest/module.sh "prime numbers" prime_numbers selftest=65536 diff --git a/tools/testing/selftests/lib/printf.sh b/tools/testing/selftests/lib/printf.sh deleted file mode 100755 index 05f4544e87f9..000000000000 --- a/tools/testing/selftests/lib/printf.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Tests the printf infrastructure using test_printf kernel module. -$(dirname $0)/../kselftest/module.sh "printf" test_printf diff --git a/tools/testing/selftests/lib/scanf.sh b/tools/testing/selftests/lib/scanf.sh deleted file mode 100755 index b59b8ba561c3..000000000000 --- a/tools/testing/selftests/lib/scanf.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Tests the scanf infrastructure using test_scanf kernel module. -$(dirname $0)/../kselftest/module.sh "scanf" test_scanf diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c index ece37212a8a2..525c50d3ec23 100644 --- a/tools/testing/selftests/mm/guard-pages.c +++ b/tools/testing/selftests/mm/guard-pages.c @@ -19,6 +19,8 @@ #include <sys/uio.h> #include <unistd.h> +#include "../pidfd/pidfd.h" + /* * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1: * @@ -50,11 +52,6 @@ static void handle_fatal(int c) siglongjmp(signal_jmp_buf, c); } -static int pidfd_open(pid_t pid, unsigned int flags) -{ - return syscall(SYS_pidfd_open, pid, flags); -} - static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, size_t n, int advice, unsigned int flags) { @@ -370,14 +367,10 @@ TEST_F(guard_pages, multi_vma) TEST_F(guard_pages, process_madvise) { const unsigned long page_size = self->page_size; - pid_t pid = getpid(); - int pidfd = pidfd_open(pid, 0); char *ptr_region, *ptr1, *ptr2, *ptr3; ssize_t count; struct iovec vec[6]; - ASSERT_NE(pidfd, -1); - /* Reserve region to map over. */ ptr_region = mmap(NULL, 100 * page_size, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0); @@ -425,7 +418,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); /* Now guard in one step. */ - count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_INSTALL, 0); /* OK we don't have permission to do this, skip. */ if (count == -1 && errno == EPERM) @@ -446,7 +439,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); /* Now do the same with unguard... */ - count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + count = sys_process_madvise(PIDFD_SELF, vec, 6, MADV_GUARD_REMOVE, 0); /* ...and everything should now succeed. */ @@ -463,7 +456,6 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(ptr1, 10 * page_size), 0); ASSERT_EQ(munmap(ptr2, 5 * page_size), 0); ASSERT_EQ(munmap(ptr3, 20 * page_size), 0); - close(pidfd); } /* Assert that unmapping ranges does not leave guard markers behind. */ diff --git a/tools/testing/selftests/mm/mseal_test.c b/tools/testing/selftests/mm/mseal_test.c index ad17005521a8..005f29c86484 100644 --- a/tools/testing/selftests/mm/mseal_test.c +++ b/tools/testing/selftests/mm/mseal_test.c @@ -218,7 +218,7 @@ bool seal_support(void) bool pkey_supported(void) { #if defined(__i386__) || defined(__x86_64__) /* arch */ - int pkey = sys_pkey_alloc(0, 0); + int pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); if (pkey > 0) return true; @@ -1671,7 +1671,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) setup_single_address_rw(size, &ptr); FAIL_TEST_IF_FALSE(ptr != (void *)-1); - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_TEST_IF_FALSE(pkey > 0); ret = sys_mprotect_pkey((void *)ptr, size, PROT_READ | PROT_WRITE, pkey); @@ -1683,7 +1683,7 @@ static void test_seal_discard_ro_anon_on_pkey(bool seal) } /* sealing doesn't take effect if PKRU allow write. */ - set_pkey(pkey, 0); + set_pkey(pkey, PKEY_UNRESTRICTED); ret = sys_madvise(ptr, size, MADV_DONTNEED); FAIL_TEST_IF_FALSE(!ret); diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index f080e97b39be..ea404f80e6cb 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -13,6 +13,7 @@ #include <ucontext.h> #include <sys/mman.h> +#include <linux/mman.h> #include <linux/types.h> #include "../kselftest.h" @@ -193,7 +194,7 @@ static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) static inline int kernel_has_pkeys(void) { /* try allocating a key and see if it succeeds */ - int ret = sys_pkey_alloc(0, 0); + int ret = sys_pkey_alloc(0, PKEY_UNRESTRICTED); if (ret <= 0) { return 0; } diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index 1ac8c8809880..b5e076a564c9 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -311,7 +311,7 @@ static void test_sigsegv_handler_with_different_pkey_for_stack(void) __write_pkey_reg(pkey_reg); /* Protect the new stack with MPK 1 */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ @@ -484,7 +484,7 @@ static void test_pkru_sigreturn(void) __write_pkey_reg(pkey_reg); /* Protect the stack with MPK 2 */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); sys_mprotect_pkey(stack, STACK_SIZE, PROT_READ | PROT_WRITE, pkey); /* Set up alternate signal stack that will use the default MPK */ diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 35565af308af..23ebec367015 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -463,7 +463,7 @@ static pid_t fork_lazy_child(void) static int alloc_pkey(void) { int ret; - unsigned long init_val = 0x0; + unsigned long init_val = PKEY_UNRESTRICTED; dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); diff --git a/tools/testing/selftests/mount_setattr/mount_setattr_test.c b/tools/testing/selftests/mount_setattr/mount_setattr_test.c index 70f65eb320a7..48a000cabc97 100644 --- a/tools/testing/selftests/mount_setattr/mount_setattr_test.c +++ b/tools/testing/selftests/mount_setattr/mount_setattr_test.c @@ -20,6 +20,7 @@ #include <stdarg.h> #include <linux/mount.h> +#include "../filesystems/overlayfs/wrappers.h" #include "../kselftest_harness.h" #ifndef CLONE_NEWNS @@ -126,6 +127,26 @@ #endif #endif +#ifndef __NR_move_mount + #if defined __alpha__ + #define __NR_move_mount 539 + #elif defined _MIPS_SIM + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ + #define __NR_move_mount 4429 + #endif + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ + #define __NR_move_mount 6429 + #endif + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ + #define __NR_move_mount 5429 + #endif + #elif defined __ia64__ + #define __NR_move_mount (428 + 1024) + #else + #define __NR_move_mount 429 + #endif +#endif + #ifndef MOUNT_ATTR_IDMAP #define MOUNT_ATTR_IDMAP 0x00100000 #endif @@ -397,6 +418,10 @@ FIXTURE_SETUP(mount_setattr) ASSERT_EQ(mkdir("/tmp/B/BB", 0777), 0); + ASSERT_EQ(mkdir("/tmp/target1", 0777), 0); + + ASSERT_EQ(mkdir("/tmp/target2", 0777), 0); + ASSERT_EQ(mount("testing", "/tmp/B/BB", "tmpfs", MS_NOATIME | MS_NODEV, "size=100000,mode=700"), 0); @@ -1506,4 +1531,631 @@ TEST_F(mount_setattr, mount_attr_nosymfollow) ASSERT_EQ(close(fd), 0); } +TEST_F(mount_setattr, open_tree_detached) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /AA testing tmpfs + * `-/AA/B testing tmpfs + * `-/AA/B/BB testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_subdir, "B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(move_mount(fd_tree_subdir, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + /* + * /tmp/target1 testing tmpfs + * `-/tmp/target1/B testing tmpfs + * `-/tmp/target1/B/BB testing tmpfs + */ + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target1/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target1/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target2", MOVE_MOUNT_F_EMPTY_PATH), 0); + /* + * /tmp/target2 testing tmpfs + * |-/tmp/target2/A testing tmpfs + * | `-/tmp/target2/A/AA testing tmpfs + * | `-/tmp/target2/A/AA/B testing tmpfs + * | `-/tmp/target2/A/AA/B/BB testing tmpfs + * `-/tmp/target2/B testing ramfs + */ + ASSERT_EQ(statx(-EBADF, "/tmp/target2", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(-EBADF, "/tmp/target2/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, open_tree_detached_fail) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + + /* + * The origin mount namespace of the anonymous mount namespace + * of @fd_tree_base doesn't match the caller's mount namespace + * anymore so creation of another detached mounts must fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EINVAL); +} + +TEST_F(mount_setattr, open_tree_detached_fail2) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(create_and_enter_userns(), 0); + + /* + * The caller entered a new user namespace. They will have + * CAP_SYS_ADMIN in this user namespace. However, they're still + * located in a mount namespace that is owned by an ancestor + * user namespace in which they hold no privilege. Creating a + * detached mount must thus fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EPERM); +} + +TEST_F(mount_setattr, open_tree_detached_fail3) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_EQ(statx(fd_tree_base, "A/AA/B/BB", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + EXPECT_EQ(prepare_unpriv_mountns(), 0); + + /* + * The caller entered a new mount namespace. They will have + * CAP_SYS_ADMIN in the owning user namespace of their mount + * namespace. + * + * However, the origin mount namespace of the anonymous mount + * namespace of @fd_tree_base doesn't match the caller's mount + * namespace anymore so creation of another detached mounts must + * fail. + */ + fd_tree_subdir = sys_open_tree(fd_tree_base, "A/AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_LT(fd_tree_subdir, 0); + ASSERT_EQ(errno, EINVAL); +} + +TEST_F(mount_setattr, open_tree_subfolder) +{ + int fd_context, fd_tmpfs, fd_tree; + + fd_context = sys_fsopen("tmpfs", 0); + ASSERT_GE(fd_context, 0); + + ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0); + + fd_tmpfs = sys_fsmount(fd_context, 0, 0); + ASSERT_GE(fd_tmpfs, 0); + + EXPECT_EQ(close(fd_context), 0); + + ASSERT_EQ(mkdirat(fd_tmpfs, "subdir", 0755), 0); + + fd_tree = sys_open_tree(fd_tmpfs, "subdir", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + EXPECT_EQ(close(fd_tmpfs), 0); + + ASSERT_EQ(mkdirat(-EBADF, "/mnt/open_tree_subfolder", 0755), 0); + + ASSERT_EQ(sys_move_mount(fd_tree, "", -EBADF, "/mnt/open_tree_subfolder", MOVE_MOUNT_F_EMPTY_PATH), 0); + + EXPECT_EQ(close(fd_tree), 0); + + ASSERT_EQ(umount2("/mnt/open_tree_subfolder", 0), 0); + + EXPECT_EQ(rmdir("/mnt/open_tree_subfolder"), 0); +} + +TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_then_close) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* + * /mnt testing tmpfs + * `-/mnt testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(statx(fd_tree_subdir, "", AT_EMPTY_PATH, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_NE(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, mount_detached_mount_on_detached_mount_and_attach) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + __u64 mnt_id = 0; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(fd_tree_base, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + /* + * /mnt testing tmpfs + */ + ASSERT_EQ(statx(fd_tree_subdir, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* + * /mnt testing tmpfs + * `-/mnt testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(statx(fd_tree_subdir, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_TRUE(stx.stx_mask & STATX_MNT_ID_UNIQUE); + mnt_id = stx.stx_mnt_id; + + ASSERT_NE(move_mount(fd_tree_subdir, "", fd_tree_base, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, STATX_MNT_ID_UNIQUE, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + ASSERT_TRUE(stx.stx_mask & STATX_MNT_ID_UNIQUE); + ASSERT_EQ(stx.stx_mnt_id, mnt_id); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, move_mount_detached_fail) +{ + int fd_tree_base = -EBADF, fd_tree_subdir = -EBADF; + struct statx stx; + + fd_tree_base = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_base, 0); + + /* Attach the mount to the caller's mount namespace. */ + ASSERT_EQ(move_mount(fd_tree_base, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(statx(fd_tree_base, "A", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + fd_tree_subdir = sys_open_tree(-EBADF, "/tmp/B", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE); + ASSERT_GE(fd_tree_subdir, 0); + ASSERT_EQ(statx(fd_tree_subdir, "BB", 0, 0, &stx), 0); + ASSERT_FALSE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* Not allowed to move an attached mount to a detached mount. */ + ASSERT_NE(move_mount(fd_tree_base, "", fd_tree_subdir, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + ASSERT_EQ(errno, EINVAL); + + EXPECT_EQ(close(fd_tree_base), 0); + EXPECT_EQ(close(fd_tree_subdir), 0); +} + +TEST_F(mount_setattr, attach_detached_mount_then_umount_then_close) +{ + int fd_tree = -EBADF; + struct statx stx; + + fd_tree = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx), 0); + /* We copied with AT_RECURSIVE so /mnt/A must be a mountpoint. */ + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + /* Attach the mount to the caller's mount namespace. */ + ASSERT_EQ(move_mount(fd_tree, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(statx(-EBADF, "/tmp/target1", 0, 0, &stx), 0); + ASSERT_TRUE(stx.stx_attributes & STATX_ATTR_MOUNT_ROOT); + + ASSERT_EQ(umount2("/tmp/target1", MNT_DETACH), 0); + + /* + * This tests whether dissolve_on_fput() handles a NULL mount + * namespace correctly, i.e., that it doesn't splat. + */ + EXPECT_EQ(close(fd_tree), 0); +} + +TEST_F(mount_setattr, mount_detached1_onto_detached2_then_close_detached1_then_mount_detached2_onto_attached) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * `-/mnt/B testing ramfs + */ + fd_tree2 = sys_open_tree(-EBADF, "/mnt/B", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree2, 0); + + /* + * Move the source detached mount tree to the target detached + * mount tree. This will move all the mounts in the source mount + * tree from the source anonymous mount namespace to the target + * anonymous mount namespace. + * + * The source detached mount tree and the target detached mount + * tree now both refer to the same anonymous mount namespace. + * + * |-"" testing ramfs + * `-"" testing tmpfs + * `-""/AA testing tmpfs + * `-""/AA/B testing tmpfs + * `-""/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree1, "", fd_tree2, "", MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); + + /* + * The source detached mount tree @fd_tree1 is now an attached + * mount, i.e., it has a parent. Specifically, it now has the + * root mount of the mount tree of @fd_tree2 as its parent. + * + * That means we are no longer allowed to attach it as we only + * allow attaching the root of an anonymous mount tree, not + * random bits and pieces. Verify that the kernel enforces this. + */ + ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + /* + * Closing the source detached mount tree must not unmount and + * free the shared anonymous mount namespace. The kernel will + * quickly yell at us because the anonymous mount namespace + * won't be empty when it's freed. + */ + EXPECT_EQ(close(fd_tree1), 0); + + /* + * Attach the mount tree to a non-anonymous mount namespace. + * This can only succeed if closing fd_tree1 had proper + * semantics and didn't cause the anonymous mount namespace to + * be freed. If it did this will trigger a UAF which will be + * visible on any KASAN enabled kernel. + * + * |-/tmp/target1 testing ramfs + * `-/tmp/target1 testing tmpfs + * `-/tmp/target1/AA testing tmpfs + * `-/tmp/target1/AA/B testing tmpfs + * `-/tmp/target1/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + EXPECT_EQ(close(fd_tree2), 0); +} + +TEST_F(mount_setattr, two_detached_mounts_referring_to_same_anonymous_mount_namespace) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * Copy the following mount tree: + * + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * Create an O_PATH file descriptors with a separate struct file + * that refers to the same detached mount tree as @fd_tree1 + */ + fd_tree2 = sys_open_tree(fd_tree1, "", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tree2, 0); + + /* + * Copy the following mount tree: + * + * |-/tmp/target1 testing tmpfs + * `-/tmp/target1/AA testing tmpfs + * `-/tmp/target1/AA/B testing tmpfs + * `-/tmp/target1/AA/B/BB testing tmpfs + */ + ASSERT_EQ(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + /* + * This must fail as this would mean adding the same mount tree + * into the same mount tree. + */ + ASSERT_NE(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); +} + +TEST_F(mount_setattr, two_detached_subtrees_of_same_anonymous_mount_namespace) +{ + int fd_tree1 = -EBADF, fd_tree2 = -EBADF; + + /* + * Copy the following mount tree: + * + * |-/mnt/A testing tmpfs + * `-/mnt/A/AA testing tmpfs + * `-/mnt/A/AA/B testing tmpfs + * `-/mnt/A/AA/B/BB testing tmpfs + */ + fd_tree1 = sys_open_tree(-EBADF, "/mnt/A", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree1, 0); + + /* + * Create an O_PATH file descriptors with a separate struct file that + * refers to a subtree of the same detached mount tree as @fd_tree1 + */ + fd_tree2 = sys_open_tree(fd_tree1, "AA", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_EMPTY_PATH | OPEN_TREE_CLOEXEC); + ASSERT_GE(fd_tree2, 0); + + /* + * This must fail as it is only possible to attach the root of a + * detached mount tree. + */ + ASSERT_NE(move_mount(fd_tree2, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); + + ASSERT_EQ(move_mount(fd_tree1, "", -EBADF, "/tmp/target1", MOVE_MOUNT_F_EMPTY_PATH), 0); +} + +TEST_F(mount_setattr, detached_tree_propagation) +{ + int fd_tree = -EBADF; + struct statx stx1, stx2, stx3, stx4; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + ASSERT_EQ(mount(NULL, "/mnt", NULL, MS_REC | MS_SHARED, NULL), 0); + + /* + * Copy the following mount tree: + * + * /mnt testing tmpfs + * |-/mnt/A testing tmpfs + * | `-/mnt/A/AA testing tmpfs + * | `-/mnt/A/AA/B testing tmpfs + * | `-/mnt/A/AA/B/BB testing tmpfs + * `-/mnt/B testing ramfs + */ + fd_tree = sys_open_tree(-EBADF, "/mnt", + AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | + AT_RECURSIVE | OPEN_TREE_CLOEXEC | + OPEN_TREE_CLONE); + ASSERT_GE(fd_tree, 0); + + ASSERT_EQ(statx(-EBADF, "/mnt/A", 0, 0, &stx1), 0); + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx2), 0); + + /* + * Copying the mount namespace like done above doesn't alter the + * mounts in any way so the filesystem mounted on /mnt must be + * identical even though the mounts will differ. Use the device + * information to verify that. Note that tmpfs will have a 0 + * major number so comparing the major number is misleading. + */ + ASSERT_EQ(stx1.stx_dev_minor, stx2.stx_dev_minor); + + /* Mount a tmpfs filesystem over /mnt/A. */ + ASSERT_EQ(mount(NULL, "/mnt/A", "tmpfs", 0, NULL), 0); + + + ASSERT_EQ(statx(-EBADF, "/mnt/A", 0, 0, &stx3), 0); + ASSERT_EQ(statx(fd_tree, "A", 0, 0, &stx4), 0); + + /* + * A new filesystem has been mounted on top of /mnt/A which + * means that the device information will be different for any + * statx() that was taken from /mnt/A before the mount compared + * to one after the mount. + * + * Since we already now that the device information between the + * stx1 and stx2 samples are identical we also now that stx2 and + * stx3 device information will necessarily differ. + */ + ASSERT_NE(stx1.stx_dev_minor, stx3.stx_dev_minor); + + /* + * If mount propagation worked correctly then the tmpfs mount + * that was created after the mount namespace was unshared will + * have propagated onto /mnt/A in the detached mount tree. + * + * Verify that the device information for stx3 and stx4 are + * identical. It is already established that stx3 is different + * from both stx1 and stx2 sampled before the tmpfs mount was + * done so if stx3 and stx4 are identical the proof is done. + */ + ASSERT_EQ(stx3.stx_dev_minor, stx4.stx_dev_minor); + + EXPECT_EQ(close(fd_tree), 0); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 28a715a8ef2b..679542f565a4 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -21,6 +21,7 @@ msg_oob msg_zerocopy netlink-dumps nettest +proc_net_pktgen psock_fanout psock_snd psock_tpacket @@ -42,6 +43,7 @@ socket so_incoming_cpu so_netns_cookie so_txtime +so_rcv_listener stress_reuseport_listen tap tcp_fastopen_backup_key diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 8f32b4f01aee..6d718b478ed8 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -7,7 +7,7 @@ CFLAGS += -I../../../../usr/include/ $(KHDR_INCLUDES) CFLAGS += -I../ TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \ - rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh + rtnetlink.sh xfrm_policy.sh TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh @@ -33,9 +33,12 @@ TEST_PROGS += gro.sh TEST_PROGS += gre_gso.sh TEST_PROGS += cmsg_so_mark.sh TEST_PROGS += cmsg_so_priority.sh -TEST_PROGS += cmsg_time.sh cmsg_ipv6.sh +TEST_PROGS += test_so_rcv.sh +TEST_PROGS += cmsg_time.sh cmsg_ip.sh TEST_PROGS += netns-name.sh +TEST_PROGS += link_netns.py TEST_PROGS += nl_netdev.py +TEST_PROGS += rtnetlink.py TEST_PROGS += srv6_end_dt46_l3vpn_test.sh TEST_PROGS += srv6_end_dt4_l3vpn_test.sh TEST_PROGS += srv6_end_dt6_l3vpn_test.sh @@ -75,6 +78,7 @@ TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls tun tap epoll_busy_ TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen +TEST_GEN_FILES += so_rcv_listener TEST_PROGS += test_vxlan_vnifiltering.sh TEST_GEN_FILES += io_uring_zerocopy_tx TEST_PROGS += io_uring_zerocopy_tx.sh @@ -100,6 +104,7 @@ TEST_PROGS += vlan_bridge_binding.sh TEST_PROGS += bpf_offload.py TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_PROGS += busy_poll_test.sh +TEST_GEN_PROGS += proc_net_pktgen TEST_PROGS += lwt_dst_cache_ref_loop.sh # YNL files, must be before "include ..lib.mk" diff --git a/tools/testing/selftests/net/bpf_offload.py b/tools/testing/selftests/net/bpf_offload.py index fd0d959914e4..b2c271b79240 100755 --- a/tools/testing/selftests/net/bpf_offload.py +++ b/tools/testing/selftests/net/bpf_offload.py @@ -207,9 +207,11 @@ def bpftool_prog_list_wait(expected=0, n_retry=20): raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs)) def bpftool_map_list_wait(expected=0, n_retry=20, ns=""): + nmaps = None for i in range(n_retry): maps = bpftool_map_list(ns=ns) - if len(maps) == expected: + nmaps = len(maps) + if nmaps == expected: return maps time.sleep(0.05) raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps)) @@ -710,6 +712,7 @@ _, base_maps = bpftool("map") base_map_names = [ 'pid_iter.rodata', # created on each bpftool invocation 'libbpf_det_bind', # created on each bpftool invocation + 'libbpf_global', ] # Check netdevsim diff --git a/tools/testing/selftests/net/cmsg_ip.sh b/tools/testing/selftests/net/cmsg_ip.sh new file mode 100755 index 000000000000..b55680e081ad --- /dev/null +++ b/tools/testing/selftests/net/cmsg_ip.sh @@ -0,0 +1,187 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +IP4=172.16.0.1/24 +TGT4=172.16.0.2 +IP6=2001:db8:1::1/64 +TGT6=2001:db8:1::2 +TMPF=$(mktemp --suffix ".pcap") + +cleanup() +{ + rm -f $TMPF + cleanup_ns $NS +} + +trap cleanup EXIT + +tcpdump -h | grep immediate-mode >> /dev/null +if [ $? -ne 0 ]; then + echo "SKIP - tcpdump with --immediate-mode option required" + exit $ksft_skip +fi + +# Namespaces +setup_ns NS +NSEXE="ip netns exec $NS" + +$NSEXE sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null + +# Connectivity +ip -netns $NS link add type dummy +ip -netns $NS link set dev dummy0 up +ip -netns $NS addr add $IP4 dev dummy0 +ip -netns $NS addr add $IP6 dev dummy0 + +# Test +BAD=0 +TOTAL=0 + +check_result() { + ((TOTAL++)) + if [ $1 -ne $2 ]; then + echo " Case $3 returned $1, expected $2" + ((BAD++)) + fi +} + +# IPV6_DONTFRAG +for ovr in setsock cmsg both diff; do + for df in 0 1; do + for p in u U i r; do + [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP + [ $p == "i" ] && prot=ICMP + [ $p == "r" ] && prot=RAW + + [ $ovr == "setsock" ] && m="-F $df" + [ $ovr == "cmsg" ] && m="-f $df" + [ $ovr == "both" ] && m="-F $df -f $df" + [ $ovr == "diff" ] && m="-F $((1 - df)) -f $df" + + $NSEXE ./cmsg_sender -s -S 2000 -6 -p $p $m $TGT6 1234 + check_result $? $df "DONTFRAG $prot $ovr" + done + done +done + +# IP_TOS + IPV6_TCLASS + +test_dscp() { + local -r IPVER=$1 + local -r TGT=$2 + local -r MATCH=$3 + + local -r TOS=0x10 + local -r TOS2=0x20 + local -r ECN=0x3 + + ip $IPVER -netns $NS rule add tos $TOS lookup 300 + ip $IPVER -netns $NS route add table 300 prohibit any + + for ovr in setsock cmsg both diff; do + for p in u U i r; do + [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP + [ $p == "i" ] && prot=ICMP + [ $p == "r" ] && prot=RAW + + [ $ovr == "setsock" ] && m="-C" + [ $ovr == "cmsg" ] && m="-c" + [ $ovr == "both" ] && m="-C $((TOS2)) -c" + [ $ovr == "diff" ] && m="-C $((TOS )) -c" + + $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & + BG=$! + sleep 0.05 + + $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS2)) $TGT 1234 + check_result $? 0 "$MATCH $prot $ovr - pass" + + while [ -d /proc/$BG ]; do + $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS2)) $TGT 1234 + done + + tcpdump -r $TMPF -v 2>&1 | grep "$MATCH $TOS2" >> /dev/null + check_result $? 0 "$MATCH $prot $ovr - packet data" + rm $TMPF + + [ $ovr == "both" ] && m="-C $((TOS )) -c" + [ $ovr == "diff" ] && m="-C $((TOS2)) -c" + + # Match prohibit rule: expect failure + $NSEXE ./cmsg_sender $IPVER -p $p $m $((TOS)) -s $TGT 1234 + check_result $? 1 "$MATCH $prot $ovr - rejection" + + # Match prohibit rule: IPv4 masks ECN: expect failure + if [[ "$IPVER" == "-4" ]]; then + $NSEXE ./cmsg_sender $IPVER -p $p $m "$((TOS | ECN))" -s $TGT 1234 + check_result $? 1 "$MATCH $prot $ovr - rejection (ECN)" + fi + done + done +} + +test_dscp -4 $TGT4 tos +test_dscp -6 $TGT6 class + +# IP_TTL + IPV6_HOPLIMIT +test_ttl_hoplimit() { + local -r IPVER=$1 + local -r TGT=$2 + local -r MATCH=$3 + + local -r LIM=4 + + for ovr in setsock cmsg both diff; do + for p in u U i r; do + [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP + [ $p == "i" ] && prot=ICMP + [ $p == "r" ] && prot=RAW + + [ $ovr == "setsock" ] && m="-L" + [ $ovr == "cmsg" ] && m="-l" + [ $ovr == "both" ] && m="-L $LIM -l" + [ $ovr == "diff" ] && m="-L $((LIM + 1)) -l" + + $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & + BG=$! + sleep 0.05 + + $NSEXE ./cmsg_sender $IPVER -p $p $m $LIM $TGT 1234 + check_result $? 0 "$MATCH $prot $ovr - pass" + + while [ -d /proc/$BG ]; do + $NSEXE ./cmsg_sender $IPVER -p $p $m $LIM $TGT 1234 + done + + tcpdump -r $TMPF -v 2>&1 | grep "$MATCH $LIM[^0-9]" >> /dev/null + check_result $? 0 "$MATCH $prot $ovr - packet data" + rm $TMPF + done + done +} + +test_ttl_hoplimit -4 $TGT4 ttl +test_ttl_hoplimit -6 $TGT6 hlim + +# IPV6 exthdr +for p in u U i r; do + # Very basic "does it crash" test + for h in h d r; do + $NSEXE ./cmsg_sender -p $p -6 -H $h $TGT6 1234 + check_result $? 0 "ExtHdr $prot $ovr - pass" + done +done + +# Summary +if [ $BAD -ne 0 ]; then + echo "FAIL - $BAD/$TOTAL cases failed" + exit 1 +else + echo "OK" + exit 0 +fi diff --git a/tools/testing/selftests/net/cmsg_ipv6.sh b/tools/testing/selftests/net/cmsg_ipv6.sh deleted file mode 100755 index 8bc23fb4c82b..000000000000 --- a/tools/testing/selftests/net/cmsg_ipv6.sh +++ /dev/null @@ -1,154 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -source lib.sh - -IP6=2001:db8:1::1/64 -TGT6=2001:db8:1::2 -TMPF=$(mktemp --suffix ".pcap") - -cleanup() -{ - rm -f $TMPF - cleanup_ns $NS -} - -trap cleanup EXIT - -tcpdump -h | grep immediate-mode >> /dev/null -if [ $? -ne 0 ]; then - echo "SKIP - tcpdump with --immediate-mode option required" - exit $ksft_skip -fi - -# Namespaces -setup_ns NS -NSEXE="ip netns exec $NS" - -$NSEXE sysctl -w net.ipv4.ping_group_range='0 2147483647' > /dev/null - -# Connectivity -ip -netns $NS link add type dummy -ip -netns $NS link set dev dummy0 up -ip -netns $NS addr add $IP6 dev dummy0 - -# Test -BAD=0 -TOTAL=0 - -check_result() { - ((TOTAL++)) - if [ $1 -ne $2 ]; then - echo " Case $3 returned $1, expected $2" - ((BAD++)) - fi -} - -# IPV6_DONTFRAG -for ovr in setsock cmsg both diff; do - for df in 0 1; do - for p in u i r; do - [ $p == "u" ] && prot=UDP - [ $p == "i" ] && prot=ICMP - [ $p == "r" ] && prot=RAW - - [ $ovr == "setsock" ] && m="-F $df" - [ $ovr == "cmsg" ] && m="-f $df" - [ $ovr == "both" ] && m="-F $df -f $df" - [ $ovr == "diff" ] && m="-F $((1 - df)) -f $df" - - $NSEXE ./cmsg_sender -s -S 2000 -6 -p $p $m $TGT6 1234 - check_result $? $df "DONTFRAG $prot $ovr" - done - done -done - -# IPV6_TCLASS -TOS=0x10 -TOS2=0x20 - -ip -6 -netns $NS rule add tos $TOS lookup 300 -ip -6 -netns $NS route add table 300 prohibit any - -for ovr in setsock cmsg both diff; do - for p in u i r; do - [ $p == "u" ] && prot=UDP - [ $p == "i" ] && prot=ICMP - [ $p == "r" ] && prot=RAW - - [ $ovr == "setsock" ] && m="-C" - [ $ovr == "cmsg" ] && m="-c" - [ $ovr == "both" ] && m="-C $((TOS2)) -c" - [ $ovr == "diff" ] && m="-C $((TOS )) -c" - - $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & - BG=$! - sleep 0.05 - - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 - check_result $? 0 "TCLASS $prot $ovr - pass" - - while [ -d /proc/$BG ]; do - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS2)) $TGT6 1234 - done - - tcpdump -r $TMPF -v 2>&1 | grep "class $TOS2" >> /dev/null - check_result $? 0 "TCLASS $prot $ovr - packet data" - rm $TMPF - - [ $ovr == "both" ] && m="-C $((TOS )) -c" - [ $ovr == "diff" ] && m="-C $((TOS2)) -c" - - $NSEXE ./cmsg_sender -6 -p $p $m $((TOS)) -s $TGT6 1234 - check_result $? 1 "TCLASS $prot $ovr - rejection" - done -done - -# IPV6_HOPLIMIT -LIM=4 - -for ovr in setsock cmsg both diff; do - for p in u i r; do - [ $p == "u" ] && prot=UDP - [ $p == "i" ] && prot=ICMP - [ $p == "r" ] && prot=RAW - - [ $ovr == "setsock" ] && m="-L" - [ $ovr == "cmsg" ] && m="-l" - [ $ovr == "both" ] && m="-L $LIM -l" - [ $ovr == "diff" ] && m="-L $((LIM + 1)) -l" - - $NSEXE nohup tcpdump --immediate-mode -p -ni dummy0 -w $TMPF -c 4 2> /dev/null & - BG=$! - sleep 0.05 - - $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 - check_result $? 0 "HOPLIMIT $prot $ovr - pass" - - while [ -d /proc/$BG ]; do - $NSEXE ./cmsg_sender -6 -p $p $m $LIM $TGT6 1234 - done - - tcpdump -r $TMPF -v 2>&1 | grep "hlim $LIM[^0-9]" >> /dev/null - check_result $? 0 "HOPLIMIT $prot $ovr - packet data" - rm $TMPF - done -done - -# IPV6 exthdr -for p in u i r; do - # Very basic "does it crash" test - for h in h d r; do - $NSEXE ./cmsg_sender -p $p -6 -H $h $TGT6 1234 - check_result $? 0 "ExtHdr $prot $ovr - pass" - done -done - -# Summary -if [ $BAD -ne 0 ]; then - echo "FAIL - $BAD/$TOTAL cases failed" - exit 1 -else - echo "OK" - exit 0 -fi diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index bc314382e4e1..a825e628aee7 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -33,6 +33,7 @@ enum { ERN_RECVERR, ERN_CMSG_RD, ERN_CMSG_RCV, + ERN_SEND_MORE, }; struct option_cmsg_u32 { @@ -46,6 +47,7 @@ struct options { const char *service; unsigned int size; unsigned int num_pkt; + bool msg_more; struct { unsigned int mark; unsigned int dontfrag; @@ -72,7 +74,7 @@ struct options { struct option_cmsg_u32 tclass; struct option_cmsg_u32 hlimit; struct option_cmsg_u32 exthdr; - } v6; + } cmsg; } opt = { .size = 13, .num_pkt = 1, @@ -94,7 +96,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-S send() size\n" "\t\t-4/-6 Force IPv4 / IPv6 only\n" "\t\t-p prot Socket protocol\n" - "\t\t (u = UDP (default); i = ICMP; r = RAW)\n" + "\t\t (u = UDP (default); i = ICMP; r = RAW;\n" + "\t\t U = UDP with MSG_MORE)\n" "\n" "\t\t-m val Set SO_MARK with given value\n" "\t\t-M val Set SO_MARK via setsockopt\n" @@ -104,13 +107,13 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-t Enable time stamp reporting\n" "\t\t-f val Set don't fragment via cmsg\n" "\t\t-F val Set don't fragment via setsockopt\n" - "\t\t-c val Set TCLASS via cmsg\n" - "\t\t-C val Set TCLASS via setsockopt\n" - "\t\t-l val Set HOPLIMIT via cmsg\n" - "\t\t-L val Set HOPLIMIT via setsockopt\n" + "\t\t-c val Set TOS/TCLASS via cmsg\n" + "\t\t-C val Set TOS/TCLASS via setsockopt\n" + "\t\t-l val Set TTL/HOPLIMIT via cmsg\n" + "\t\t-L val Set TTL/HOPLIMIT via setsockopt\n" "\t\t-H type Add an IPv6 header option\n" - "\t\t (h = HOP; d = DST; r = RTDST)" - ""); + "\t\t (h = HOP; d = DST; r = RTDST)\n" + "\n"); exit(ERN_HELP); } @@ -133,8 +136,11 @@ static void cs_parse_args(int argc, char *argv[]) opt.sock.family = AF_INET6; break; case 'p': - if (*optarg == 'u' || *optarg == 'U') { + if (*optarg == 'u') { opt.sock.proto = IPPROTO_UDP; + } else if (*optarg == 'U') { + opt.sock.proto = IPPROTO_UDP; + opt.msg_more = true; } else if (*optarg == 'i' || *optarg == 'I') { opt.sock.proto = IPPROTO_ICMP; } else if (*optarg == 'r') { @@ -169,37 +175,37 @@ static void cs_parse_args(int argc, char *argv[]) opt.ts.ena = true; break; case 'f': - opt.v6.dontfrag.ena = true; - opt.v6.dontfrag.val = atoi(optarg); + opt.cmsg.dontfrag.ena = true; + opt.cmsg.dontfrag.val = atoi(optarg); break; case 'F': opt.sockopt.dontfrag = atoi(optarg); break; case 'c': - opt.v6.tclass.ena = true; - opt.v6.tclass.val = atoi(optarg); + opt.cmsg.tclass.ena = true; + opt.cmsg.tclass.val = atoi(optarg); break; case 'C': opt.sockopt.tclass = atoi(optarg); break; case 'l': - opt.v6.hlimit.ena = true; - opt.v6.hlimit.val = atoi(optarg); + opt.cmsg.hlimit.ena = true; + opt.cmsg.hlimit.val = atoi(optarg); break; case 'L': opt.sockopt.hlimit = atoi(optarg); break; case 'H': - opt.v6.exthdr.ena = true; + opt.cmsg.exthdr.ena = true; switch (optarg[0]) { case 'h': - opt.v6.exthdr.val = IPV6_HOPOPTS; + opt.cmsg.exthdr.val = IPV6_HOPOPTS; break; case 'd': - opt.v6.exthdr.val = IPV6_DSTOPTS; + opt.cmsg.exthdr.val = IPV6_DSTOPTS; break; case 'r': - opt.v6.exthdr.val = IPV6_RTHDRDSTOPTS; + opt.cmsg.exthdr.val = IPV6_RTHDRDSTOPTS; break; default: printf("Error: hdr type: %s\n", optarg); @@ -261,12 +267,20 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) SOL_SOCKET, SO_MARK, &opt.mark); ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, SOL_SOCKET, SO_PRIORITY, &opt.priority); - ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, - SOL_IPV6, IPV6_DONTFRAG, &opt.v6.dontfrag); - ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, - SOL_IPV6, IPV6_TCLASS, &opt.v6.tclass); - ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, - SOL_IPV6, IPV6_HOPLIMIT, &opt.v6.hlimit); + + if (opt.sock.family == AF_INET) { + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IP, IP_TOS, &opt.cmsg.tclass); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IP, IP_TTL, &opt.cmsg.hlimit); + } else { + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IPV6, IPV6_DONTFRAG, &opt.cmsg.dontfrag); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IPV6, IPV6_TCLASS, &opt.cmsg.tclass); + ca_write_cmsg_u32(cbuf, cbuf_sz, &cmsg_len, + SOL_IPV6, IPV6_HOPLIMIT, &opt.cmsg.hlimit); + } if (opt.txtime.ena) { __u64 txtime; @@ -297,14 +311,14 @@ cs_write_cmsg(int fd, struct msghdr *msg, char *cbuf, size_t cbuf_sz) *(__u32 *)CMSG_DATA(cmsg) = SOF_TIMESTAMPING_TX_SCHED | SOF_TIMESTAMPING_TX_SOFTWARE; } - if (opt.v6.exthdr.ena) { + if (opt.cmsg.exthdr.ena) { cmsg = (struct cmsghdr *)(cbuf + cmsg_len); cmsg_len += CMSG_SPACE(8); if (cbuf_sz < cmsg_len) error(ERN_CMSG_WR, EFAULT, "cmsg buffer too small"); cmsg->cmsg_level = SOL_IPV6; - cmsg->cmsg_type = opt.v6.exthdr.val; + cmsg->cmsg_type = opt.cmsg.exthdr.val; cmsg->cmsg_len = CMSG_LEN(8); *(__u64 *)CMSG_DATA(cmsg) = 0; } @@ -405,23 +419,35 @@ static void ca_set_sockopts(int fd) setsockopt(fd, SOL_SOCKET, SO_MARK, &opt.sockopt.mark, sizeof(opt.sockopt.mark))) error(ERN_SOCKOPT, errno, "setsockopt SO_MARK"); - if (opt.sockopt.dontfrag && - setsockopt(fd, SOL_IPV6, IPV6_DONTFRAG, - &opt.sockopt.dontfrag, sizeof(opt.sockopt.dontfrag))) - error(ERN_SOCKOPT, errno, "setsockopt IPV6_DONTFRAG"); - if (opt.sockopt.tclass && - setsockopt(fd, SOL_IPV6, IPV6_TCLASS, - &opt.sockopt.tclass, sizeof(opt.sockopt.tclass))) - error(ERN_SOCKOPT, errno, "setsockopt IPV6_TCLASS"); - if (opt.sockopt.hlimit && - setsockopt(fd, SOL_IPV6, IPV6_UNICAST_HOPS, - &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit))) - error(ERN_SOCKOPT, errno, "setsockopt IPV6_HOPLIMIT"); if (opt.sockopt.priority && setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &opt.sockopt.priority, sizeof(opt.sockopt.priority))) error(ERN_SOCKOPT, errno, "setsockopt SO_PRIORITY"); + if (opt.sock.family == AF_INET) { + if (opt.sockopt.tclass && + setsockopt(fd, SOL_IP, IP_TOS, + &opt.sockopt.tclass, sizeof(opt.sockopt.tclass))) + error(ERN_SOCKOPT, errno, "setsockopt IP_TOS"); + if (opt.sockopt.hlimit && + setsockopt(fd, SOL_IP, IP_TTL, + &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit))) + error(ERN_SOCKOPT, errno, "setsockopt IP_TTL"); + } else { + if (opt.sockopt.dontfrag && + setsockopt(fd, SOL_IPV6, IPV6_DONTFRAG, + &opt.sockopt.dontfrag, sizeof(opt.sockopt.dontfrag))) + error(ERN_SOCKOPT, errno, "setsockopt IPV6_DONTFRAG"); + if (opt.sockopt.tclass && + setsockopt(fd, SOL_IPV6, IPV6_TCLASS, + &opt.sockopt.tclass, sizeof(opt.sockopt.tclass))) + error(ERN_SOCKOPT, errno, "setsockopt IPV6_TCLASS"); + if (opt.sockopt.hlimit && + setsockopt(fd, SOL_IPV6, IPV6_UNICAST_HOPS, + &opt.sockopt.hlimit, sizeof(opt.sockopt.hlimit))) + error(ERN_SOCKOPT, errno, "setsockopt IPV6_HOPLIMIT"); + } + if (opt.txtime.ena) { struct sock_txtime so_txtime = { .clockid = CLOCK_MONOTONIC, @@ -511,7 +537,7 @@ int main(int argc, char *argv[]) cs_write_cmsg(fd, &msg, cbuf, sizeof(cbuf)); for (i = 0; i < opt.num_pkt; i++) { - err = sendmsg(fd, &msg, 0); + err = sendmsg(fd, &msg, opt.msg_more ? MSG_MORE : 0); if (err < 0) { if (!opt.silent_send) fprintf(stderr, "send failed: %s\n", strerror(errno)); @@ -522,6 +548,14 @@ int main(int argc, char *argv[]) err = ERN_SEND_SHORT; goto err_out; } + if (opt.msg_more) { + err = write(fd, NULL, 0); + if (err < 0) { + fprintf(stderr, "send more: %s\n", strerror(errno)); + err = ERN_SEND_MORE; + goto err_out; + } + } } err = ERN_SUCCESS; diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 61e5116987f3..130d532b7e67 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -18,6 +18,8 @@ CONFIG_DUMMY=y CONFIG_BRIDGE_VLAN_FILTERING=y CONFIG_BRIDGE=y CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_DEBUG_INFO_BTF=y +CONFIG_DEBUG_INFO_BTF_MODULES=n CONFIG_VLAN_8021Q=y CONFIG_GENEVE=m CONFIG_IFB=y @@ -107,5 +109,11 @@ CONFIG_XFRM_INTERFACE=m CONFIG_XFRM_USER=m CONFIG_IP_NF_MATCH_RPFILTER=m CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IPVLAN=m +CONFIG_CAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_VXCAN=m +CONFIG_NETKIT=y +CONFIG_NET_PKTGEN=m CONFIG_IPV6_ILA=m CONFIG_IPV6_RPL_LWTUNNEL=y diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index 899dbad0104b..4fcc38907e48 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -3667,7 +3667,7 @@ ipv6_addr_bind_novrf() # when it really should not a=${NSA_LO_IP6} log_start - show_hint "Tecnically should fail since address is not on device but kernel allows" + show_hint "Technically should fail since address is not on device but kernel allows" run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b log_test_addr ${a} $? 0 "TCP socket bind to out of scope local address" } @@ -3724,7 +3724,7 @@ ipv6_addr_bind_vrf() # passes when it really should not a=${VRF_IP6} log_start - show_hint "Tecnically should fail since address is not on device but kernel allows" + show_hint "Technically should fail since address is not on device but kernel allows" run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b log_test_addr ${a} $? 0 "TCP socket bind to VRF address with device bind" diff --git a/tools/testing/selftests/net/fdb_flush.sh b/tools/testing/selftests/net/fdb_flush.sh index d5e3abb8658c..9931a1e36e3d 100755 --- a/tools/testing/selftests/net/fdb_flush.sh +++ b/tools/testing/selftests/net/fdb_flush.sh @@ -583,7 +583,7 @@ vxlan_test_flush_by_remote_attributes() $IP link del dev vx10 $IP link add name vx10 type vxlan dstport "$VXPORT" external - # For multicat FDB entries, the VXLAN driver stores a linked list of + # For multicast FDB entries, the VXLAN driver stores a linked list of # remotes for a given key. Verify that only the expected remotes are # flushed. multicast_fdb_entries_add diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh index 77c83d9508d3..b39f748c2572 100755 --- a/tools/testing/selftests/net/fib_nexthops.sh +++ b/tools/testing/selftests/net/fib_nexthops.sh @@ -76,11 +76,13 @@ log_test() printf "TEST: %-60s [ OK ]\n" "${msg}" nsuccess=$((nsuccess+1)) else - ret=1 - nfail=$((nfail+1)) if [[ $rc -eq $ksft_skip ]]; then + [[ $ret -eq 0 ]] && ret=$ksft_skip + nskip=$((nskip+1)) printf "TEST: %-60s [SKIP]\n" "${msg}" else + ret=1 + nfail=$((nfail+1)) printf "TEST: %-60s [FAIL]\n" "${msg}" fi @@ -741,7 +743,7 @@ ipv6_fcnal() run_cmd "$IP nexthop add id 52 via 2001:db8:92::3" log_test $? 2 "Create nexthop - gw only" - # gw is not reachable throught given dev + # gw is not reachable through given dev run_cmd "$IP nexthop add id 53 via 2001:db8:3::3 dev veth1" log_test $? 2 "Create nexthop - invalid gw+dev combination" @@ -2528,6 +2530,7 @@ done if [ "$TESTS" != "none" ]; then printf "\nTests passed: %3d\n" ${nsuccess} printf "Tests failed: %3d\n" ${nfail} + printf "Tests skipped: %2d\n" ${nskip} fi exit $ret diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh index 847936363a12..b866bab1d92a 100755 --- a/tools/testing/selftests/net/fib_rule_tests.sh +++ b/tools/testing/selftests/net/fib_rule_tests.sh @@ -256,6 +256,24 @@ fib_rule6_test() fib_rule6_test_match_n_redirect "$match" "$match" \ "$getnomatch" "sport and dport redirect to table" \ "sport and dport no redirect to table" + + match="sport 100-200 dport 300-400" + getmatch="sport 100 dport 400" + getnomatch="sport 100 dport 401" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" \ + "sport and dport range redirect to table" \ + "sport and dport range no redirect to table" + fi + + ip rule help 2>&1 | grep sport | grep -q MASK + if [ $? -eq 0 ]; then + match="sport 0x0f00/0xff00 dport 0x000f/0x00ff" + getmatch="sport 0x0f11 dport 0x220f" + getnomatch="sport 0x1f11 dport 0x221f" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "sport and dport masked redirect to table" \ + "sport and dport masked no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" @@ -292,6 +310,25 @@ fib_rule6_test() "iif dscp no redirect to table" fi + ip rule help 2>&1 | grep -q "DSCP\[/MASK\]" + if [ $? -eq 0 ]; then + match="dscp 0x0f/0x0f" + tosmatch=$(printf 0x"%x" $((0x1f << 2))) + tosnomatch=$(printf 0x"%x" $((0x1e << 2))) + getmatch="tos $tosmatch" + getnomatch="tos $tosnomatch" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "dscp masked redirect to table" \ + "dscp masked no redirect to table" + + match="dscp 0x0f/0x0f" + getmatch="from $SRC_IP6 iif $DEV tos $tosmatch" + getnomatch="from $SRC_IP6 iif $DEV tos $tosnomatch" + fib_rule6_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif dscp masked redirect to table" \ + "iif dscp masked no redirect to table" + fi + fib_check_iproute_support "flowlabel" "flowlabel" if [ $? -eq 0 ]; then match="flowlabel 0xfffff" @@ -525,6 +562,24 @@ fib_rule4_test() fib_rule4_test_match_n_redirect "$match" "$match" \ "$getnomatch" "sport and dport redirect to table" \ "sport and dport no redirect to table" + + match="sport 100-200 dport 300-400" + getmatch="sport 100 dport 400" + getnomatch="sport 100 dport 401" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" \ + "sport and dport range redirect to table" \ + "sport and dport range no redirect to table" + fi + + ip rule help 2>&1 | grep sport | grep -q MASK + if [ $? -eq 0 ]; then + match="sport 0x0f00/0xff00 dport 0x000f/0x00ff" + getmatch="sport 0x0f11 dport 0x220f" + getnomatch="sport 0x1f11 dport 0x221f" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "sport and dport masked redirect to table" \ + "sport and dport masked no redirect to table" fi fib_check_iproute_support "ipproto" "ipproto" @@ -561,6 +616,25 @@ fib_rule4_test() "$getnomatch" "iif dscp redirect to table" \ "iif dscp no redirect to table" fi + + ip rule help 2>&1 | grep -q "DSCP\[/MASK\]" + if [ $? -eq 0 ]; then + match="dscp 0x0f/0x0f" + tosmatch=$(printf 0x"%x" $((0x1f << 2))) + tosnomatch=$(printf 0x"%x" $((0x1e << 2))) + getmatch="tos $tosmatch" + getnomatch="tos $tosnomatch" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "dscp masked redirect to table" \ + "dscp masked no redirect to table" + + match="dscp 0x0f/0x0f" + getmatch="from $SRC_IP iif $DEV tos $tosmatch" + getnomatch="from $SRC_IP iif $DEV tos $tosnomatch" + fib_rule4_test_match_n_redirect "$match" "$getmatch" \ + "$getnomatch" "iif dscp masked redirect to table" \ + "iif dscp masked no redirect to table" + fi } fib_rule4_vrf_test() diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README index a652429bfd53..7b41cff993ad 100644 --- a/tools/testing/selftests/net/forwarding/README +++ b/tools/testing/selftests/net/forwarding/README @@ -6,7 +6,7 @@ to easily create and test complex environments. Unfortunately, these namespaces can not be used with actual switching ASICs, as their ports can not be migrated to other network namespaces -(dev->netns_local) and most of them probably do not support the +(dev->netns_immutable) and most of them probably do not support the L1-separation provided by namespaces. However, a similar kind of flexibility can be achieved by using VRFs and diff --git a/tools/testing/selftests/net/forwarding/bridge_mdb.sh b/tools/testing/selftests/net/forwarding/bridge_mdb.sh index d9d587454d20..8c1597ebc2d3 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mdb.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mdb.sh @@ -149,7 +149,7 @@ cfg_test_host_common() check_err $? "Failed to add $name host entry" bridge mdb replace dev br0 port br0 grp $grp $state vid 10 &> /dev/null - check_fail $? "Managed to replace $name host entry" + check_err $? "Failed to replace $name host entry" bridge mdb del dev br0 port br0 grp $grp $state vid 10 bridge mdb get dev br0 grp $grp vid 10 &> /dev/null diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 8de80acf249e..508f3c700d71 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -291,16 +291,6 @@ if [[ "$CHECK_TC" = "yes" ]]; then check_tc_version fi -require_command() -{ - local cmd=$1; shift - - if [[ ! -x "$(command -v "$cmd")" ]]; then - echo "SKIP: $cmd not installed" - exit $ksft_skip - fi -} - # IPv6 support was added in v3.0 check_mtools_version() { diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index 3f9d50f1ef9e..b43816dd998c 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -428,6 +428,14 @@ __test_flood() test_flood() { __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood" + + # Add an entry with arbitrary destination IP. Verify that packets are + # not duplicated (this can happen if hardware floods the packets, and + # then traps them due to misconfiguration, so software data path repeats + # flooding and resends packets). + bridge fdb append dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self + __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood, unresolved FDB entry" + bridge fdb del dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self } vxlan_fdb_add_del() @@ -740,6 +748,8 @@ test_learning() vxlan_flood_test $mac $dst 0 10 0 + # The entry should age out when it only forwards traffic + $MZ $h1 -c 50 -d 1sec -p 64 -b $mac -B $dst -t icmp -q & sleep 60 bridge fdb show brport vx1 | grep $mac | grep -q self diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh index fb9a34cb50c6..afc65647f673 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh @@ -539,6 +539,21 @@ test_flood() 10 10 0 10 0 __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 "flood vlan 20" \ 10 0 10 0 10 + + # Add entries with arbitrary destination IP. Verify that packets are + # not duplicated (this can happen if hardware floods the packets, and + # then traps them due to misconfiguration, so software data path repeats + # flooding and resends packets). + bridge fdb append dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self + bridge fdb append dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self + + __test_flood de:ad:be:ef:13:37 192.0.2.100 10 \ + "flood vlan 10, unresolved FDB entry" 10 10 0 10 0 + __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 \ + "flood vlan 20, unresolved FDB entry" 10 0 10 0 10 + + bridge fdb del dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self + bridge fdb del dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self } vxlan_fdb_add_del() diff --git a/tools/testing/selftests/net/gro.c b/tools/testing/selftests/net/gro.c index b2184847e388..d5824eadea10 100644 --- a/tools/testing/selftests/net/gro.c +++ b/tools/testing/selftests/net/gro.c @@ -1318,11 +1318,13 @@ int main(int argc, char **argv) read_MAC(src_mac, smac); read_MAC(dst_mac, dmac); - if (tx_socket) + if (tx_socket) { gro_sender(); - else + } else { + /* Only the receiver exit status determines test success. */ gro_receiver(); + fprintf(stderr, "Gro::%s test passed.\n", testname); + } - fprintf(stderr, "Gro::%s test passed.\n", testname); return 0; } diff --git a/tools/testing/selftests/net/gro.sh b/tools/testing/selftests/net/gro.sh index 02c21ff4ca81..9e3f186bc2a1 100755 --- a/tools/testing/selftests/net/gro.sh +++ b/tools/testing/selftests/net/gro.sh @@ -18,10 +18,10 @@ run_test() { "--smac" "${CLIENT_MAC}" "--test" "${test}" "--verbose" ) setup_ns - # Each test is run 3 times to deflake, because given the receive timing, + # Each test is run 6 times to deflake, because given the receive timing, # not all packets that should coalesce will be considered in the same flow # on every try. - for tries in {1..3}; do + for tries in {1..6}; do # Actual test starts here ip netns exec $server_ns ./gro "${ARGS[@]}" "--rx" "--iface" "server" \ 1>>log.txt & @@ -100,5 +100,6 @@ trap cleanup EXIT if [[ "${test}" == "all" ]]; then run_all_tests else - run_test "${proto}" "${test}" + exit_code=$(run_test "${proto}" "${test}") + exit $exit_code fi; diff --git a/tools/testing/selftests/net/ip_local_port_range.sh b/tools/testing/selftests/net/ip_local_port_range.sh index 6c6ad346eaa0..4ff746db1256 100755 --- a/tools/testing/selftests/net/ip_local_port_range.sh +++ b/tools/testing/selftests/net/ip_local_port_range.sh @@ -2,4 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 ./in_netns.sh \ - sh -c 'sysctl -q -w net.ipv4.ip_local_port_range="40000 49999" && ./ip_local_port_range' + sh -c 'sysctl -q -w net.mptcp.enabled=1 && \ + sysctl -q -w net.ipv4.ip_local_port_range="40000 49999" && \ + ./ip_local_port_range' diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 0bd9a038a1f0..975be4fdbcdb 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -450,6 +450,25 @@ kill_process() { kill $pid && wait $pid; } 2>/dev/null } +check_command() +{ + local cmd=$1; shift + + if [[ ! -x "$(command -v "$cmd")" ]]; then + log_test_skip "$cmd not installed" + return $EXIT_STATUS + fi +} + +require_command() +{ + local cmd=$1; shift + + if ! check_command "$cmd"; then + exit $EXIT_STATUS + fi +} + ip_link_add() { local name=$1; shift diff --git a/tools/testing/selftests/net/lib/py/__init__.py b/tools/testing/selftests/net/lib/py/__init__.py index 54d8f5eba810..8697bd27dc30 100644 --- a/tools/testing/selftests/net/lib/py/__init__.py +++ b/tools/testing/selftests/net/lib/py/__init__.py @@ -2,8 +2,8 @@ from .consts import KSRC from .ksft import * -from .netns import NetNS +from .netns import NetNS, NetNSEnter from .nsim import * from .utils import * -from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily +from .ynl import NlError, YnlFamily, EthtoolFamily, NetdevFamily, RtnlFamily, RtnlAddrFamily from .ynl import NetshaperFamily diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 3efe005436cd..3cfad0fd4570 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -71,6 +71,11 @@ def ksft_in(a, b, comment=""): _fail("Check failed", a, "not in", b, comment) +def ksft_not_in(a, b, comment=""): + if a in b: + _fail("Check failed", a, "in", b, comment) + + def ksft_is(a, b, comment=""): if a is not b: _fail("Check failed", a, "is not", b, comment) @@ -202,7 +207,7 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} - print("KTAP version 1") + print("TAP version 13") print("1.." + str(len(cases))) global KSFT_RESULT diff --git a/tools/testing/selftests/net/lib/py/netns.py b/tools/testing/selftests/net/lib/py/netns.py index ecff85f9074f..8e9317044eef 100644 --- a/tools/testing/selftests/net/lib/py/netns.py +++ b/tools/testing/selftests/net/lib/py/netns.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: GPL-2.0 from .utils import ip +import ctypes import random import string +libc = ctypes.cdll.LoadLibrary('libc.so.6') + class NetNS: def __init__(self, name=None): @@ -29,3 +32,18 @@ class NetNS: def __repr__(self): return f"NetNS({self.name})" + + +class NetNSEnter: + def __init__(self, ns_name): + self.ns_path = f"/run/netns/{ns_name}" + + def __enter__(self): + self.saved = open("/proc/thread-self/ns/net") + with open(self.ns_path) as ns_file: + libc.setns(ns_file.fileno(), 0) + return self + + def __exit__(self, exc_type, exc_value, traceback): + libc.setns(self.saved.fileno(), 0) + self.saved.close() diff --git a/tools/testing/selftests/net/lib/py/utils.py b/tools/testing/selftests/net/lib/py/utils.py index 9e3bcddcf3e8..34470d65d871 100644 --- a/tools/testing/selftests/net/lib/py/utils.py +++ b/tools/testing/selftests/net/lib/py/utils.py @@ -2,8 +2,10 @@ import errno import json as _json +import os import random import re +import select import socket import subprocess import time @@ -15,21 +17,56 @@ class CmdExitFailure(Exception): self.cmd = cmd_obj +def fd_read_timeout(fd, timeout): + rlist, _, _ = select.select([fd], [], [], timeout) + if rlist: + return os.read(fd, 1024) + else: + raise TimeoutError("Timeout waiting for fd read") + + class cmd: - def __init__(self, comm, shell=True, fail=True, ns=None, background=False, host=None, timeout=5): + """ + Execute a command on local or remote host. + + Use bkg() instead to run a command in the background. + """ + def __init__(self, comm, shell=True, fail=True, ns=None, background=False, + host=None, timeout=5, ksft_wait=None): if ns: comm = f'ip netns exec {ns} ' + comm self.stdout = None self.stderr = None self.ret = None + self.ksft_term_fd = None self.comm = comm if host: self.proc = host.cmd(comm) else: + # ksft_wait lets us wait for the background process to fully start, + # we pass an FD to the child process, and wait for it to write back. + # Similarly term_fd tells child it's time to exit. + pass_fds = () + env = os.environ.copy() + if ksft_wait is not None: + rfd, ready_fd = os.pipe() + wait_fd, self.ksft_term_fd = os.pipe() + pass_fds = (ready_fd, wait_fd, ) + env["KSFT_READY_FD"] = str(ready_fd) + env["KSFT_WAIT_FD"] = str(wait_fd) + self.proc = subprocess.Popen(comm, shell=shell, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + stderr=subprocess.PIPE, pass_fds=pass_fds, + env=env) + if ksft_wait is not None: + os.close(ready_fd) + os.close(wait_fd) + msg = fd_read_timeout(rfd, ksft_wait) + os.close(rfd) + if not msg: + raise Exception("Did not receive ready message") if not background: self.process(terminate=False, fail=fail, timeout=timeout) @@ -37,6 +74,8 @@ class cmd: if fail is None: fail = not terminate + if self.ksft_term_fd: + os.write(self.ksft_term_fd, b"1") if terminate: self.proc.terminate() stdout, stderr = self.proc.communicate(timeout) @@ -54,13 +93,36 @@ class cmd: class bkg(cmd): + """ + Run a command in the background. + + Examples usage: + + Run a command on remote host, and wait for it to finish. + This is usually paired with wait_port_listen() to make sure + the command has initialized: + + with bkg("socat ...", exit_wait=True, host=cfg.remote) as nc: + ... + + Run a command and expect it to let us know that it's ready + by writing to a special file descriptor passed via KSFT_READY_FD. + Command will be terminated when we exit the context manager: + + with bkg("my_binary", ksft_wait=5): + """ def __init__(self, comm, shell=True, fail=None, ns=None, host=None, - exit_wait=False): + exit_wait=False, ksft_wait=None): super().__init__(comm, background=True, - shell=shell, fail=fail, ns=ns, host=host) - self.terminate = not exit_wait + shell=shell, fail=fail, ns=ns, host=host, + ksft_wait=ksft_wait) + self.terminate = not exit_wait and not ksft_wait self.check_fail = fail + if shell and self.terminate: + print("# Warning: combining shell and terminate is risky!") + print("# SIGTERM may not reach the child on zsh/ksh!") + def __enter__(self): return self @@ -123,20 +185,13 @@ def ethtool(args, json=None, ns=None, host=None): return tool('ethtool', args, json=json, ns=ns, host=host) -def rand_port(): +def rand_port(type=socket.SOCK_STREAM): """ - Get a random unprivileged port, try to make sure it's not already used. + Get a random unprivileged port. """ - for _ in range(1000): - port = random.randint(10000, 65535) - try: - with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s: - s.bind(("", port)) - return port - except OSError as e: - if e.errno != errno.EADDRINUSE: - raise - raise Exception("Can't find any free unprivileged port") + with socket.socket(socket.AF_INET6, type) as s: + s.bind(("", 0)) + return s.getsockname()[1] def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5): diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index ad1e36baee2a..8986c584cb37 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -42,6 +42,10 @@ class RtnlFamily(YnlFamily): super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), schema='', recv_size=recv_size) +class RtnlAddrFamily(YnlFamily): + def __init__(self, recv_size=0): + super().__init__((SPEC_PATH / Path('rt_addr.yaml')).as_posix(), + schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): def __init__(self, recv_size=0): diff --git a/tools/testing/selftests/net/link_netns.py b/tools/testing/selftests/net/link_netns.py new file mode 100755 index 000000000000..aab043c59d69 --- /dev/null +++ b/tools/testing/selftests/net/link_netns.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import time + +from lib.py import ksft_run, ksft_exit, ksft_true +from lib.py import ip +from lib.py import NetNS, NetNSEnter +from lib.py import RtnlFamily + + +LINK_NETNSID = 100 + + +def test_event() -> None: + with NetNS() as ns1, NetNS() as ns2: + with NetNSEnter(str(ns2)): + rtnl = RtnlFamily() + + rtnl.ntf_subscribe("rtnlgrp-link") + + ip(f"netns set {ns2} {LINK_NETNSID}", ns=str(ns1)) + ip(f"link add netns {ns1} link-netnsid {LINK_NETNSID} dummy1 type dummy") + ip(f"link add netns {ns1} dummy2 type dummy", ns=str(ns2)) + + ip("link del dummy1", ns=str(ns1)) + ip("link del dummy2", ns=str(ns1)) + + time.sleep(1) + rtnl.check_ntf() + ksft_true(rtnl.async_msg_queue.empty(), + "Received unexpected link notification") + + +def validate_link_netns(netns, ifname, link_netnsid) -> bool: + link_info = ip(f"-d link show dev {ifname}", ns=netns, json=True) + if not link_info: + return False + return link_info[0].get("link_netnsid") == link_netnsid + + +def test_link_net() -> None: + configs = [ + # type, common args, type args, fallback to dev_net + ("ipvlan", "link dummy1", "", False), + ("macsec", "link dummy1", "", False), + ("macvlan", "link dummy1", "", False), + ("macvtap", "link dummy1", "", False), + ("vlan", "link dummy1", "id 100", False), + ("gre", "", "local 192.0.2.1", True), + ("vti", "", "local 192.0.2.1", True), + ("ipip", "", "local 192.0.2.1", True), + ("ip6gre", "", "local 2001:db8::1", True), + ("ip6tnl", "", "local 2001:db8::1", True), + ("vti6", "", "local 2001:db8::1", True), + ("sit", "", "local 192.0.2.1", True), + ("xfrm", "", "if_id 1", True), + ] + + with NetNS() as ns1, NetNS() as ns2, NetNS() as ns3: + net1, net2, net3 = str(ns1), str(ns2), str(ns3) + + # prepare link netnsid and a dummy link needed by certain drivers + ip(f"netns set {net3} {LINK_NETNSID}", ns=str(net2)) + ip("link add dummy1 type dummy", ns=net3) + + cases = [ + # source, "netns", "link-netns", expected link-netns + (net3, None, None, None, None), + (net3, net2, None, None, LINK_NETNSID), + (net2, None, net3, LINK_NETNSID, LINK_NETNSID), + (net1, net2, net3, LINK_NETNSID, LINK_NETNSID), + ] + + for src_net, netns, link_netns, exp1, exp2 in cases: + tgt_net = netns or src_net + for typ, cargs, targs, fb_dev_net in configs: + cmd = "link add" + if netns: + cmd += f" netns {netns}" + if link_netns: + cmd += f" link-netns {link_netns}" + cmd += f" {cargs} foo type {typ} {targs}" + ip(cmd, ns=src_net) + if fb_dev_net: + ksft_true(validate_link_netns(tgt_net, "foo", exp1), + f"{typ} link_netns validation failed") + else: + ksft_true(validate_link_netns(tgt_net, "foo", exp2), + f"{typ} link_netns validation failed") + ip(f"link del foo", ns=tgt_net) + + +def test_peer_net() -> None: + types = [ + "vxcan", + "netkit", + "veth", + ] + + with NetNS() as ns1, NetNS() as ns2, NetNS() as ns3, NetNS() as ns4: + net1, net2, net3, net4 = str(ns1), str(ns2), str(ns3), str(ns4) + + ip(f"netns set {net3} {LINK_NETNSID}", ns=str(net2)) + + cases = [ + # source, "netns", "link-netns", "peer netns", expected + (net1, None, None, None, None), + (net1, net2, None, None, None), + (net2, None, net3, None, LINK_NETNSID), + (net1, net2, net3, None, None), + (net2, None, None, net3, LINK_NETNSID), + (net1, net2, None, net3, LINK_NETNSID), + (net2, None, net2, net3, LINK_NETNSID), + (net1, net2, net4, net3, LINK_NETNSID), + ] + + for src_net, netns, link_netns, peer_netns, exp in cases: + tgt_net = netns or src_net + for typ in types: + cmd = "link add" + if netns: + cmd += f" netns {netns}" + if link_netns: + cmd += f" link-netns {link_netns}" + cmd += f" foo type {typ}" + if peer_netns: + cmd += f" peer netns {peer_netns}" + ip(cmd, ns=src_net) + ksft_true(validate_link_netns(tgt_net, "foo", exp), + f"{typ} peer_netns validation failed") + ip(f"link del foo", ns=tgt_net) + + +def main() -> None: + ksft_run([test_event, test_link_net, test_peer_net]) + ksft_exit() + + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index c76525fe2b84..340e1a777e16 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -7,7 +7,7 @@ CFLAGS += -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include $(KHDR_INC TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \ simult_flows.sh mptcp_sockopt.sh userspace_pm.sh -TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq +TEST_GEN_FILES = mptcp_connect pm_nl_ctl mptcp_sockopt mptcp_inq mptcp_diag TEST_FILES := mptcp_lib.sh settings diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 2bd0c1eb70c5..4f55477ffe08 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -200,6 +200,32 @@ chk_msk_cestab() "${expected}" "${msg}" "" } +chk_dump_one() +{ + local ss_token + local token + local msg + + ss_token="$(ss -inmHMN $ns | grep 'token:' |\ + head -n 1 |\ + sed 's/.*token:\([0-9a-f]*\).*/\1/')" + + token="$(ip netns exec $ns ./mptcp_diag -t $ss_token |\ + awk -F':[ \t]+' '/^token/ {print $2}')" + + msg="....chk dump_one" + + mptcp_lib_print_title "$msg" + if [ -n "$ss_token" ] && [ "$ss_token" = "$token" ]; then + mptcp_lib_pr_ok + mptcp_lib_result_pass "${msg}" + else + mptcp_lib_pr_fail "expected $ss_token found $token" + mptcp_lib_result_fail "${msg}" + ret=${KSFT_FAIL} + fi +} + msk_info_get_value() { local port="${1}" @@ -290,6 +316,7 @@ chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" chk_msk_inuse 2 chk_msk_cestab 2 +chk_dump_one flush_pids chk_msk_inuse 0 "2->0" diff --git a/tools/testing/selftests/net/mptcp/mptcp_diag.c b/tools/testing/selftests/net/mptcp/mptcp_diag.c new file mode 100644 index 000000000000..284286c524cf --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_diag.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025, Kylin Software */ + +#include <linux/sock_diag.h> +#include <linux/rtnetlink.h> +#include <linux/inet_diag.h> +#include <linux/netlink.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <linux/tcp.h> + +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> + +#ifndef IPPROTO_MPTCP +#define IPPROTO_MPTCP 262 +#endif + +struct mptcp_info { + __u8 mptcpi_subflows; + __u8 mptcpi_add_addr_signal; + __u8 mptcpi_add_addr_accepted; + __u8 mptcpi_subflows_max; + __u8 mptcpi_add_addr_signal_max; + __u8 mptcpi_add_addr_accepted_max; + __u32 mptcpi_flags; + __u32 mptcpi_token; + __u64 mptcpi_write_seq; + __u64 mptcpi_snd_una; + __u64 mptcpi_rcv_nxt; + __u8 mptcpi_local_addr_used; + __u8 mptcpi_local_addr_max; + __u8 mptcpi_csum_enabled; + __u32 mptcpi_retransmits; + __u64 mptcpi_bytes_retrans; + __u64 mptcpi_bytes_sent; + __u64 mptcpi_bytes_received; + __u64 mptcpi_bytes_acked; + __u8 mptcpi_subflows_total; + __u8 reserved[3]; + __u32 mptcpi_last_data_sent; + __u32 mptcpi_last_data_recv; + __u32 mptcpi_last_ack_recv; +}; + +static void die_perror(const char *msg) +{ + perror(msg); + exit(1); +} + +static void die_usage(int r) +{ + fprintf(stderr, "Usage: mptcp_diag -t\n"); + exit(r); +} + +static void send_query(int fd, __u32 token) +{ + struct sockaddr_nl nladdr = { + .nl_family = AF_NETLINK + }; + struct { + struct nlmsghdr nlh; + struct inet_diag_req_v2 r; + } req = { + .nlh = { + .nlmsg_len = sizeof(req), + .nlmsg_type = SOCK_DIAG_BY_FAMILY, + .nlmsg_flags = NLM_F_REQUEST + }, + .r = { + .sdiag_family = AF_INET, + /* Real proto is set via INET_DIAG_REQ_PROTOCOL */ + .sdiag_protocol = IPPROTO_TCP, + .id.idiag_cookie[0] = token, + } + }; + struct rtattr rta_proto; + struct iovec iov[6]; + int iovlen = 1; + __u32 proto; + + req.r.idiag_ext |= (1 << (INET_DIAG_INFO - 1)); + proto = IPPROTO_MPTCP; + rta_proto.rta_type = INET_DIAG_REQ_PROTOCOL; + rta_proto.rta_len = RTA_LENGTH(sizeof(proto)); + + iov[0] = (struct iovec) { + .iov_base = &req, + .iov_len = sizeof(req) + }; + iov[iovlen] = (struct iovec){ &rta_proto, sizeof(rta_proto)}; + iov[iovlen + 1] = (struct iovec){ &proto, sizeof(proto)}; + req.nlh.nlmsg_len += RTA_LENGTH(sizeof(proto)); + iovlen += 2; + struct msghdr msg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = iov, + .msg_iovlen = iovlen + }; + + for (;;) { + if (sendmsg(fd, &msg, 0) < 0) { + if (errno == EINTR) + continue; + die_perror("sendmsg"); + } + break; + } +} + +static void parse_rtattr_flags(struct rtattr *tb[], int max, struct rtattr *rta, + int len, unsigned short flags) +{ + unsigned short type; + + memset(tb, 0, sizeof(struct rtattr *) * (max + 1)); + while (RTA_OK(rta, len)) { + type = rta->rta_type & ~flags; + if (type <= max && !tb[type]) + tb[type] = rta; + rta = RTA_NEXT(rta, len); + } +} + +static void print_info_msg(struct mptcp_info *info) +{ + printf("Token & Flags\n"); + printf("token: %x\n", info->mptcpi_token); + printf("flags: %x\n", info->mptcpi_flags); + printf("csum_enabled: %u\n", info->mptcpi_csum_enabled); + + printf("\nBasic Info\n"); + printf("subflows: %u\n", info->mptcpi_subflows); + printf("subflows_max: %u\n", info->mptcpi_subflows_max); + printf("subflows_total: %u\n", info->mptcpi_subflows_total); + printf("local_addr_used: %u\n", info->mptcpi_local_addr_used); + printf("local_addr_max: %u\n", info->mptcpi_local_addr_max); + printf("add_addr_signal: %u\n", info->mptcpi_add_addr_signal); + printf("add_addr_accepted: %u\n", info->mptcpi_add_addr_accepted); + printf("add_addr_signal_max: %u\n", info->mptcpi_add_addr_signal_max); + printf("add_addr_accepted_max: %u\n", info->mptcpi_add_addr_accepted_max); + + printf("\nTransmission Info\n"); + printf("write_seq: %llu\n", info->mptcpi_write_seq); + printf("snd_una: %llu\n", info->mptcpi_snd_una); + printf("rcv_nxt: %llu\n", info->mptcpi_rcv_nxt); + printf("last_data_sent: %u\n", info->mptcpi_last_data_sent); + printf("last_data_recv: %u\n", info->mptcpi_last_data_recv); + printf("last_ack_recv: %u\n", info->mptcpi_last_ack_recv); + printf("retransmits: %u\n", info->mptcpi_retransmits); + printf("retransmit bytes: %llu\n", info->mptcpi_bytes_retrans); + printf("bytes_sent: %llu\n", info->mptcpi_bytes_sent); + printf("bytes_received: %llu\n", info->mptcpi_bytes_received); + printf("bytes_acked: %llu\n", info->mptcpi_bytes_acked); +} + +static void parse_nlmsg(struct nlmsghdr *nlh) +{ + struct inet_diag_msg *r = NLMSG_DATA(nlh); + struct rtattr *tb[INET_DIAG_MAX + 1]; + + parse_rtattr_flags(tb, INET_DIAG_MAX, (struct rtattr *)(r + 1), + nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)), + NLA_F_NESTED); + + if (tb[INET_DIAG_INFO]) { + int len = RTA_PAYLOAD(tb[INET_DIAG_INFO]); + struct mptcp_info *info; + + /* workaround fort older kernels with less fields */ + if (len < sizeof(*info)) { + info = alloca(sizeof(*info)); + memcpy(info, RTA_DATA(tb[INET_DIAG_INFO]), len); + memset((char *)info + len, 0, sizeof(*info) - len); + } else { + info = RTA_DATA(tb[INET_DIAG_INFO]); + } + print_info_msg(info); + } +} + +static void recv_nlmsg(int fd, struct nlmsghdr *nlh) +{ + char rcv_buff[8192]; + struct sockaddr_nl rcv_nladdr = { + .nl_family = AF_NETLINK + }; + struct iovec rcv_iov = { + .iov_base = rcv_buff, + .iov_len = sizeof(rcv_buff) + }; + struct msghdr rcv_msg = { + .msg_name = &rcv_nladdr, + .msg_namelen = sizeof(rcv_nladdr), + .msg_iov = &rcv_iov, + .msg_iovlen = 1 + }; + int len; + + len = recvmsg(fd, &rcv_msg, 0); + nlh = (struct nlmsghdr *)rcv_buff; + + while (NLMSG_OK(nlh, len)) { + if (nlh->nlmsg_type == NLMSG_DONE) { + printf("NLMSG_DONE\n"); + break; + } else if (nlh->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err; + + err = (struct nlmsgerr *)NLMSG_DATA(nlh); + printf("Error %d:%s\n", + -(err->error), strerror(-(err->error))); + break; + } + parse_nlmsg(nlh); + nlh = NLMSG_NEXT(nlh, len); + } +} + +static void get_mptcpinfo(__u32 token) +{ + struct nlmsghdr *nlh = NULL; + int fd; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); + if (fd < 0) + die_perror("Netlink socket"); + + send_query(fd, token); + recv_nlmsg(fd, nlh); + + close(fd); +} + +static void parse_opts(int argc, char **argv, __u32 *target_token) +{ + int c; + + if (argc < 2) + die_usage(1); + + while ((c = getopt(argc, argv, "ht:")) != -1) { + switch (c) { + case 'h': + die_usage(0); + break; + case 't': + sscanf(optarg, "%x", target_token); + break; + default: + die_usage(1); + break; + } + } +} + +int main(int argc, char *argv[]) +{ + __u32 target_token; + + parse_opts(argc, argv, &target_token); + get_mptcpinfo(target_token); + + return 0; +} + diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index 9c2a415976cb..2329c2f8519b 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -28,7 +28,7 @@ size=0 usage() { echo "Usage: $0 [ -b ] [ -c ] [ -d ] [ -i]" - echo -e "\t-b: bail out after first error, otherwise runs al testcases" + echo -e "\t-b: bail out after first error, otherwise runs all testcases" echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)" echo -e "\t-d: debug this script" echo -e "\t-i: use 'ip mptcp' instead of 'pm_nl_ctl'" diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 3651f73451cf..333064b0b5ac 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -117,7 +117,36 @@ cleanup() trap cleanup EXIT # Create and configure network namespaces for testing +print_title "Init" mptcp_lib_ns_init ns1 ns2 + +# check path_manager and pm_type sysctl mapping +if [ -f /proc/sys/net/mptcp/path_manager ]; then + ip netns exec "$ns1" sysctl -q net.mptcp.path_manager=userspace + pm_type="$(ip netns exec "$ns1" sysctl -n net.mptcp.pm_type)" + if [ "${pm_type}" != "1" ]; then + test_fail "unexpected pm_type: ${pm_type}" + mptcp_lib_result_print_all_tap + exit ${KSFT_FAIL} + fi + + ip netns exec "$ns1" sysctl -q net.mptcp.path_manager=error 2>/dev/null + pm_type="$(ip netns exec "$ns1" sysctl -n net.mptcp.pm_type)" + if [ "${pm_type}" != "1" ]; then + test_fail "unexpected pm_type after error: ${pm_type}" + mptcp_lib_result_print_all_tap + exit ${KSFT_FAIL} + fi + + ip netns exec "$ns1" sysctl -q net.mptcp.pm_type=0 + pm_name="$(ip netns exec "$ns1" sysctl -n net.mptcp.path_manager)" + if [ "${pm_name}" != "kernel" ]; then + test_fail "unexpected path-manager: ${pm_name}" + mptcp_lib_result_print_all_tap + exit ${KSFT_FAIL} + fi +fi + for i in "$ns1" "$ns2" ;do ip netns exec "$i" sysctl -q net.mptcp.pm_type=1 done @@ -152,7 +181,6 @@ mptcp_lib_events "${ns1}" "${server_evts}" server_evts_pid sleep 0.5 mptcp_lib_subtests_last_ts_reset -print_title "Init" print_test "Created network namespaces ns1, ns2" test_pass diff --git a/tools/testing/selftests/net/netns-name.sh b/tools/testing/selftests/net/netns-name.sh index 6974474c26f3..0be1905d1f2f 100755 --- a/tools/testing/selftests/net/netns-name.sh +++ b/tools/testing/selftests/net/netns-name.sh @@ -78,6 +78,16 @@ ip -netns $NS link show dev $ALT_NAME 2> /dev/null && fail "Can still find alt-name after move" ip -netns $test_ns link del $DEV || fail +# +# Test no conflict of the same name/ifindex in different netns +# +ip -netns $NS link add name $DEV index 100 type dummy || fail +ip -netns $NS link add netns $test_ns name $DEV index 100 type dummy || + fail "Can create in netns without moving" +ip -netns $test_ns link show dev $DEV >> /dev/null || fail "Device not found" +ip -netns $NS link del $DEV || fail +ip -netns $test_ns link del $DEV || fail + echo -ne "$(basename $0) \t\t\t\t" if [ $RET_CODE -eq 0 ]; then echo "[ OK ]" diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py index 93e8cb671c3d..beaee5e4e2aa 100755 --- a/tools/testing/selftests/net/nl_netdev.py +++ b/tools/testing/selftests/net/nl_netdev.py @@ -35,6 +35,21 @@ def napi_list_check(nf) -> None: comment=f"queue count after reset queue {q} mode {i}") +def nsim_rxq_reset_down(nf) -> None: + """ + Test that the queue API supports resetting a queue + while the interface is down. We should convert this + test to testing real HW once more devices support + queue API. + """ + with NetdevSimDev(queue_count=4) as nsimdev: + nsim = nsimdev.nsims[0] + + ip(f"link set dev {nsim.ifname} down") + for i in [0, 2, 3]: + nsim.dfs_write("queue_reset", f"1 {i}") + + def page_pool_check(nf) -> None: with NetdevSimDev() as nsimdev: nsim = nsimdev.nsims[0] @@ -106,7 +121,8 @@ def page_pool_check(nf) -> None: def main() -> None: nf = NetdevFamily() - ksft_run([empty_check, lo_check, page_pool_check, napi_list_check], + ksft_run([empty_check, lo_check, page_pool_check, napi_list_check, + nsim_rxq_reset_down], args=(nf, )) ksft_exit() diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index 960e1ab4dd04..3c8d3455d8e7 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -330,6 +330,11 @@ test_psample() { # - drop packets and verify the right drop reason is reported test_drop_reason() { which perf >/dev/null 2>&1 || return $ksft_skip + which pahole >/dev/null 2>&1 || return $ksft_skip + + ovs_drop_subsys=$(pahole -C skb_drop_reason_subsys | + awk '/OPENVSWITCH/ { print $3; }' | + tr -d ,) sbx_add "test_drop_reason" || return $? @@ -373,7 +378,7 @@ test_drop_reason() { "in_port(2),eth(),eth_type(0x0800),ipv4(src=172.31.110.20,proto=1),icmp()" 'drop' ovs_drop_record_and_run "test_drop_reason" ip netns exec client ping -c 2 172.31.110.20 - ovs_drop_reason_count 0x30001 # OVS_DROP_FLOW_ACTION + ovs_drop_reason_count 0x${ovs_drop_subsys}0001 # OVS_DROP_FLOW_ACTION if [[ "$?" -ne "2" ]]; then info "Did not detect expected drops: $?" return 1 @@ -390,7 +395,7 @@ test_drop_reason() { ovs_drop_record_and_run \ "test_drop_reason" ip netns exec client nc -i 1 -zuv 172.31.110.20 6000 - ovs_drop_reason_count 0x30004 # OVS_DROP_EXPLICIT_ACTION_ERROR + ovs_drop_reason_count 0x${ovs_drop_subsys}0004 # OVS_DROP_EXPLICIT_ACTION_ERROR if [[ "$?" -ne "1" ]]; then info "Did not detect expected explicit error drops: $?" return 1 @@ -398,7 +403,7 @@ test_drop_reason() { ovs_drop_record_and_run \ "test_drop_reason" ip netns exec client nc -i 1 -zuv 172.31.110.20 7000 - ovs_drop_reason_count 0x30003 # OVS_DROP_EXPLICIT_ACTION + ovs_drop_reason_count 0x${ovs_drop_subsys}0003 # OVS_DROP_EXPLICIT_ACTION if [[ "$?" -ne "1" ]]; then info "Did not detect expected explicit drops: $?" return 1 diff --git a/tools/testing/selftests/net/proc_net_pktgen.c b/tools/testing/selftests/net/proc_net_pktgen.c new file mode 100644 index 000000000000..69444fb29577 --- /dev/null +++ b/tools/testing/selftests/net/proc_net_pktgen.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * proc_net_pktgen: kselftest for /proc/net/pktgen interface + * + * Copyright (c) 2025 Peter Seiderer <ps.report@gmx.net> + * + */ +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> + +#include "../kselftest_harness.h" + +static const char ctrl_cmd_stop[] = "stop"; +static const char ctrl_cmd_start[] = "start"; +static const char ctrl_cmd_reset[] = "reset"; + +static const char wrong_ctrl_cmd[] = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789"; + +static const char thr_cmd_add_loopback_0[] = "add_device lo@0"; +static const char thr_cmd_rm_loopback_0[] = "rem_device_all"; + +static const char wrong_thr_cmd[] = "forsureawrongcommand"; +static const char legacy_thr_cmd[] = "max_before_softirq"; + +static const char wrong_dev_cmd[] = "forsurewrongcommand"; +static const char dev_cmd_min_pkt_size_0[] = "min_pkt_size"; +static const char dev_cmd_min_pkt_size_1[] = "min_pkt_size "; +static const char dev_cmd_min_pkt_size_2[] = "min_pkt_size 0"; +static const char dev_cmd_min_pkt_size_3[] = "min_pkt_size 1"; +static const char dev_cmd_min_pkt_size_4[] = "min_pkt_size 100"; +static const char dev_cmd_min_pkt_size_5[] = "min_pkt_size=1001"; +static const char dev_cmd_min_pkt_size_6[] = "min_pkt_size =2002"; +static const char dev_cmd_min_pkt_size_7[] = "min_pkt_size= 3003"; +static const char dev_cmd_min_pkt_size_8[] = "min_pkt_size = 4004"; +static const char dev_cmd_max_pkt_size_0[] = "max_pkt_size 200"; +static const char dev_cmd_pkt_size_0[] = "pkt_size 300"; +static const char dev_cmd_imix_weights_0[] = "imix_weights 0,7 576,4 1500,1"; +static const char dev_cmd_imix_weights_1[] = "imix_weights 101,1 102,2 103,3 104,4 105,5 106,6 107,7 108,8 109,9 110,10 111,11 112,12 113,13 114,14 115,15 116,16 117,17 118,18 119,19 120,20"; +static const char dev_cmd_imix_weights_2[] = "imix_weights 100,1 102,2 103,3 104,4 105,5 106,6 107,7 108,8 109,9 110,10 111,11 112,12 113,13 114,14 115,15 116,16 117,17 118,18 119,19 120,20 121,21"; +static const char dev_cmd_imix_weights_3[] = "imix_weights"; +static const char dev_cmd_imix_weights_4[] = "imix_weights "; +static const char dev_cmd_imix_weights_5[] = "imix_weights 0"; +static const char dev_cmd_imix_weights_6[] = "imix_weights 0,"; +static const char dev_cmd_debug_0[] = "debug 1"; +static const char dev_cmd_debug_1[] = "debug 0"; +static const char dev_cmd_frags_0[] = "frags 100"; +static const char dev_cmd_delay_0[] = "delay 100"; +static const char dev_cmd_delay_1[] = "delay 2147483647"; +static const char dev_cmd_rate_0[] = "rate 0"; +static const char dev_cmd_rate_1[] = "rate 100"; +static const char dev_cmd_ratep_0[] = "ratep 0"; +static const char dev_cmd_ratep_1[] = "ratep 200"; +static const char dev_cmd_udp_src_min_0[] = "udp_src_min 1"; +static const char dev_cmd_udp_dst_min_0[] = "udp_dst_min 2"; +static const char dev_cmd_udp_src_max_0[] = "udp_src_max 3"; +static const char dev_cmd_udp_dst_max_0[] = "udp_dst_max 4"; +static const char dev_cmd_clone_skb_0[] = "clone_skb 1"; +static const char dev_cmd_clone_skb_1[] = "clone_skb 0"; +static const char dev_cmd_count_0[] = "count 100"; +static const char dev_cmd_src_mac_count_0[] = "src_mac_count 100"; +static const char dev_cmd_dst_mac_count_0[] = "dst_mac_count 100"; +static const char dev_cmd_burst_0[] = "burst 0"; +static const char dev_cmd_node_0[] = "node 100"; +static const char dev_cmd_xmit_mode_0[] = "xmit_mode start_xmit"; +static const char dev_cmd_xmit_mode_1[] = "xmit_mode netif_receive"; +static const char dev_cmd_xmit_mode_2[] = "xmit_mode queue_xmit"; +static const char dev_cmd_xmit_mode_3[] = "xmit_mode nonsense"; +static const char dev_cmd_flag_0[] = "flag UDPCSUM"; +static const char dev_cmd_flag_1[] = "flag !UDPCSUM"; +static const char dev_cmd_flag_2[] = "flag nonsense"; +static const char dev_cmd_dst_min_0[] = "dst_min 101.102.103.104"; +static const char dev_cmd_dst_0[] = "dst 101.102.103.104"; +static const char dev_cmd_dst_max_0[] = "dst_max 201.202.203.204"; +static const char dev_cmd_dst6_0[] = "dst6 2001:db38:1234:0000:0000:0000:0000:0000"; +static const char dev_cmd_dst6_min_0[] = "dst6_min 2001:db8:1234:0000:0000:0000:0000:0000"; +static const char dev_cmd_dst6_max_0[] = "dst6_max 2001:db8:1234:0000:0000:0000:0000:0000"; +static const char dev_cmd_src6_0[] = "src6 2001:db38:1234:0000:0000:0000:0000:0000"; +static const char dev_cmd_src_min_0[] = "src_min 101.102.103.104"; +static const char dev_cmd_src_max_0[] = "src_max 201.202.203.204"; +static const char dev_cmd_dst_mac_0[] = "dst_mac 01:02:03:04:05:06"; +static const char dev_cmd_src_mac_0[] = "src_mac 11:12:13:14:15:16"; +static const char dev_cmd_clear_counters_0[] = "clear_counters"; +static const char dev_cmd_flows_0[] = "flows 100"; +static const char dev_cmd_spi_0[] = "spi 100"; +static const char dev_cmd_flowlen_0[] = "flowlen 100"; +static const char dev_cmd_queue_map_min_0[] = "queue_map_min 1"; +static const char dev_cmd_queue_map_max_0[] = "queue_map_max 2"; +static const char dev_cmd_mpls_0[] = "mpls 00000001"; +static const char dev_cmd_mpls_1[] = "mpls 00000001,000000f2"; +static const char dev_cmd_mpls_2[] = "mpls 00000f00,00000f01,00000f02,00000f03,00000f04,00000f05,00000f06,00000f07,00000f08,00000f09,00000f0a,00000f0b,00000f0c,00000f0d,00000f0e,00000f0f"; +static const char dev_cmd_mpls_3[] = "mpls 00000f00,00000f01,00000f02,00000f03,00000f04,00000f05,00000f06,00000f07,00000f08,00000f09,00000f0a,00000f0b,00000f0c,00000f0d,00000f0e,00000f0f,00000f10"; +static const char dev_cmd_vlan_id_0[] = "vlan_id 1"; +static const char dev_cmd_vlan_p_0[] = "vlan_p 1"; +static const char dev_cmd_vlan_cfi_0[] = "vlan_cfi 1"; +static const char dev_cmd_vlan_id_1[] = "vlan_id 4096"; +static const char dev_cmd_svlan_id_0[] = "svlan_id 1"; +static const char dev_cmd_svlan_p_0[] = "svlan_p 1"; +static const char dev_cmd_svlan_cfi_0[] = "svlan_cfi 1"; +static const char dev_cmd_svlan_id_1[] = "svlan_id 4096"; +static const char dev_cmd_tos_0[] = "tos 0"; +static const char dev_cmd_tos_1[] = "tos 0f"; +static const char dev_cmd_tos_2[] = "tos 0ff"; +static const char dev_cmd_traffic_class_0[] = "traffic_class f0"; +static const char dev_cmd_skb_priority_0[] = "skb_priority 999"; + +FIXTURE(proc_net_pktgen) { + int ctrl_fd; + int thr_fd; + int dev_fd; +}; + +FIXTURE_SETUP(proc_net_pktgen) { + int r; + ssize_t len; + + r = system("modprobe pktgen"); + ASSERT_EQ(r, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, module pktgen not loaded?"); + + self->ctrl_fd = open("/proc/net/pktgen/pgctrl", O_RDWR); + ASSERT_GE(self->ctrl_fd, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, module pktgen not loaded?"); + + self->thr_fd = open("/proc/net/pktgen/kpktgend_0", O_RDWR); + ASSERT_GE(self->thr_fd, 0) TH_LOG("CONFIG_NET_PKTGEN not enabled, module pktgen not loaded?"); + + len = write(self->thr_fd, thr_cmd_add_loopback_0, sizeof(thr_cmd_add_loopback_0)); + ASSERT_EQ(len, sizeof(thr_cmd_add_loopback_0)) TH_LOG("device lo@0 already registered?"); + + self->dev_fd = open("/proc/net/pktgen/lo@0", O_RDWR); + ASSERT_GE(self->dev_fd, 0) TH_LOG("device entry for lo@0 missing?"); +} + +FIXTURE_TEARDOWN(proc_net_pktgen) { + int ret; + ssize_t len; + + ret = close(self->dev_fd); + EXPECT_EQ(ret, 0); + + len = write(self->thr_fd, thr_cmd_rm_loopback_0, sizeof(thr_cmd_rm_loopback_0)); + EXPECT_EQ(len, sizeof(thr_cmd_rm_loopback_0)); + + ret = close(self->thr_fd); + EXPECT_EQ(ret, 0); + + ret = close(self->ctrl_fd); + EXPECT_EQ(ret, 0); +} + +TEST_F(proc_net_pktgen, wrong_ctrl_cmd) { + for (int i = 0; i <= sizeof(wrong_ctrl_cmd); i++) { + ssize_t len; + + len = write(self->ctrl_fd, wrong_ctrl_cmd, i); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + } +} + +TEST_F(proc_net_pktgen, ctrl_cmd) { + ssize_t len; + + len = write(self->ctrl_fd, ctrl_cmd_stop, sizeof(ctrl_cmd_stop)); + EXPECT_EQ(len, sizeof(ctrl_cmd_stop)); + + len = write(self->ctrl_fd, ctrl_cmd_stop, sizeof(ctrl_cmd_stop) - 1); + EXPECT_EQ(len, sizeof(ctrl_cmd_stop) - 1); + + len = write(self->ctrl_fd, ctrl_cmd_start, sizeof(ctrl_cmd_start)); + EXPECT_EQ(len, sizeof(ctrl_cmd_start)); + + len = write(self->ctrl_fd, ctrl_cmd_start, sizeof(ctrl_cmd_start) - 1); + EXPECT_EQ(len, sizeof(ctrl_cmd_start) - 1); + + len = write(self->ctrl_fd, ctrl_cmd_reset, sizeof(ctrl_cmd_reset)); + EXPECT_EQ(len, sizeof(ctrl_cmd_reset)); + + len = write(self->ctrl_fd, ctrl_cmd_reset, sizeof(ctrl_cmd_reset) - 1); + EXPECT_EQ(len, sizeof(ctrl_cmd_reset) - 1); +} + +TEST_F(proc_net_pktgen, wrong_thr_cmd) { + for (int i = 0; i <= sizeof(wrong_thr_cmd); i++) { + ssize_t len; + + len = write(self->thr_fd, wrong_thr_cmd, i); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + } +} + +TEST_F(proc_net_pktgen, legacy_thr_cmd) { + for (int i = 0; i <= sizeof(legacy_thr_cmd); i++) { + ssize_t len; + + len = write(self->thr_fd, legacy_thr_cmd, i); + if (i < (sizeof(legacy_thr_cmd) - 1)) { + /* incomplete command string */ + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + } else { + /* complete command string without/with trailing '\0' */ + EXPECT_EQ(len, i); + } + } +} + +TEST_F(proc_net_pktgen, wrong_dev_cmd) { + for (int i = 0; i <= sizeof(wrong_dev_cmd); i++) { + ssize_t len; + + len = write(self->dev_fd, wrong_dev_cmd, i); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + } +} + +TEST_F(proc_net_pktgen, dev_cmd_min_pkt_size) { + ssize_t len; + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_min_pkt_size_0, sizeof(dev_cmd_min_pkt_size_0)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_0)); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_min_pkt_size_0, sizeof(dev_cmd_min_pkt_size_0) - 1); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_0) - 1); + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_min_pkt_size_1, sizeof(dev_cmd_min_pkt_size_1)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_1)); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_min_pkt_size_1, sizeof(dev_cmd_min_pkt_size_1) - 1); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_1) - 1); + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_min_pkt_size_2, sizeof(dev_cmd_min_pkt_size_2)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_2)); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_min_pkt_size_2, sizeof(dev_cmd_min_pkt_size_2) - 1); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_2) - 1); + + len = write(self->dev_fd, dev_cmd_min_pkt_size_3, sizeof(dev_cmd_min_pkt_size_3)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_3)); + + len = write(self->dev_fd, dev_cmd_min_pkt_size_4, sizeof(dev_cmd_min_pkt_size_4)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_4)); + + len = write(self->dev_fd, dev_cmd_min_pkt_size_5, sizeof(dev_cmd_min_pkt_size_5)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_5)); + + len = write(self->dev_fd, dev_cmd_min_pkt_size_6, sizeof(dev_cmd_min_pkt_size_6)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_6)); + + len = write(self->dev_fd, dev_cmd_min_pkt_size_7, sizeof(dev_cmd_min_pkt_size_7)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_7)); + + len = write(self->dev_fd, dev_cmd_min_pkt_size_8, sizeof(dev_cmd_min_pkt_size_8)); + EXPECT_EQ(len, sizeof(dev_cmd_min_pkt_size_8)); +} + +TEST_F(proc_net_pktgen, dev_cmd_max_pkt_size) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_max_pkt_size_0, sizeof(dev_cmd_max_pkt_size_0)); + EXPECT_EQ(len, sizeof(dev_cmd_max_pkt_size_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_pkt_size) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_pkt_size_0, sizeof(dev_cmd_pkt_size_0)); + EXPECT_EQ(len, sizeof(dev_cmd_pkt_size_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_imix_weights) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_imix_weights_0, sizeof(dev_cmd_imix_weights_0)); + EXPECT_EQ(len, sizeof(dev_cmd_imix_weights_0)); + + len = write(self->dev_fd, dev_cmd_imix_weights_1, sizeof(dev_cmd_imix_weights_1)); + EXPECT_EQ(len, sizeof(dev_cmd_imix_weights_1)); + + len = write(self->dev_fd, dev_cmd_imix_weights_2, sizeof(dev_cmd_imix_weights_2)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, E2BIG); + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_3, sizeof(dev_cmd_imix_weights_3)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_3, sizeof(dev_cmd_imix_weights_3) - 1); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_4, sizeof(dev_cmd_imix_weights_4)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_4, sizeof(dev_cmd_imix_weights_4) - 1); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_5, sizeof(dev_cmd_imix_weights_5)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_5, sizeof(dev_cmd_imix_weights_5) - 1); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* with trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_6, sizeof(dev_cmd_imix_weights_6)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + /* without trailing '\0' */ + len = write(self->dev_fd, dev_cmd_imix_weights_6, sizeof(dev_cmd_imix_weights_6) - 1); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); +} + +TEST_F(proc_net_pktgen, dev_cmd_debug) { + ssize_t len; + + /* debug on */ + len = write(self->dev_fd, dev_cmd_debug_0, sizeof(dev_cmd_debug_0)); + EXPECT_EQ(len, sizeof(dev_cmd_debug_0)); + + /* debug off */ + len = write(self->dev_fd, dev_cmd_debug_1, sizeof(dev_cmd_debug_1)); + EXPECT_EQ(len, sizeof(dev_cmd_debug_1)); +} + +TEST_F(proc_net_pktgen, dev_cmd_frags) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_frags_0, sizeof(dev_cmd_frags_0)); + EXPECT_EQ(len, sizeof(dev_cmd_frags_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_delay) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_delay_0, sizeof(dev_cmd_delay_0)); + EXPECT_EQ(len, sizeof(dev_cmd_delay_0)); + + len = write(self->dev_fd, dev_cmd_delay_1, sizeof(dev_cmd_delay_1)); + EXPECT_EQ(len, sizeof(dev_cmd_delay_1)); +} + +TEST_F(proc_net_pktgen, dev_cmd_rate) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_rate_0, sizeof(dev_cmd_rate_0)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + len = write(self->dev_fd, dev_cmd_rate_1, sizeof(dev_cmd_rate_1)); + EXPECT_EQ(len, sizeof(dev_cmd_rate_1)); +} + +TEST_F(proc_net_pktgen, dev_cmd_ratep) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_ratep_0, sizeof(dev_cmd_ratep_0)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EINVAL); + + len = write(self->dev_fd, dev_cmd_ratep_1, sizeof(dev_cmd_ratep_1)); + EXPECT_EQ(len, sizeof(dev_cmd_ratep_1)); +} + +TEST_F(proc_net_pktgen, dev_cmd_udp_src_min) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_udp_src_min_0, sizeof(dev_cmd_udp_src_min_0)); + EXPECT_EQ(len, sizeof(dev_cmd_udp_src_min_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_udp_dst_min) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_udp_dst_min_0, sizeof(dev_cmd_udp_dst_min_0)); + EXPECT_EQ(len, sizeof(dev_cmd_udp_dst_min_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_udp_src_max) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_udp_src_max_0, sizeof(dev_cmd_udp_src_max_0)); + EXPECT_EQ(len, sizeof(dev_cmd_udp_src_max_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_udp_dst_max) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_udp_dst_max_0, sizeof(dev_cmd_udp_dst_max_0)); + EXPECT_EQ(len, sizeof(dev_cmd_udp_dst_max_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_clone_skb) { + ssize_t len; + + /* clone_skb on (gives EOPNOTSUPP on lo device) */ + len = write(self->dev_fd, dev_cmd_clone_skb_0, sizeof(dev_cmd_clone_skb_0)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, EOPNOTSUPP); + + /* clone_skb off */ + len = write(self->dev_fd, dev_cmd_clone_skb_1, sizeof(dev_cmd_clone_skb_1)); + EXPECT_EQ(len, sizeof(dev_cmd_clone_skb_1)); +} + +TEST_F(proc_net_pktgen, dev_cmd_count) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_count_0, sizeof(dev_cmd_count_0)); + EXPECT_EQ(len, sizeof(dev_cmd_count_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_src_mac_count) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_src_mac_count_0, sizeof(dev_cmd_src_mac_count_0)); + EXPECT_EQ(len, sizeof(dev_cmd_src_mac_count_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst_mac_count) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst_mac_count_0, sizeof(dev_cmd_dst_mac_count_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst_mac_count_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_burst) { + ssize_t len; + + /* burst off */ + len = write(self->dev_fd, dev_cmd_burst_0, sizeof(dev_cmd_burst_0)); + EXPECT_EQ(len, sizeof(dev_cmd_burst_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_node) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_node_0, sizeof(dev_cmd_node_0)); + EXPECT_EQ(len, sizeof(dev_cmd_node_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_xmit_mode) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_xmit_mode_0, sizeof(dev_cmd_xmit_mode_0)); + EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_0)); + + len = write(self->dev_fd, dev_cmd_xmit_mode_1, sizeof(dev_cmd_xmit_mode_1)); + EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_1)); + + len = write(self->dev_fd, dev_cmd_xmit_mode_2, sizeof(dev_cmd_xmit_mode_2)); + EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_2)); + + len = write(self->dev_fd, dev_cmd_xmit_mode_3, sizeof(dev_cmd_xmit_mode_3)); + EXPECT_EQ(len, sizeof(dev_cmd_xmit_mode_3)); +} + +TEST_F(proc_net_pktgen, dev_cmd_flag) { + ssize_t len; + + /* flag UDPCSUM on */ + len = write(self->dev_fd, dev_cmd_flag_0, sizeof(dev_cmd_flag_0)); + EXPECT_EQ(len, sizeof(dev_cmd_flag_0)); + + /* flag UDPCSUM off */ + len = write(self->dev_fd, dev_cmd_flag_1, sizeof(dev_cmd_flag_1)); + EXPECT_EQ(len, sizeof(dev_cmd_flag_1)); + + /* flag invalid */ + len = write(self->dev_fd, dev_cmd_flag_2, sizeof(dev_cmd_flag_2)); + EXPECT_EQ(len, sizeof(dev_cmd_flag_2)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst_min) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst_min_0, sizeof(dev_cmd_dst_min_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst_min_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst_0, sizeof(dev_cmd_dst_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst_max) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst_max_0, sizeof(dev_cmd_dst_max_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst_max_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst6) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst6_0, sizeof(dev_cmd_dst6_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst6_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst6_min) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst6_min_0, sizeof(dev_cmd_dst6_min_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst6_min_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst6_max) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst6_max_0, sizeof(dev_cmd_dst6_max_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst6_max_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_src6) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_src6_0, sizeof(dev_cmd_src6_0)); + EXPECT_EQ(len, sizeof(dev_cmd_src6_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_src_min) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_src_min_0, sizeof(dev_cmd_src_min_0)); + EXPECT_EQ(len, sizeof(dev_cmd_src_min_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_src_max) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_src_max_0, sizeof(dev_cmd_src_max_0)); + EXPECT_EQ(len, sizeof(dev_cmd_src_max_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_dst_mac) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_dst_mac_0, sizeof(dev_cmd_dst_mac_0)); + EXPECT_EQ(len, sizeof(dev_cmd_dst_mac_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_src_mac) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_src_mac_0, sizeof(dev_cmd_src_mac_0)); + EXPECT_EQ(len, sizeof(dev_cmd_src_mac_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_clear_counters) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_clear_counters_0, sizeof(dev_cmd_clear_counters_0)); + EXPECT_EQ(len, sizeof(dev_cmd_clear_counters_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_flows) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_flows_0, sizeof(dev_cmd_flows_0)); + EXPECT_EQ(len, sizeof(dev_cmd_flows_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_spi) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_spi_0, sizeof(dev_cmd_spi_0)); + EXPECT_EQ(len, sizeof(dev_cmd_spi_0)) TH_LOG("CONFIG_XFRM not enabled?"); +} + +TEST_F(proc_net_pktgen, dev_cmd_flowlen) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_flowlen_0, sizeof(dev_cmd_flowlen_0)); + EXPECT_EQ(len, sizeof(dev_cmd_flowlen_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_queue_map_min) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_queue_map_min_0, sizeof(dev_cmd_queue_map_min_0)); + EXPECT_EQ(len, sizeof(dev_cmd_queue_map_min_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_queue_map_max) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_queue_map_max_0, sizeof(dev_cmd_queue_map_max_0)); + EXPECT_EQ(len, sizeof(dev_cmd_queue_map_max_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_mpls) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_mpls_0, sizeof(dev_cmd_mpls_0)); + EXPECT_EQ(len, sizeof(dev_cmd_mpls_0)); + + len = write(self->dev_fd, dev_cmd_mpls_1, sizeof(dev_cmd_mpls_1)); + EXPECT_EQ(len, sizeof(dev_cmd_mpls_1)); + + len = write(self->dev_fd, dev_cmd_mpls_2, sizeof(dev_cmd_mpls_2)); + EXPECT_EQ(len, sizeof(dev_cmd_mpls_2)); + + len = write(self->dev_fd, dev_cmd_mpls_3, sizeof(dev_cmd_mpls_3)); + EXPECT_EQ(len, -1); + EXPECT_EQ(errno, E2BIG); +} + +TEST_F(proc_net_pktgen, dev_cmd_vlan_id) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_vlan_id_0, sizeof(dev_cmd_vlan_id_0)); + EXPECT_EQ(len, sizeof(dev_cmd_vlan_id_0)); + + len = write(self->dev_fd, dev_cmd_vlan_p_0, sizeof(dev_cmd_vlan_p_0)); + EXPECT_EQ(len, sizeof(dev_cmd_vlan_p_0)); + + len = write(self->dev_fd, dev_cmd_vlan_cfi_0, sizeof(dev_cmd_vlan_cfi_0)); + EXPECT_EQ(len, sizeof(dev_cmd_vlan_cfi_0)); + + len = write(self->dev_fd, dev_cmd_vlan_id_1, sizeof(dev_cmd_vlan_id_1)); + EXPECT_EQ(len, sizeof(dev_cmd_vlan_id_1)); +} + +TEST_F(proc_net_pktgen, dev_cmd_svlan_id) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_svlan_id_0, sizeof(dev_cmd_svlan_id_0)); + EXPECT_EQ(len, sizeof(dev_cmd_svlan_id_0)); + + len = write(self->dev_fd, dev_cmd_svlan_p_0, sizeof(dev_cmd_svlan_p_0)); + EXPECT_EQ(len, sizeof(dev_cmd_svlan_p_0)); + + len = write(self->dev_fd, dev_cmd_svlan_cfi_0, sizeof(dev_cmd_svlan_cfi_0)); + EXPECT_EQ(len, sizeof(dev_cmd_svlan_cfi_0)); + + len = write(self->dev_fd, dev_cmd_svlan_id_1, sizeof(dev_cmd_svlan_id_1)); + EXPECT_EQ(len, sizeof(dev_cmd_svlan_id_1)); +} + + +TEST_F(proc_net_pktgen, dev_cmd_tos) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_tos_0, sizeof(dev_cmd_tos_0)); + EXPECT_EQ(len, sizeof(dev_cmd_tos_0)); + + len = write(self->dev_fd, dev_cmd_tos_1, sizeof(dev_cmd_tos_1)); + EXPECT_EQ(len, sizeof(dev_cmd_tos_1)); + + len = write(self->dev_fd, dev_cmd_tos_2, sizeof(dev_cmd_tos_2)); + EXPECT_EQ(len, sizeof(dev_cmd_tos_2)); +} + + +TEST_F(proc_net_pktgen, dev_cmd_traffic_class) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_traffic_class_0, sizeof(dev_cmd_traffic_class_0)); + EXPECT_EQ(len, sizeof(dev_cmd_traffic_class_0)); +} + +TEST_F(proc_net_pktgen, dev_cmd_skb_priority) { + ssize_t len; + + len = write(self->dev_fd, dev_cmd_skb_priority_0, sizeof(dev_cmd_skb_priority_0)); + EXPECT_EQ(len, sizeof(dev_cmd_skb_priority_0)); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c index 404a2ce759ab..221270cee3ea 100644 --- a/tools/testing/selftests/net/psock_tpacket.c +++ b/tools/testing/selftests/net/psock_tpacket.c @@ -12,7 +12,7 @@ * * Datapath: * Open a pair of packet sockets and send resp. receive an a priori known - * packet pattern accross the sockets and check if it was received resp. + * packet pattern across the sockets and check if it was received resp. * sent correctly. Fanout in combination with RX_RING is currently not * tested here. * diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c index 066efd30e294..7b9bf8a7bbe1 100644 --- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -112,7 +112,7 @@ TEST(reuseaddr_ports_exhausted_reusable_same_euid) ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind."); if (opts->reuseport[0] && opts->reuseport[1]) { - EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened."); + EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets successfully listened."); } else { EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations."); } diff --git a/tools/testing/selftests/net/rtnetlink.py b/tools/testing/selftests/net/rtnetlink.py new file mode 100755 index 000000000000..80950888800b --- /dev/null +++ b/tools/testing/selftests/net/rtnetlink.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_exit, ksft_run, ksft_ge, RtnlAddrFamily +import socket + +IPV4_ALL_HOSTS_MULTICAST = b'\xe0\x00\x00\x01' + +def dump_mcaddr_check(rtnl: RtnlAddrFamily) -> None: + """ + Verify that at least one interface has the IPv4 all-hosts multicast address. + At least the loopback interface should have this address. + """ + + addresses = rtnl.getmaddrs({"ifa-family": socket.AF_INET}, dump=True) + + all_host_multicasts = [ + addr for addr in addresses if addr['ifa-multicast'] == IPV4_ALL_HOSTS_MULTICAST + ] + + ksft_ge(len(all_host_multicasts), 1, + "No interface found with the IPv4 all-hosts multicast address") + +def main() -> None: + rtnl = RtnlAddrFamily() + ksft_run([dump_mcaddr_check], args=(rtnl, )) + ksft_exit() + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/net/setup_veth.sh b/tools/testing/selftests/net/setup_veth.sh index 1f78a87f6f37..152bf4c65747 100644 --- a/tools/testing/selftests/net/setup_veth.sh +++ b/tools/testing/selftests/net/setup_veth.sh @@ -11,7 +11,8 @@ setup_veth_ns() { local -r ns_mac="$4" [[ -e /var/run/netns/"${ns_name}" ]] || ip netns add "${ns_name}" - echo 1000000 > "/sys/class/net/${ns_dev}/gro_flush_timeout" + echo 200000 > "/sys/class/net/${ns_dev}/gro_flush_timeout" + echo 1 > "/sys/class/net/${ns_dev}/napi_defer_hard_irqs" ip link set dev "${ns_dev}" netns "${ns_name}" mtu 65535 ip -netns "${ns_name}" link set dev "${ns_dev}" up diff --git a/tools/testing/selftests/net/so_rcv_listener.c b/tools/testing/selftests/net/so_rcv_listener.c new file mode 100644 index 000000000000..bc5841192aa6 --- /dev/null +++ b/tools/testing/selftests/net/so_rcv_listener.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <errno.h> +#include <netdb.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <linux/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#ifndef SO_RCVPRIORITY +#define SO_RCVPRIORITY 82 +#endif + +struct options { + __u32 val; + int name; + int rcvname; + const char *host; + const char *service; +} opt; + +static void __attribute__((noreturn)) usage(const char *bin) +{ + printf("Usage: %s [opts] <dst host> <dst port / service>\n", bin); + printf("Options:\n" + "\t\t-M val Test SO_RCVMARK\n" + "\t\t-P val Test SO_RCVPRIORITY\n" + ""); + exit(EXIT_FAILURE); +} + +static void parse_args(int argc, char *argv[]) +{ + int o; + + while ((o = getopt(argc, argv, "M:P:")) != -1) { + switch (o) { + case 'M': + opt.val = atoi(optarg); + opt.name = SO_MARK; + opt.rcvname = SO_RCVMARK; + break; + case 'P': + opt.val = atoi(optarg); + opt.name = SO_PRIORITY; + opt.rcvname = SO_RCVPRIORITY; + break; + default: + usage(argv[0]); + break; + } + } + + if (optind != argc - 2) + usage(argv[0]); + + opt.host = argv[optind]; + opt.service = argv[optind + 1]; +} + +int main(int argc, char *argv[]) +{ + int err = 0; + int recv_fd = -1; + int ret_value = 0; + __u32 recv_val; + struct cmsghdr *cmsg; + char cbuf[CMSG_SPACE(sizeof(__u32))]; + char recv_buf[CMSG_SPACE(sizeof(__u32))]; + struct iovec iov[1]; + struct msghdr msg; + struct sockaddr_in recv_addr4; + struct sockaddr_in6 recv_addr6; + + parse_args(argc, argv); + + int family = strchr(opt.host, ':') ? AF_INET6 : AF_INET; + + recv_fd = socket(family, SOCK_DGRAM, IPPROTO_UDP); + if (recv_fd < 0) { + perror("Can't open recv socket"); + ret_value = -errno; + goto cleanup; + } + + err = setsockopt(recv_fd, SOL_SOCKET, opt.rcvname, &opt.val, sizeof(opt.val)); + if (err < 0) { + perror("Recv setsockopt error"); + ret_value = -errno; + goto cleanup; + } + + if (family == AF_INET) { + memset(&recv_addr4, 0, sizeof(recv_addr4)); + recv_addr4.sin_family = family; + recv_addr4.sin_port = htons(atoi(opt.service)); + + if (inet_pton(family, opt.host, &recv_addr4.sin_addr) <= 0) { + perror("Invalid IPV4 address"); + ret_value = -errno; + goto cleanup; + } + + err = bind(recv_fd, (struct sockaddr *)&recv_addr4, sizeof(recv_addr4)); + } else { + memset(&recv_addr6, 0, sizeof(recv_addr6)); + recv_addr6.sin6_family = family; + recv_addr6.sin6_port = htons(atoi(opt.service)); + + if (inet_pton(family, opt.host, &recv_addr6.sin6_addr) <= 0) { + perror("Invalid IPV6 address"); + ret_value = -errno; + goto cleanup; + } + + err = bind(recv_fd, (struct sockaddr *)&recv_addr6, sizeof(recv_addr6)); + } + + if (err < 0) { + perror("Recv bind error"); + ret_value = -errno; + goto cleanup; + } + + iov[0].iov_base = recv_buf; + iov[0].iov_len = sizeof(recv_buf); + + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + + err = recvmsg(recv_fd, &msg, 0); + if (err < 0) { + perror("Message receive error"); + ret_value = -errno; + goto cleanup; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == opt.name) { + recv_val = *(__u32 *)CMSG_DATA(cmsg); + printf("Received value: %u\n", recv_val); + + if (recv_val != opt.val) { + fprintf(stderr, "Error: expected value: %u, got: %u\n", + opt.val, recv_val); + ret_value = -EINVAL; + } + goto cleanup; + } + } + + fprintf(stderr, "Error: No matching cmsg received\n"); + ret_value = -ENOMSG; + +cleanup: + if (recv_fd >= 0) + close(recv_fd); + + return ret_value; +} diff --git a/tools/testing/selftests/net/tcp_ao/connect-deny.c b/tools/testing/selftests/net/tcp_ao/connect-deny.c index d418162d335f..93b61e9a36f1 100644 --- a/tools/testing/selftests/net/tcp_ao/connect-deny.c +++ b/tools/testing/selftests/net/tcp_ao/connect-deny.c @@ -4,6 +4,7 @@ #include "aolib.h" #define fault(type) (inj == FAULT_ ## type) +static volatile int sk_pair; static inline int test_add_key_maclen(int sk, const char *key, uint8_t maclen, union tcp_addr in_addr, uint8_t prefix, @@ -34,10 +35,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, const char *cnt_name, test_cnt cnt_expected, fault_t inj) { - struct tcp_ao_counters ao_cnt1, ao_cnt2; + struct tcp_counters cnt1, cnt2; uint64_t before_cnt = 0, after_cnt = 0; /* silence GCC */ + test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected; int lsk, err, sk = 0; - time_t timeout; lsk = test_listen_socket(this_ip_addr, port, 1); @@ -46,21 +47,24 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, if (cnt_name) before_cnt = netstat_get_one(cnt_name, NULL); - if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt1)) - test_error("test_get_tcp_ao_counters()"); + if (pwd && test_get_tcp_counters(lsk, &cnt1)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* preparations done */ - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - err = test_wait_fd(lsk, timeout, 0); + err = test_skpair_wait_poll(lsk, 0, poll_cnt, &sk_pair); if (err == -ETIMEDOUT) { + sk_pair = err; if (!fault(TIMEOUT)) - test_fail("timed out for accept()"); + test_fail("%s: timed out for accept()", tst_name); + } else if (err == -EKEYREJECTED) { + if (!fault(KEYREJECT)) + test_fail("%s: key was rejected", tst_name); } else if (err < 0) { - test_error("test_wait_fd()"); + test_error("test_skpair_wait_poll()"); } else { if (fault(TIMEOUT)) - test_fail("ready to accept"); + test_fail("%s: ready to accept", tst_name); sk = accept(lsk, NULL, NULL); if (sk < 0) { @@ -72,13 +76,13 @@ static void try_accept(const char *tst_name, unsigned int port, const char *pwd, } synchronize_threads(); /* before counter checks */ - if (pwd && test_get_tcp_ao_counters(lsk, &ao_cnt2)) - test_error("test_get_tcp_ao_counters()"); + if (pwd && test_get_tcp_counters(lsk, &cnt2)) + test_error("test_get_tcp_counters()"); close(lsk); if (pwd) - test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); + test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected); if (!cnt_name) goto out; @@ -109,7 +113,7 @@ static void *server_fn(void *arg) try_accept("Non-AO server + AO client", port++, NULL, this_ip_dest, -1, 100, 100, 0, - "TCPAOKeyNotFound", 0, FAULT_TIMEOUT); + "TCPAOKeyNotFound", TEST_CNT_NS_KEY_NOT_FOUND, FAULT_TIMEOUT); try_accept("AO server + Non-AO client", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, @@ -135,8 +139,9 @@ static void *server_fn(void *arg) wrong_addr, -1, 100, 100, 0, "TCPAOKeyNotFound", TEST_CNT_AO_KEY_NOT_FOUND, FAULT_TIMEOUT); + /* Key rejected by the other side, failing short through skpair */ try_accept("Client: Wrong addr", port++, NULL, - this_ip_dest, -1, 100, 100, 0, NULL, 0, FAULT_TIMEOUT); + this_ip_dest, -1, 100, 100, 0, NULL, 0, FAULT_KEYREJECT); try_accept("rcv id != snd id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 200, 100, 0, @@ -163,8 +168,7 @@ static void try_connect(const char *tst_name, unsigned int port, uint8_t sndid, uint8_t rcvid, test_cnt cnt_expected, fault_t inj) { - struct tcp_ao_counters ao_cnt1, ao_cnt2; - time_t timeout; + struct tcp_counters cnt1, cnt2; int sk, ret; sk = socket(test_family, SOCK_STREAM, IPPROTO_TCP); @@ -174,16 +178,15 @@ static void try_connect(const char *tst_name, unsigned int port, if (pwd && test_add_key(sk, pwd, addr, prefix, sndid, rcvid)) test_error("setsockopt(TCP_AO_ADD_KEY)"); - if (pwd && test_get_tcp_ao_counters(sk, &ao_cnt1)) - test_error("test_get_tcp_ao_counters()"); + if (pwd && test_get_tcp_counters(sk, &cnt1)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* preparations done */ - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - ret = _test_connect_socket(sk, this_ip_dest, port, timeout); - + ret = test_skpair_connect_poll(sk, this_ip_dest, port, cnt_expected, &sk_pair); synchronize_threads(); /* before counter checks */ if (ret < 0) { + sk_pair = ret; if (fault(KEYREJECT) && ret == -EKEYREJECTED) { test_ok("%s: connect() was prevented", tst_name); } else if (ret == -ETIMEDOUT && fault(TIMEOUT)) { @@ -202,9 +205,11 @@ static void try_connect(const char *tst_name, unsigned int port, else test_ok("%s: connected", tst_name); if (pwd && ret > 0) { - if (test_get_tcp_ao_counters(sk, &ao_cnt2)) - test_error("test_get_tcp_ao_counters()"); - test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); + test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected); + } else if (pwd) { + test_tcp_counters_free(&cnt1); } out: synchronize_threads(); /* close() */ @@ -241,6 +246,11 @@ static void *client_fn(void *arg) try_connect("Wrong rcv id", port++, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100, 0, FAULT_TIMEOUT); + /* + * XXX: The test doesn't increase any counters, see tcp_make_synack(). + * Potentially, it can be speed up by setting sk_pair = -ETIMEDOUT + * but the price would be increased complexity of the tracer thread. + */ trace_ao_event_sk_expect(TCP_AO_SYNACK_NO_KEY, this_ip_dest, addr_any, port, 0, 100, 100); try_connect("Wrong snd id", port++, DEFAULT_TEST_PASSWORD, diff --git a/tools/testing/selftests/net/tcp_ao/connect.c b/tools/testing/selftests/net/tcp_ao/connect.c index f1d8d29e393f..340f00e979ea 100644 --- a/tools/testing/selftests/net/tcp_ao/connect.c +++ b/tools/testing/selftests/net/tcp_ao/connect.c @@ -35,7 +35,7 @@ static void *client_fn(void *arg) uint64_t before_aogood, after_aogood; const size_t nr_packets = 20; struct netstat *ns_before, *ns_after; - struct tcp_ao_counters ao1, ao2; + struct tcp_counters ao1, ao2; if (sk < 0) test_error("socket()"); @@ -50,18 +50,18 @@ static void *client_fn(void *arg) ns_before = netstat_read(); before_aogood = netstat_get(ns_before, "TCPAOGood", NULL); - if (test_get_tcp_ao_counters(sk, &ao1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &ao1)) + test_error("test_get_tcp_counters()"); - if (test_client_verify(sk, 100, nr_packets, TEST_TIMEOUT_SEC)) { + if (test_client_verify(sk, 100, nr_packets)) { test_fail("verify failed"); return NULL; } ns_after = netstat_read(); after_aogood = netstat_get(ns_after, "TCPAOGood", NULL); - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &ao2)) + test_error("test_get_tcp_counters()"); netstat_print_diff(ns_before, ns_after); netstat_free(ns_before); netstat_free(ns_after); @@ -71,14 +71,14 @@ static void *client_fn(void *arg) nr_packets, after_aogood, before_aogood); return NULL; } - if (test_tcp_ao_counters_cmp("connect", &ao1, &ao2, TEST_CNT_GOOD)) + if (test_assert_counters("connect", &ao1, &ao2, TEST_CNT_GOOD)) return NULL; test_ok("connect TCPAOGood %" PRIu64 "/%" PRIu64 "/%" PRIu64 " => %" PRIu64 "/%" PRIu64 "/%" PRIu64 ", sent %zu", - before_aogood, ao1.ao_info_pkt_good, - ao1.key_cnts[0].pkt_good, - after_aogood, ao2.ao_info_pkt_good, - ao2.key_cnts[0].pkt_good, + before_aogood, ao1.ao.ao_info_pkt_good, + ao1.ao.key_cnts[0].pkt_good, + after_aogood, ao2.ao.ao_info_pkt_good, + ao2.ao.key_cnts[0].pkt_good, nr_packets); return NULL; } diff --git a/tools/testing/selftests/net/tcp_ao/icmps-discard.c b/tools/testing/selftests/net/tcp_ao/icmps-discard.c index a1614f0d8c44..85c1a1e958c6 100644 --- a/tools/testing/selftests/net/tcp_ao/icmps-discard.c +++ b/tools/testing/selftests/net/tcp_ao/icmps-discard.c @@ -53,7 +53,7 @@ static void serve_interfered(int sk) ssize_t test_quota = packet_size * packets_nr * 10; uint64_t dest_unreach_a, dest_unreach_b; uint64_t icmp_ignored_a, icmp_ignored_b; - struct tcp_ao_counters ao_cnt1, ao_cnt2; + struct tcp_counters cnt1, cnt2; bool counter_not_found; struct netstat *ns_after, *ns_before; ssize_t bytes; @@ -61,16 +61,16 @@ static void serve_interfered(int sk) ns_before = netstat_read(); dest_unreach_a = netstat_get(ns_before, dst_unreach, NULL); icmp_ignored_a = netstat_get(ns_before, tcpao_icmps, NULL); - if (test_get_tcp_ao_counters(sk, &ao_cnt1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt1)) + test_error("test_get_tcp_counters()"); bytes = test_server_run(sk, test_quota, 0); ns_after = netstat_read(); netstat_print_diff(ns_before, ns_after); dest_unreach_b = netstat_get(ns_after, dst_unreach, NULL); icmp_ignored_b = netstat_get(ns_after, tcpao_icmps, &counter_not_found); - if (test_get_tcp_ao_counters(sk, &ao_cnt2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); netstat_free(ns_before); netstat_free(ns_after); @@ -91,9 +91,9 @@ static void serve_interfered(int sk) return; } #ifdef TEST_ICMPS_ACCEPT - test_tcp_ao_counters_cmp(NULL, &ao_cnt1, &ao_cnt2, TEST_CNT_GOOD); + test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD); #else - test_tcp_ao_counters_cmp(NULL, &ao_cnt1, &ao_cnt2, TEST_CNT_GOOD | TEST_CNT_AO_DROPPED_ICMP); + test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD | TEST_CNT_AO_DROPPED_ICMP); #endif if (icmp_ignored_a >= icmp_ignored_b) { test_icmps_fail("%s counter didn't change: %" PRIu64 " >= %" PRIu64, @@ -395,7 +395,6 @@ static void icmp_interfere(const size_t nr, uint32_t rcv_nxt, void *src, void *d static void send_interfered(int sk) { - const unsigned int timeout = TEST_TIMEOUT_SEC; struct sockaddr_in6 src, dst; socklen_t addr_sz; @@ -409,7 +408,7 @@ static void send_interfered(int sk) while (1) { uint32_t rcv_nxt; - if (test_client_verify(sk, packet_size, packets_nr, timeout)) { + if (test_client_verify(sk, packet_size, packets_nr)) { test_fail("client: connection is broken"); return; } diff --git a/tools/testing/selftests/net/tcp_ao/key-management.c b/tools/testing/selftests/net/tcp_ao/key-management.c index d4385b52c10b..69d9a7a05d5c 100644 --- a/tools/testing/selftests/net/tcp_ao/key-management.c +++ b/tools/testing/selftests/net/tcp_ao/key-management.c @@ -629,11 +629,11 @@ static int key_collection_socket(bool server, unsigned int port) } static void verify_counters(const char *tst_name, bool is_listen_sk, bool server, - struct tcp_ao_counters *a, struct tcp_ao_counters *b) + struct tcp_counters *a, struct tcp_counters *b) { unsigned int i; - __test_tcp_ao_counters_cmp(tst_name, a, b, TEST_CNT_GOOD); + test_assert_counters_sk(tst_name, a, b, TEST_CNT_GOOD); for (i = 0; i < collection.nr_keys; i++) { struct test_key *key = &collection.keys[i]; @@ -652,12 +652,12 @@ static void verify_counters(const char *tst_name, bool is_listen_sk, bool server rx_cnt_expected = key->used_on_server_tx; } - test_tcp_ao_key_counters_cmp(tst_name, a, b, - rx_cnt_expected ? TEST_CNT_KEY_GOOD : 0, - sndid, rcvid); + test_assert_counters_key(tst_name, &a->ao, &b->ao, + rx_cnt_expected ? TEST_CNT_KEY_GOOD : 0, + sndid, rcvid); } - test_tcp_ao_counters_free(a); - test_tcp_ao_counters_free(b); + test_tcp_counters_free(a); + test_tcp_counters_free(b); test_ok("%s: passed counters checks", tst_name); } @@ -791,17 +791,17 @@ out: } static int start_server(const char *tst_name, unsigned int port, size_t quota, - struct tcp_ao_counters *begin, + struct tcp_counters *begin, unsigned int current_index, unsigned int rnext_index) { - struct tcp_ao_counters lsk_c1, lsk_c2; + struct tcp_counters lsk_c1, lsk_c2; ssize_t bytes; int sk, lsk; synchronize_threads(); /* 1: key collection initialized */ lsk = key_collection_socket(true, port); - if (test_get_tcp_ao_counters(lsk, &lsk_c1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(lsk, &lsk_c1)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* 2: MKTs added => connect() */ if (test_wait_fd(lsk, TEST_TIMEOUT_SEC, 0)) test_error("test_wait_fd()"); @@ -809,12 +809,12 @@ static int start_server(const char *tst_name, unsigned int port, size_t quota, sk = accept(lsk, NULL, NULL); if (sk < 0) test_error("accept()"); - if (test_get_tcp_ao_counters(sk, begin)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, begin)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* 3: accepted => send data */ - if (test_get_tcp_ao_counters(lsk, &lsk_c2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(lsk, &lsk_c2)) + test_error("test_get_tcp_counters()"); verify_keys(tst_name, lsk, true, true); close(lsk); @@ -830,12 +830,12 @@ static int start_server(const char *tst_name, unsigned int port, size_t quota, } static void end_server(const char *tst_name, int sk, - struct tcp_ao_counters *begin) + struct tcp_counters *begin) { - struct tcp_ao_counters end; + struct tcp_counters end; - if (test_get_tcp_ao_counters(sk, &end)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &end)) + test_error("test_get_tcp_counters()"); verify_keys(tst_name, sk, false, true); synchronize_threads(); /* 4: verified => closed */ @@ -848,7 +848,7 @@ static void end_server(const char *tst_name, int sk, static void try_server_run(const char *tst_name, unsigned int port, size_t quota, unsigned int current_index, unsigned int rnext_index) { - struct tcp_ao_counters tmp; + struct tcp_counters tmp; int sk; sk = start_server(tst_name, port, quota, &tmp, @@ -860,7 +860,7 @@ static void server_rotations(const char *tst_name, unsigned int port, size_t quota, unsigned int rotations, unsigned int current_index, unsigned int rnext_index) { - struct tcp_ao_counters tmp; + struct tcp_counters tmp; unsigned int i; int sk; @@ -886,7 +886,7 @@ static void server_rotations(const char *tst_name, unsigned int port, static int run_client(const char *tst_name, unsigned int port, unsigned int nr_keys, int current_index, int rnext_index, - struct tcp_ao_counters *before, + struct tcp_counters *before, const size_t msg_sz, const size_t msg_nr) { int sk; @@ -904,8 +904,8 @@ static int run_client(const char *tst_name, unsigned int port, if (test_set_key(sk, sndid, rcvid)) test_error("failed to set current/rnext keys"); } - if (before && test_get_tcp_ao_counters(sk, before)) - test_error("test_get_tcp_ao_counters()"); + if (before && test_get_tcp_counters(sk, before)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* 2: MKTs added => connect() */ if (test_connect_socket(sk, this_ip_dest, port++) <= 0) @@ -918,11 +918,11 @@ static int run_client(const char *tst_name, unsigned int port, collection.keys[rnext_index].used_on_server_tx = 1; synchronize_threads(); /* 3: accepted => send data */ - if (test_client_verify(sk, msg_sz, msg_nr, TEST_TIMEOUT_SEC)) { + if (test_client_verify(sk, msg_sz, msg_nr)) { test_fail("verify failed"); close(sk); if (before) - test_tcp_ao_counters_free(before); + test_tcp_counters_free(before); return -1; } @@ -931,7 +931,7 @@ static int run_client(const char *tst_name, unsigned int port, static int start_client(const char *tst_name, unsigned int port, unsigned int nr_keys, int current_index, int rnext_index, - struct tcp_ao_counters *before, + struct tcp_counters *before, const size_t msg_sz, const size_t msg_nr) { if (init_default_key_collection(nr_keys, true)) @@ -943,9 +943,9 @@ static int start_client(const char *tst_name, unsigned int port, static void end_client(const char *tst_name, int sk, unsigned int nr_keys, int current_index, int rnext_index, - struct tcp_ao_counters *start) + struct tcp_counters *start) { - struct tcp_ao_counters end; + struct tcp_counters end; /* Some application may become dependent on this kernel choice */ if (current_index < 0) @@ -955,8 +955,8 @@ static void end_client(const char *tst_name, int sk, unsigned int nr_keys, verify_current_rnext(tst_name, sk, collection.keys[current_index].client_keyid, collection.keys[rnext_index].server_keyid); - if (start && test_get_tcp_ao_counters(sk, &end)) - test_error("test_get_tcp_ao_counters()"); + if (start && test_get_tcp_counters(sk, &end)) + test_error("test_get_tcp_counters()"); verify_keys(tst_name, sk, false, false); synchronize_threads(); /* 4: verify => closed */ close(sk); @@ -1016,7 +1016,7 @@ static void try_unmatched_keys(int sk, int *rnext_index, unsigned int port) trace_ao_event_expect(TCP_AO_RNEXT_REQUEST, this_ip_addr, this_ip_dest, -1, port, 0, -1, -1, -1, -1, -1, -1, key->server_keyid, -1); - if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) + if (test_client_verify(sk, msg_len, nr_packets)) test_fail("verify failed"); *rnext_index = i; } @@ -1048,7 +1048,7 @@ static void check_current_back(const char *tst_name, unsigned int port, unsigned int current_index, unsigned int rnext_index, unsigned int rotate_to_index) { - struct tcp_ao_counters tmp; + struct tcp_counters tmp; int sk; sk = start_client(tst_name, port, nr_keys, current_index, rnext_index, @@ -1061,7 +1061,7 @@ static void check_current_back(const char *tst_name, unsigned int port, port, -1, 0, -1, -1, -1, -1, -1, collection.keys[rotate_to_index].client_keyid, collection.keys[current_index].client_keyid, -1); - if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) + if (test_client_verify(sk, msg_len, nr_packets)) test_fail("verify failed"); /* There is a race here: between setting the current_key with * setsockopt(TCP_AO_INFO) and starting to send some data - there @@ -1081,7 +1081,7 @@ static void roll_over_keys(const char *tst_name, unsigned int port, unsigned int nr_keys, unsigned int rotations, unsigned int current_index, unsigned int rnext_index) { - struct tcp_ao_counters tmp; + struct tcp_counters tmp; unsigned int i; int sk; @@ -1099,10 +1099,10 @@ static void roll_over_keys(const char *tst_name, unsigned int port, collection.keys[i].server_keyid, -1); if (test_set_key(sk, -1, collection.keys[i].server_keyid)) test_error("Can't change the Rnext key"); - if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) { + if (test_client_verify(sk, msg_len, nr_packets)) { test_fail("verify failed"); close(sk); - test_tcp_ao_counters_free(&tmp); + test_tcp_counters_free(&tmp); return; } verify_current_rnext(tst_name, sk, -1, @@ -1116,7 +1116,7 @@ static void roll_over_keys(const char *tst_name, unsigned int port, static void try_client_run(const char *tst_name, unsigned int port, unsigned int nr_keys, int current_index, int rnext_index) { - struct tcp_ao_counters tmp; + struct tcp_counters tmp; int sk; sk = start_client(tst_name, port, nr_keys, current_index, rnext_index, diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index 5db2f65cddc4..ebb2899c12fe 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -289,7 +289,7 @@ extern int link_set_up(const char *intf); extern const unsigned int test_server_port; extern int test_wait_fd(int sk, time_t sec, bool write); extern int __test_connect_socket(int sk, const char *device, - void *addr, size_t addr_sz, time_t timeout); + void *addr, size_t addr_sz, bool async); extern int __test_listen_socket(int backlog, void *addr, size_t addr_sz); static inline int test_listen_socket(const union tcp_addr taddr, @@ -331,25 +331,26 @@ static inline int test_listen_socket(const union tcp_addr taddr, * If set to 0 - kernel will try to retransmit SYN number of times, set in * /proc/sys/net/ipv4/tcp_syn_retries * By default set to 1 to make tests pass faster on non-busy machine. + * [in process of removal, don't use in new tests] */ #ifndef TEST_RETRANSMIT_SEC #define TEST_RETRANSMIT_SEC 1 #endif static inline int _test_connect_socket(int sk, const union tcp_addr taddr, - unsigned int port, time_t timeout) + unsigned int port, bool async) { sockaddr_af addr; tcp_addr_to_sockaddr_in(&addr, &taddr, htons(port)); return __test_connect_socket(sk, veth_name, - (void *)&addr, sizeof(addr), timeout); + (void *)&addr, sizeof(addr), async); } static inline int test_connect_socket(int sk, const union tcp_addr taddr, unsigned int port) { - return _test_connect_socket(sk, taddr, port, TEST_TIMEOUT_SEC); + return _test_connect_socket(sk, taddr, port, false); } extern int __test_set_md5(int sk, void *addr, size_t addr_sz, @@ -483,10 +484,7 @@ static inline int test_set_ao_flags(int sk, bool ao_required, bool accept_icmps) } extern ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec); -extern ssize_t test_client_loop(int sk, char *buf, size_t buf_sz, - const size_t msg_len, time_t timeout_sec); -extern int test_client_verify(int sk, const size_t msg_len, const size_t nr, - time_t timeout_sec); +extern int test_client_verify(int sk, const size_t msg_len, const size_t nr); struct tcp_ao_key_counters { uint8_t sndid; @@ -512,7 +510,15 @@ struct tcp_ao_counters { size_t nr_keys; struct tcp_ao_key_counters *key_cnts; }; -extern int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out); + +struct tcp_counters { + struct tcp_ao_counters ao; + uint64_t netns_md5_notfound; + uint64_t netns_md5_unexpected; + uint64_t netns_md5_failure; +}; + +extern int test_get_tcp_counters(int sk, struct tcp_counters *out); #define TEST_CNT_KEY_GOOD BIT(0) #define TEST_CNT_KEY_BAD BIT(1) @@ -526,8 +532,31 @@ extern int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out); #define TEST_CNT_NS_KEY_NOT_FOUND BIT(9) #define TEST_CNT_NS_AO_REQUIRED BIT(10) #define TEST_CNT_NS_DROPPED_ICMP BIT(11) +#define TEST_CNT_NS_MD5_NOT_FOUND BIT(12) +#define TEST_CNT_NS_MD5_UNEXPECTED BIT(13) +#define TEST_CNT_NS_MD5_FAILURE BIT(14) typedef uint16_t test_cnt; +#define _for_each_counter(f) \ +do { \ + /* per-netns */ \ + f(ao.netns_ao_good, TEST_CNT_NS_GOOD); \ + f(ao.netns_ao_bad, TEST_CNT_NS_BAD); \ + f(ao.netns_ao_key_not_found, TEST_CNT_NS_KEY_NOT_FOUND); \ + f(ao.netns_ao_required, TEST_CNT_NS_AO_REQUIRED); \ + f(ao.netns_ao_dropped_icmp, TEST_CNT_NS_DROPPED_ICMP); \ + /* per-socket */ \ + f(ao.ao_info_pkt_good, TEST_CNT_SOCK_GOOD); \ + f(ao.ao_info_pkt_bad, TEST_CNT_SOCK_BAD); \ + f(ao.ao_info_pkt_key_not_found, TEST_CNT_SOCK_KEY_NOT_FOUND); \ + f(ao.ao_info_pkt_ao_required, TEST_CNT_SOCK_AO_REQUIRED); \ + f(ao.ao_info_pkt_dropped_icmp, TEST_CNT_SOCK_DROPPED_ICMP); \ + /* non-AO */ \ + f(netns_md5_notfound, TEST_CNT_NS_MD5_NOT_FOUND); \ + f(netns_md5_unexpected, TEST_CNT_NS_MD5_UNEXPECTED); \ + f(netns_md5_failure, TEST_CNT_NS_MD5_FAILURE); \ +} while (0) + #define TEST_CNT_AO_GOOD (TEST_CNT_SOCK_GOOD | TEST_CNT_NS_GOOD) #define TEST_CNT_AO_BAD (TEST_CNT_SOCK_BAD | TEST_CNT_NS_BAD) #define TEST_CNT_AO_KEY_NOT_FOUND (TEST_CNT_SOCK_KEY_NOT_FOUND | \ @@ -539,34 +568,71 @@ typedef uint16_t test_cnt; #define TEST_CNT_GOOD (TEST_CNT_KEY_GOOD | TEST_CNT_AO_GOOD) #define TEST_CNT_BAD (TEST_CNT_KEY_BAD | TEST_CNT_AO_BAD) -extern int __test_tcp_ao_counters_cmp(const char *tst_name, - struct tcp_ao_counters *before, struct tcp_ao_counters *after, +extern test_cnt test_cmp_counters(struct tcp_counters *before, + struct tcp_counters *after); +extern int test_assert_counters_sk(const char *tst_name, + struct tcp_counters *before, struct tcp_counters *after, test_cnt expected); -extern int test_tcp_ao_key_counters_cmp(const char *tst_name, +extern int test_assert_counters_key(const char *tst_name, struct tcp_ao_counters *before, struct tcp_ao_counters *after, test_cnt expected, int sndid, int rcvid); -extern void test_tcp_ao_counters_free(struct tcp_ao_counters *cnts); +extern void test_tcp_counters_free(struct tcp_counters *cnts); + +/* + * Polling for netns and socket counters during select()/connect() and also + * client/server messaging. Instead of constant timeout on underlying select(), + * check the counters and return early. This allows to pass the tests where + * timeout is expected without waiting for that fixing timeout (tests speed-up). + * Previously shorter timeouts were used for tests expecting to time out, + * but that leaded to sporadic false positives on counter checks failures, + * as one second timeouts aren't enough for TCP retransmit. + * + * Two sides of the socketpair (client/server) should synchronize failures + * using a shared variable *err, so that they can detect the other side's + * failure. + */ +extern int test_skpair_wait_poll(int sk, bool write, test_cnt cond, + volatile int *err); +extern int _test_skpair_connect_poll(int sk, const char *device, + void *addr, size_t addr_sz, + test_cnt cond, volatile int *err); +static inline int test_skpair_connect_poll(int sk, const union tcp_addr taddr, + unsigned int port, + test_cnt cond, volatile int *err) +{ + sockaddr_af addr; + + tcp_addr_to_sockaddr_in(&addr, &taddr, htons(port)); + return _test_skpair_connect_poll(sk, veth_name, + (void *)&addr, sizeof(addr), cond, err); +} + +extern int test_skpair_client(int sk, const size_t msg_len, const size_t nr, + test_cnt cond, volatile int *err); +extern int test_skpair_server(int sk, ssize_t quota, + test_cnt cond, volatile int *err); + /* - * Frees buffers allocated in test_get_tcp_ao_counters(). + * Frees buffers allocated in test_get_tcp_counters(). * The function doesn't expect new keys or keys removed between calls - * to test_get_tcp_ao_counters(). Check key counters manually if they + * to test_get_tcp_counters(). Check key counters manually if they * may change. */ -static inline int test_tcp_ao_counters_cmp(const char *tst_name, - struct tcp_ao_counters *before, - struct tcp_ao_counters *after, - test_cnt expected) +static inline int test_assert_counters(const char *tst_name, + struct tcp_counters *before, + struct tcp_counters *after, + test_cnt expected) { int ret; - ret = __test_tcp_ao_counters_cmp(tst_name, before, after, expected); + ret = test_assert_counters_sk(tst_name, before, after, expected); if (ret) goto out; - ret = test_tcp_ao_key_counters_cmp(tst_name, before, after, - expected, -1, -1); + ret = test_assert_counters_key(tst_name, &before->ao, &after->ao, + expected, -1, -1); out: - test_tcp_ao_counters_free(before); - test_tcp_ao_counters_free(after); + test_tcp_counters_free(before); + test_tcp_counters_free(after); return ret; } diff --git a/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c index 24380c68fec6..27403f875054 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c +++ b/tools/testing/selftests/net/tcp_ao/lib/ftrace-tcp.c @@ -427,11 +427,8 @@ static void dump_trace_event(struct expected_trace_point *e) test_print("trace event filter %s [%s:%d => %s:%d, L3index %d, flags: %s%s%s%s%s, keyid: %d, rnext: %d, maclen: %d, sne: %d] = %zu", trace_event_names[e->type], src, e->src_port, dst, e->dst_port, e->L3index, - (e->fin > 0) ? "F" : (e->fin == 0) ? "!F" : "", - (e->syn > 0) ? "S" : (e->syn == 0) ? "!S" : "", - (e->rst > 0) ? "R" : (e->rst == 0) ? "!R" : "", - (e->psh > 0) ? "P" : (e->psh == 0) ? "!P" : "", - (e->ack > 0) ? "." : (e->ack == 0) ? "!." : "", + e->fin ? "F" : "", e->syn ? "S" : "", e->rst ? "R" : "", + e->psh ? "P" : "", e->ack ? "." : "", e->keyid, e->rnext, e->maclen, e->sne, e->matched); } diff --git a/tools/testing/selftests/net/tcp_ao/lib/sock.c b/tools/testing/selftests/net/tcp_ao/lib/sock.c index 0ffda966c677..ef8e9031d47a 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/sock.c +++ b/tools/testing/selftests/net/tcp_ao/lib/sock.c @@ -34,10 +34,8 @@ int __test_listen_socket(int backlog, void *addr, size_t addr_sz) return sk; } -int test_wait_fd(int sk, time_t sec, bool write) +static int __test_wait_fd(int sk, struct timeval *tv, bool write) { - struct timeval tv = { .tv_sec = sec }; - struct timeval *ptv = NULL; fd_set fds, efds; int ret; socklen_t slen = sizeof(ret); @@ -47,14 +45,11 @@ int test_wait_fd(int sk, time_t sec, bool write) FD_ZERO(&efds); FD_SET(sk, &efds); - if (sec) - ptv = &tv; - errno = 0; if (write) - ret = select(sk + 1, NULL, &fds, &efds, ptv); + ret = select(sk + 1, NULL, &fds, &efds, tv); else - ret = select(sk + 1, &fds, NULL, &efds, ptv); + ret = select(sk + 1, &fds, NULL, &efds, tv); if (ret < 0) return -errno; if (ret == 0) { @@ -69,8 +64,54 @@ int test_wait_fd(int sk, time_t sec, bool write) return 0; } +int test_wait_fd(int sk, time_t sec, bool write) +{ + struct timeval tv = { .tv_sec = sec, }; + + return __test_wait_fd(sk, sec ? &tv : NULL, write); +} + +static bool __skpair_poll_should_stop(int sk, struct tcp_counters *c, + test_cnt condition) +{ + struct tcp_counters c2; + test_cnt diff; + + if (test_get_tcp_counters(sk, &c2)) + test_error("test_get_tcp_counters()"); + + diff = test_cmp_counters(c, &c2); + test_tcp_counters_free(&c2); + return (diff & condition) == condition; +} + +/* How often wake up and check netns counters & paired (*err) */ +#define POLL_USEC 150 +static int __test_skpair_poll(int sk, bool write, uint64_t timeout, + struct tcp_counters *c, test_cnt cond, + volatile int *err) +{ + uint64_t t; + + for (t = 0; t <= timeout * 1000000; t += POLL_USEC) { + struct timeval tv = { .tv_usec = POLL_USEC, }; + int ret; + + ret = __test_wait_fd(sk, &tv, write); + if (ret != -ETIMEDOUT) + return ret; + if (c && cond && __skpair_poll_should_stop(sk, c, cond)) + break; + if (err && *err) + return *err; + } + if (err) + *err = -ETIMEDOUT; + return -ETIMEDOUT; +} + int __test_connect_socket(int sk, const char *device, - void *addr, size_t addr_sz, time_t timeout) + void *addr, size_t addr_sz, bool async) { long flags; int err; @@ -82,15 +123,6 @@ int __test_connect_socket(int sk, const char *device, test_error("setsockopt(SO_BINDTODEVICE, %s)", device); } - if (!timeout) { - err = connect(sk, addr, addr_sz); - if (err) { - err = -errno; - goto out; - } - return 0; - } - flags = fcntl(sk, F_GETFL); if ((flags < 0) || (fcntl(sk, F_SETFL, flags | O_NONBLOCK) < 0)) test_error("fcntl()"); @@ -100,9 +132,9 @@ int __test_connect_socket(int sk, const char *device, err = -errno; goto out; } - if (timeout < 0) + if (async) return sk; - err = test_wait_fd(sk, timeout, 1); + err = test_wait_fd(sk, TEST_TIMEOUT_SEC, 1); if (err) goto out; } @@ -113,6 +145,45 @@ out: return err; } +int test_skpair_wait_poll(int sk, bool write, + test_cnt cond, volatile int *err) +{ + struct tcp_counters c; + int ret; + + *err = 0; + if (test_get_tcp_counters(sk, &c)) + test_error("test_get_tcp_counters()"); + synchronize_threads(); /* 1: init skpair & read nscounters */ + + ret = __test_skpair_poll(sk, write, TEST_TIMEOUT_SEC, &c, cond, err); + test_tcp_counters_free(&c); + return ret; +} + +int _test_skpair_connect_poll(int sk, const char *device, + void *addr, size_t addr_sz, + test_cnt condition, volatile int *err) +{ + struct tcp_counters c; + int ret; + + *err = 0; + if (test_get_tcp_counters(sk, &c)) + test_error("test_get_tcp_counters()"); + synchronize_threads(); /* 1: init skpair & read nscounters */ + ret = __test_connect_socket(sk, device, addr, addr_sz, true); + if (ret < 0) { + test_tcp_counters_free(&c); + return (*err = ret); + } + ret = __test_skpair_poll(sk, 1, TEST_TIMEOUT_SEC, &c, condition, err); + if (ret < 0) + close(sk); + test_tcp_counters_free(&c); + return ret; +} + int __test_set_md5(int sk, void *addr, size_t addr_sz, uint8_t prefix, int vrf, const char *password) { @@ -333,12 +404,12 @@ do { \ return 0; } -int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) +int test_get_tcp_counters(int sk, struct tcp_counters *out) { struct tcp_ao_getsockopt *key_dump; socklen_t key_dump_sz = sizeof(*key_dump); struct tcp_ao_info_opt info = {}; - bool c1, c2, c3, c4, c5; + bool c1, c2, c3, c4, c5, c6, c7, c8; struct netstat *ns; int err, nr_keys; @@ -346,25 +417,30 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) /* per-netns */ ns = netstat_read(); - out->netns_ao_good = netstat_get(ns, "TCPAOGood", &c1); - out->netns_ao_bad = netstat_get(ns, "TCPAOBad", &c2); - out->netns_ao_key_not_found = netstat_get(ns, "TCPAOKeyNotFound", &c3); - out->netns_ao_required = netstat_get(ns, "TCPAORequired", &c4); - out->netns_ao_dropped_icmp = netstat_get(ns, "TCPAODroppedIcmps", &c5); + out->ao.netns_ao_good = netstat_get(ns, "TCPAOGood", &c1); + out->ao.netns_ao_bad = netstat_get(ns, "TCPAOBad", &c2); + out->ao.netns_ao_key_not_found = netstat_get(ns, "TCPAOKeyNotFound", &c3); + out->ao.netns_ao_required = netstat_get(ns, "TCPAORequired", &c4); + out->ao.netns_ao_dropped_icmp = netstat_get(ns, "TCPAODroppedIcmps", &c5); + out->netns_md5_notfound = netstat_get(ns, "TCPMD5NotFound", &c6); + out->netns_md5_unexpected = netstat_get(ns, "TCPMD5Unexpected", &c7); + out->netns_md5_failure = netstat_get(ns, "TCPMD5Failure", &c8); netstat_free(ns); - if (c1 || c2 || c3 || c4 || c5) + if (c1 || c2 || c3 || c4 || c5 || c6 || c7 || c8) return -EOPNOTSUPP; err = test_get_ao_info(sk, &info); + if (err == -ENOENT) + return 0; if (err) return err; /* per-socket */ - out->ao_info_pkt_good = info.pkt_good; - out->ao_info_pkt_bad = info.pkt_bad; - out->ao_info_pkt_key_not_found = info.pkt_key_not_found; - out->ao_info_pkt_ao_required = info.pkt_ao_required; - out->ao_info_pkt_dropped_icmp = info.pkt_dropped_icmp; + out->ao.ao_info_pkt_good = info.pkt_good; + out->ao.ao_info_pkt_bad = info.pkt_bad; + out->ao.ao_info_pkt_key_not_found = info.pkt_key_not_found; + out->ao.ao_info_pkt_ao_required = info.pkt_ao_required; + out->ao.ao_info_pkt_dropped_icmp = info.pkt_dropped_icmp; /* per-key */ nr_keys = test_get_ao_keys_nr(sk); @@ -372,7 +448,7 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) return nr_keys; if (nr_keys == 0) test_error("test_get_ao_keys_nr() == 0"); - out->nr_keys = (size_t)nr_keys; + out->ao.nr_keys = (size_t)nr_keys; key_dump = calloc(nr_keys, key_dump_sz); if (!key_dump) return -errno; @@ -386,72 +462,84 @@ int test_get_tcp_ao_counters(int sk, struct tcp_ao_counters *out) return -errno; } - out->key_cnts = calloc(nr_keys, sizeof(out->key_cnts[0])); - if (!out->key_cnts) { + out->ao.key_cnts = calloc(nr_keys, sizeof(out->ao.key_cnts[0])); + if (!out->ao.key_cnts) { free(key_dump); return -errno; } while (nr_keys--) { - out->key_cnts[nr_keys].sndid = key_dump[nr_keys].sndid; - out->key_cnts[nr_keys].rcvid = key_dump[nr_keys].rcvid; - out->key_cnts[nr_keys].pkt_good = key_dump[nr_keys].pkt_good; - out->key_cnts[nr_keys].pkt_bad = key_dump[nr_keys].pkt_bad; + out->ao.key_cnts[nr_keys].sndid = key_dump[nr_keys].sndid; + out->ao.key_cnts[nr_keys].rcvid = key_dump[nr_keys].rcvid; + out->ao.key_cnts[nr_keys].pkt_good = key_dump[nr_keys].pkt_good; + out->ao.key_cnts[nr_keys].pkt_bad = key_dump[nr_keys].pkt_bad; } free(key_dump); return 0; } -int __test_tcp_ao_counters_cmp(const char *tst_name, - struct tcp_ao_counters *before, - struct tcp_ao_counters *after, - test_cnt expected) +test_cnt test_cmp_counters(struct tcp_counters *before, + struct tcp_counters *after) +{ +#define __cmp(cnt, e_cnt) \ +do { \ + if (before->cnt > after->cnt) \ + test_error("counter " __stringify(cnt) " decreased"); \ + if (before->cnt != after->cnt) \ + ret |= e_cnt; \ +} while (0) + + test_cnt ret = 0; + size_t i; + + if (before->ao.nr_keys != after->ao.nr_keys) + test_error("the number of keys has changed"); + + _for_each_counter(__cmp); + + i = before->ao.nr_keys; + while (i--) { + __cmp(ao.key_cnts[i].pkt_good, TEST_CNT_KEY_GOOD); + __cmp(ao.key_cnts[i].pkt_bad, TEST_CNT_KEY_BAD); + } +#undef __cmp + return ret; +} + +int test_assert_counters_sk(const char *tst_name, + struct tcp_counters *before, + struct tcp_counters *after, + test_cnt expected) { -#define __cmp_ao(cnt, expecting_inc) \ +#define __cmp_ao(cnt, e_cnt) \ do { \ if (before->cnt > after->cnt) { \ test_fail("%s: Decreased counter " __stringify(cnt) " %" PRIu64 " > %" PRIu64, \ - tst_name ?: "", before->cnt, after->cnt); \ + tst_name ?: "", before->cnt, after->cnt); \ return -1; \ } \ - if ((before->cnt != after->cnt) != (expecting_inc)) { \ + if ((before->cnt != after->cnt) != !!(expected & e_cnt)) { \ test_fail("%s: Counter " __stringify(cnt) " was %sexpected to increase %" PRIu64 " => %" PRIu64, \ - tst_name ?: "", (expecting_inc) ? "" : "not ", \ + tst_name ?: "", (expected & e_cnt) ? "" : "not ", \ before->cnt, after->cnt); \ return -1; \ } \ -} while(0) +} while (0) errno = 0; - /* per-netns */ - __cmp_ao(netns_ao_good, !!(expected & TEST_CNT_NS_GOOD)); - __cmp_ao(netns_ao_bad, !!(expected & TEST_CNT_NS_BAD)); - __cmp_ao(netns_ao_key_not_found, - !!(expected & TEST_CNT_NS_KEY_NOT_FOUND)); - __cmp_ao(netns_ao_required, !!(expected & TEST_CNT_NS_AO_REQUIRED)); - __cmp_ao(netns_ao_dropped_icmp, - !!(expected & TEST_CNT_NS_DROPPED_ICMP)); - /* per-socket */ - __cmp_ao(ao_info_pkt_good, !!(expected & TEST_CNT_SOCK_GOOD)); - __cmp_ao(ao_info_pkt_bad, !!(expected & TEST_CNT_SOCK_BAD)); - __cmp_ao(ao_info_pkt_key_not_found, - !!(expected & TEST_CNT_SOCK_KEY_NOT_FOUND)); - __cmp_ao(ao_info_pkt_ao_required, !!(expected & TEST_CNT_SOCK_AO_REQUIRED)); - __cmp_ao(ao_info_pkt_dropped_icmp, - !!(expected & TEST_CNT_SOCK_DROPPED_ICMP)); + _for_each_counter(__cmp_ao); return 0; #undef __cmp_ao } -int test_tcp_ao_key_counters_cmp(const char *tst_name, - struct tcp_ao_counters *before, - struct tcp_ao_counters *after, - test_cnt expected, - int sndid, int rcvid) +int test_assert_counters_key(const char *tst_name, + struct tcp_ao_counters *before, + struct tcp_ao_counters *after, + test_cnt expected, int sndid, int rcvid) { size_t i; -#define __cmp_ao(i, cnt, expecting_inc) \ +#define __cmp_ao(i, cnt, e_cnt) \ do { \ if (before->key_cnts[i].cnt > after->key_cnts[i].cnt) { \ test_fail("%s: Decreased counter " __stringify(cnt) " %" PRIu64 " > %" PRIu64 " for key %u:%u", \ @@ -461,16 +549,16 @@ do { \ before->key_cnts[i].rcvid); \ return -1; \ } \ - if ((before->key_cnts[i].cnt != after->key_cnts[i].cnt) != (expecting_inc)) { \ + if ((before->key_cnts[i].cnt != after->key_cnts[i].cnt) != !!(expected & e_cnt)) { \ test_fail("%s: Counter " __stringify(cnt) " was %sexpected to increase %" PRIu64 " => %" PRIu64 " for key %u:%u", \ - tst_name ?: "", (expecting_inc) ? "" : "not ",\ + tst_name ?: "", (expected & e_cnt) ? "" : "not ",\ before->key_cnts[i].cnt, \ after->key_cnts[i].cnt, \ before->key_cnts[i].sndid, \ before->key_cnts[i].rcvid); \ return -1; \ } \ -} while(0) +} while (0) if (before->nr_keys != after->nr_keys) { test_fail("%s: Keys changed on the socket %zu != %zu", @@ -485,20 +573,22 @@ do { \ continue; if (rcvid >= 0 && before->key_cnts[i].rcvid != rcvid) continue; - __cmp_ao(i, pkt_good, !!(expected & TEST_CNT_KEY_GOOD)); - __cmp_ao(i, pkt_bad, !!(expected & TEST_CNT_KEY_BAD)); + __cmp_ao(i, pkt_good, TEST_CNT_KEY_GOOD); + __cmp_ao(i, pkt_bad, TEST_CNT_KEY_BAD); } return 0; #undef __cmp_ao } -void test_tcp_ao_counters_free(struct tcp_ao_counters *cnts) +void test_tcp_counters_free(struct tcp_counters *cnts) { - free(cnts->key_cnts); + free(cnts->ao.key_cnts); } #define TEST_BUF_SIZE 4096 -ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec) +static ssize_t _test_server_run(int sk, ssize_t quota, struct tcp_counters *c, + test_cnt cond, volatile int *err, + time_t timeout_sec) { ssize_t total = 0; @@ -507,7 +597,7 @@ ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec) ssize_t bytes, sent; int ret; - ret = test_wait_fd(sk, timeout_sec, 0); + ret = __test_skpair_poll(sk, 0, timeout_sec, c, cond, err); if (ret) return ret; @@ -518,7 +608,7 @@ ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec) if (bytes == 0) break; - ret = test_wait_fd(sk, timeout_sec, 1); + ret = __test_skpair_poll(sk, 1, timeout_sec, c, cond, err); if (ret) return ret; @@ -533,13 +623,41 @@ ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec) return total; } -ssize_t test_client_loop(int sk, char *buf, size_t buf_sz, - const size_t msg_len, time_t timeout_sec) +ssize_t test_server_run(int sk, ssize_t quota, time_t timeout_sec) +{ + return _test_server_run(sk, quota, NULL, 0, NULL, + timeout_sec ?: TEST_TIMEOUT_SEC); +} + +int test_skpair_server(int sk, ssize_t quota, test_cnt cond, volatile int *err) +{ + struct tcp_counters c; + ssize_t ret; + + *err = 0; + if (test_get_tcp_counters(sk, &c)) + test_error("test_get_tcp_counters()"); + synchronize_threads(); /* 1: init skpair & read nscounters */ + + ret = _test_server_run(sk, quota, &c, cond, err, TEST_TIMEOUT_SEC); + test_tcp_counters_free(&c); + return ret; +} + +static ssize_t test_client_loop(int sk, size_t buf_sz, const size_t msg_len, + struct tcp_counters *c, test_cnt cond, + volatile int *err) { char msg[msg_len]; int nodelay = 1; + char *buf; size_t i; + buf = alloca(buf_sz); + if (!buf) + return -ENOMEM; + randomize_buffer(buf, buf_sz); + if (setsockopt(sk, IPPROTO_TCP, TCP_NODELAY, &nodelay, sizeof(nodelay))) test_error("setsockopt(TCP_NODELAY)"); @@ -547,7 +665,7 @@ ssize_t test_client_loop(int sk, char *buf, size_t buf_sz, size_t sent, bytes = min(msg_len, buf_sz - i); int ret; - ret = test_wait_fd(sk, timeout_sec, 1); + ret = __test_skpair_poll(sk, 1, TEST_TIMEOUT_SEC, c, cond, err); if (ret) return ret; @@ -561,7 +679,8 @@ ssize_t test_client_loop(int sk, char *buf, size_t buf_sz, do { ssize_t got; - ret = test_wait_fd(sk, timeout_sec, 0); + ret = __test_skpair_poll(sk, 0, TEST_TIMEOUT_SEC, + c, cond, err); if (ret) return ret; @@ -580,15 +699,31 @@ ssize_t test_client_loop(int sk, char *buf, size_t buf_sz, return i; } -int test_client_verify(int sk, const size_t msg_len, const size_t nr, - time_t timeout_sec) +int test_client_verify(int sk, const size_t msg_len, const size_t nr) { size_t buf_sz = msg_len * nr; - char *buf = alloca(buf_sz); ssize_t ret; - randomize_buffer(buf, buf_sz); - ret = test_client_loop(sk, buf, buf_sz, msg_len, timeout_sec); + ret = test_client_loop(sk, buf_sz, msg_len, NULL, 0, NULL); + if (ret < 0) + return (int)ret; + return ret != buf_sz ? -1 : 0; +} + +int test_skpair_client(int sk, const size_t msg_len, const size_t nr, + test_cnt cond, volatile int *err) +{ + struct tcp_counters c; + size_t buf_sz = msg_len * nr; + ssize_t ret; + + *err = 0; + if (test_get_tcp_counters(sk, &c)) + test_error("test_get_tcp_counters()"); + synchronize_threads(); /* 1: init skpair & read nscounters */ + + ret = test_client_loop(sk, buf_sz, msg_len, &c, cond, err); + test_tcp_counters_free(&c); if (ret < 0) return (int)ret; return ret != buf_sz ? -1 : 0; diff --git a/tools/testing/selftests/net/tcp_ao/restore.c b/tools/testing/selftests/net/tcp_ao/restore.c index ecc6f1e3a414..9a059b6c4523 100644 --- a/tools/testing/selftests/net/tcp_ao/restore.c +++ b/tools/testing/selftests/net/tcp_ao/restore.c @@ -16,11 +16,11 @@ const size_t quota = nr_packets * msg_len; static void try_server_run(const char *tst_name, unsigned int port, fault_t inj, test_cnt cnt_expected) { + test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected; const char *cnt_name = "TCPAOGood"; - struct tcp_ao_counters ao1, ao2; + struct tcp_counters cnt1, cnt2; uint64_t before_cnt, after_cnt; - int sk, lsk; - time_t timeout; + int sk, lsk, dummy; ssize_t bytes; if (fault(TIMEOUT)) @@ -48,11 +48,10 @@ static void try_server_run(const char *tst_name, unsigned int port, } before_cnt = netstat_get_one(cnt_name, NULL); - if (test_get_tcp_ao_counters(sk, &ao1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt1)) + test_error("test_get_tcp_counters()"); - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - bytes = test_server_run(sk, quota, timeout); + bytes = test_skpair_server(sk, quota, poll_cnt, &dummy); if (fault(TIMEOUT)) { if (bytes > 0) test_fail("%s: server served: %zd", tst_name, bytes); @@ -65,17 +64,17 @@ static void try_server_run(const char *tst_name, unsigned int port, test_ok("%s: server alive", tst_name); } synchronize_threads(); /* 3: counters checks */ - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); - test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); + test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected); if (after_cnt <= before_cnt) { - test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, - tst_name, cnt_name, after_cnt, before_cnt); + test_fail("%s(server): %s counter did not increase: %" PRIu64 " <= %" PRIu64, + tst_name, cnt_name, after_cnt, before_cnt); } else { - test_ok("%s: counter %s increased %" PRIu64 " => %" PRIu64, + test_ok("%s(server): counter %s increased %" PRIu64 " => %" PRIu64, tst_name, cnt_name, before_cnt, after_cnt); } @@ -92,16 +91,16 @@ static void *server_fn(void *arg) { unsigned int port = test_server_port; - try_server_run("TCP-AO migrate to another socket", port++, + try_server_run("TCP-AO migrate to another socket (server)", port++, 0, TEST_CNT_GOOD); - try_server_run("TCP-AO with wrong send ISN", port++, + try_server_run("TCP-AO with wrong send ISN (server)", port++, FAULT_TIMEOUT, TEST_CNT_BAD); - try_server_run("TCP-AO with wrong receive ISN", port++, + try_server_run("TCP-AO with wrong receive ISN (server)", port++, FAULT_TIMEOUT, TEST_CNT_BAD); - try_server_run("TCP-AO with wrong send SEQ ext number", port++, + try_server_run("TCP-AO with wrong send SEQ ext number (server)", port++, FAULT_TIMEOUT, TEST_CNT_BAD); - try_server_run("TCP-AO with wrong receive SEQ ext number", port++, - FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD); + try_server_run("TCP-AO with wrong receive SEQ ext number (server)", + port++, FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD); synchronize_threads(); /* don't race to exit: client exits */ return NULL; @@ -125,7 +124,7 @@ static void test_get_sk_checkpoint(unsigned int server_port, sockaddr_af *saddr, test_error("failed to connect()"); synchronize_threads(); /* 2: accepted => send data */ - if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) + if (test_client_verify(sk, msg_len, nr_packets)) test_fail("pre-migrate verify failed"); test_enable_repair(sk); @@ -139,11 +138,11 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, struct tcp_ao_repair *ao_img, fault_t inj, test_cnt cnt_expected) { + test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected; const char *cnt_name = "TCPAOGood"; - struct tcp_ao_counters ao1, ao2; + struct tcp_counters cnt1, cnt2; uint64_t before_cnt, after_cnt; - time_t timeout; - int sk; + int sk, dummy; if (fault(TIMEOUT)) cnt_name = "TCPAOBad"; @@ -159,30 +158,30 @@ static void test_sk_restore(const char *tst_name, unsigned int server_port, test_error("setsockopt(TCP_AO_ADD_KEY)"); test_ao_restore(sk, ao_img); - if (test_get_tcp_ao_counters(sk, &ao1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt1)) + test_error("test_get_tcp_counters()"); test_disable_repair(sk); test_sock_state_free(img); - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - if (test_client_verify(sk, msg_len, nr_packets, timeout)) { + if (test_skpair_client(sk, msg_len, nr_packets, poll_cnt, &dummy)) { if (fault(TIMEOUT)) test_ok("%s: post-migrate connection is broken", tst_name); else test_fail("%s: post-migrate connection is working", tst_name); } else { if (fault(TIMEOUT)) - test_fail("%s: post-migrate connection still working", tst_name); + test_fail("%s: post-migrate connection is working", tst_name); else test_ok("%s: post-migrate connection is alive", tst_name); } + synchronize_threads(); /* 3: counters checks */ - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); after_cnt = netstat_get_one(cnt_name, NULL); - test_tcp_ao_counters_cmp(tst_name, &ao1, &ao2, cnt_expected); + test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected); if (after_cnt <= before_cnt) { test_fail("%s: %s counter did not increase: %" PRIu64 " <= %" PRIu64, @@ -203,7 +202,7 @@ static void *client_fn(void *arg) sockaddr_af saddr; test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); - test_sk_restore("TCP-AO migrate to another socket", port++, + test_sk_restore("TCP-AO migrate to another socket (client)", port++, &saddr, &tcp_img, &ao_img, 0, TEST_CNT_GOOD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); @@ -212,7 +211,7 @@ static void *client_fn(void *arg) -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); - test_sk_restore("TCP-AO with wrong send ISN", port++, + test_sk_restore("TCP-AO with wrong send ISN (client)", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); @@ -221,7 +220,7 @@ static void *client_fn(void *arg) -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); - test_sk_restore("TCP-AO with wrong receive ISN", port++, + test_sk_restore("TCP-AO with wrong receive ISN (client)", port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_BAD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); @@ -229,8 +228,8 @@ static void *client_fn(void *arg) trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_addr, this_ip_dest, -1, port, 0, -1, -1, -1, -1, -1, 100, 100, -1); /* not expecting server => client mismatches as only snd sne is broken */ - test_sk_restore("TCP-AO with wrong send SEQ ext number", port++, - &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, + test_sk_restore("TCP-AO with wrong send SEQ ext number (client)", + port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_BAD | TEST_CNT_GOOD); test_get_sk_checkpoint(port, &saddr, &tcp_img, &ao_img); @@ -238,8 +237,8 @@ static void *client_fn(void *arg) /* not expecting client => server mismatches as only rcv sne is broken */ trace_ao_event_expect(TCP_AO_MISMATCH, this_ip_dest, this_ip_addr, port, -1, 0, -1, -1, -1, -1, -1, 100, 100, -1); - test_sk_restore("TCP-AO with wrong receive SEQ ext number", port++, - &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, + test_sk_restore("TCP-AO with wrong receive SEQ ext number (client)", + port++, &saddr, &tcp_img, &ao_img, FAULT_TIMEOUT, TEST_CNT_NS_GOOD | TEST_CNT_BAD); return NULL; diff --git a/tools/testing/selftests/net/tcp_ao/rst.c b/tools/testing/selftests/net/tcp_ao/rst.c index 6364facaa63e..883cddf377cf 100644 --- a/tools/testing/selftests/net/tcp_ao/rst.c +++ b/tools/testing/selftests/net/tcp_ao/rst.c @@ -84,15 +84,15 @@ static void close_forced(int sk) static void test_server_active_rst(unsigned int port) { - struct tcp_ao_counters cnt1, cnt2; + struct tcp_counters cnt1, cnt2; ssize_t bytes; int sk, lsk; lsk = test_listen_socket(this_ip_addr, port, backlog); if (test_add_key(lsk, DEFAULT_TEST_PASSWORD, this_ip_dest, -1, 100, 100)) test_error("setsockopt(TCP_AO_ADD_KEY)"); - if (test_get_tcp_ao_counters(lsk, &cnt1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(lsk, &cnt1)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* 1: MKT added */ if (test_wait_fd(lsk, TEST_TIMEOUT_SEC, 0)) @@ -103,8 +103,8 @@ static void test_server_active_rst(unsigned int port) test_error("accept()"); synchronize_threads(); /* 2: connection accept()ed, another queued */ - if (test_get_tcp_ao_counters(lsk, &cnt2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(lsk, &cnt2)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* 3: close listen socket */ close(lsk); @@ -120,7 +120,7 @@ static void test_server_active_rst(unsigned int port) synchronize_threads(); /* 5: closed active sk */ synchronize_threads(); /* 6: counters checks */ - if (test_tcp_ao_counters_cmp("active RST server", &cnt1, &cnt2, TEST_CNT_GOOD)) + if (test_assert_counters("active RST server", &cnt1, &cnt2, TEST_CNT_GOOD)) test_fail("MKT counters (server) have not only good packets"); else test_ok("MKT counters are good on server"); @@ -128,7 +128,7 @@ static void test_server_active_rst(unsigned int port) static void test_server_passive_rst(unsigned int port) { - struct tcp_ao_counters ao1, ao2; + struct tcp_counters cnt1, cnt2; int sk, lsk; ssize_t bytes; @@ -147,8 +147,8 @@ static void test_server_passive_rst(unsigned int port) synchronize_threads(); /* 2: accepted => send data */ close(lsk); - if (test_get_tcp_ao_counters(sk, &ao1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt1)) + test_error("test_get_tcp_counters()"); bytes = test_server_run(sk, quota, TEST_TIMEOUT_SEC); if (bytes != quota) { @@ -160,12 +160,12 @@ static void test_server_passive_rst(unsigned int port) synchronize_threads(); /* 3: checkpoint the client */ synchronize_threads(); /* 4: close the server, creating twsk */ - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); close(sk); synchronize_threads(); /* 5: restore the socket, send more data */ - test_tcp_ao_counters_cmp("passive RST server", &ao1, &ao2, TEST_CNT_GOOD); + test_assert_counters("passive RST server", &cnt1, &cnt2, TEST_CNT_GOOD); synchronize_threads(); /* 6: server exits */ } @@ -271,8 +271,7 @@ static void test_client_active_rst(unsigned int port) synchronize_threads(); /* 1: MKT added */ for (i = 0; i < last; i++) { - err = _test_connect_socket(sk[i], this_ip_dest, port, - (i == 0) ? TEST_TIMEOUT_SEC : -1); + err = _test_connect_socket(sk[i], this_ip_dest, port, i != 0); if (err < 0) test_error("failed to connect()"); } @@ -283,12 +282,12 @@ static void test_client_active_rst(unsigned int port) test_error("test_wait_fds(): %d", err); /* async connect() with third sk to get into request_sock_queue */ - err = _test_connect_socket(sk[last], this_ip_dest, port, -1); + err = _test_connect_socket(sk[last], this_ip_dest, port, 1); if (err < 0) test_error("failed to connect()"); synchronize_threads(); /* 3: close listen socket */ - if (test_client_verify(sk[0], packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) + if (test_client_verify(sk[0], packet_sz, quota / packet_sz)) test_fail("Failed to send data on connected socket"); else test_ok("Verified established tcp connection"); @@ -323,7 +322,7 @@ static void test_client_active_rst(unsigned int port) static void test_client_passive_rst(unsigned int port) { - struct tcp_ao_counters ao1, ao2; + struct tcp_counters cnt1, cnt2; struct tcp_ao_repair ao_img; struct tcp_sock_state img; sockaddr_af saddr; @@ -341,7 +340,7 @@ static void test_client_passive_rst(unsigned int port) test_error("failed to connect()"); synchronize_threads(); /* 2: accepted => send data */ - if (test_client_verify(sk, packet_sz, quota / packet_sz, TEST_TIMEOUT_SEC)) + if (test_client_verify(sk, packet_sz, quota / packet_sz)) test_fail("Failed to send data on connected socket"); else test_ok("Verified established tcp connection"); @@ -397,8 +396,8 @@ static void test_client_passive_rst(unsigned int port) test_error("setsockopt(TCP_AO_ADD_KEY)"); test_ao_restore(sk, &ao_img); - if (test_get_tcp_ao_counters(sk, &ao1)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt1)) + test_error("test_get_tcp_counters()"); test_disable_repair(sk); test_sock_state_free(&img); @@ -417,7 +416,7 @@ static void test_client_passive_rst(unsigned int port) * IP 10.0.254.1.7011 > 10.0.1.1.59772: Flags [R], seq 3215596252, win 0, * options [tcp-ao keyid 100 rnextkeyid 100 mac 0x0bcfbbf497bce844312304b2], length 0 */ - err = test_client_verify(sk, packet_sz, quota / packet_sz, 2 * TEST_TIMEOUT_SEC); + err = test_client_verify(sk, packet_sz, quota / packet_sz); /* Make sure that the connection was reset, not timeouted */ if (err && err == -ECONNRESET) test_ok("client sock was passively reset post-seq-adjust"); @@ -426,12 +425,12 @@ static void test_client_passive_rst(unsigned int port) else test_fail("client sock is yet connected post-seq-adjust"); - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* 6: server exits */ close(sk); - test_tcp_ao_counters_cmp("client passive RST", &ao1, &ao2, TEST_CNT_GOOD); + test_assert_counters("client passive RST", &cnt1, &cnt2, TEST_CNT_GOOD); } static void *client_fn(void *arg) diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c index 3ecd2b58de6a..73b2f2276f3f 100644 --- a/tools/testing/selftests/net/tcp_ao/self-connect.c +++ b/tools/testing/selftests/net/tcp_ao/self-connect.c @@ -30,7 +30,7 @@ static void setup_lo_intf(const char *lo_intf) static void tcp_self_connect(const char *tst, unsigned int port, bool different_keyids, bool check_restore) { - struct tcp_ao_counters before_ao, after_ao; + struct tcp_counters before, after; uint64_t before_aogood, after_aogood; struct netstat *ns_before, *ns_after; const size_t nr_packets = 20; @@ -60,17 +60,17 @@ static void tcp_self_connect(const char *tst, unsigned int port, ns_before = netstat_read(); before_aogood = netstat_get(ns_before, "TCPAOGood", NULL); - if (test_get_tcp_ao_counters(sk, &before_ao)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &before)) + test_error("test_get_tcp_counters()"); if (__test_connect_socket(sk, "lo", (struct sockaddr *)&addr, - sizeof(addr), TEST_TIMEOUT_SEC) < 0) { + sizeof(addr), 0) < 0) { ns_after = netstat_read(); netstat_print_diff(ns_before, ns_after); test_error("failed to connect()"); } - if (test_client_verify(sk, 100, nr_packets, TEST_TIMEOUT_SEC)) { + if (test_client_verify(sk, 100, nr_packets)) { test_fail("%s: tcp connection verify failed", tst); close(sk); return; @@ -78,8 +78,8 @@ static void tcp_self_connect(const char *tst, unsigned int port, ns_after = netstat_read(); after_aogood = netstat_get(ns_after, "TCPAOGood", NULL); - if (test_get_tcp_ao_counters(sk, &after_ao)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &after)) + test_error("test_get_tcp_counters()"); if (!check_restore) { /* to debug: netstat_print_diff(ns_before, ns_after); */ netstat_free(ns_before); @@ -93,7 +93,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, return; } - if (test_tcp_ao_counters_cmp(tst, &before_ao, &after_ao, TEST_CNT_GOOD)) { + if (test_assert_counters(tst, &before, &after, TEST_CNT_GOOD)) { close(sk); return; } @@ -136,7 +136,7 @@ static void tcp_self_connect(const char *tst, unsigned int port, test_ao_restore(sk, &ao_img); test_disable_repair(sk); test_sock_state_free(&img); - if (test_client_verify(sk, 100, nr_packets, TEST_TIMEOUT_SEC)) { + if (test_client_verify(sk, 100, nr_packets)) { test_fail("%s: tcp connection verify failed", tst); close(sk); return; diff --git a/tools/testing/selftests/net/tcp_ao/seq-ext.c b/tools/testing/selftests/net/tcp_ao/seq-ext.c index 8901a6785dc8..f00245263b20 100644 --- a/tools/testing/selftests/net/tcp_ao/seq-ext.c +++ b/tools/testing/selftests/net/tcp_ao/seq-ext.c @@ -40,7 +40,7 @@ static void test_adjust_seqs(struct tcp_sock_state *img, static int test_sk_restore(struct tcp_sock_state *img, struct tcp_ao_repair *ao_img, sockaddr_af *saddr, const union tcp_addr daddr, unsigned int dport, - struct tcp_ao_counters *cnt) + struct tcp_counters *cnt) { int sk; @@ -54,8 +54,8 @@ static int test_sk_restore(struct tcp_sock_state *img, test_error("setsockopt(TCP_AO_ADD_KEY)"); test_ao_restore(sk, ao_img); - if (test_get_tcp_ao_counters(sk, cnt)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, cnt)) + test_error("test_get_tcp_counters()"); test_disable_repair(sk); test_sock_state_free(img); @@ -65,7 +65,7 @@ static int test_sk_restore(struct tcp_sock_state *img, static void *server_fn(void *arg) { uint64_t before_good, after_good, after_bad; - struct tcp_ao_counters ao1, ao2; + struct tcp_counters cnt1, cnt2; struct tcp_sock_state img; struct tcp_ao_repair ao_img; sockaddr_af saddr; @@ -114,7 +114,7 @@ static void *server_fn(void *arg) test_adjust_seqs(&img, &ao_img, true); synchronize_threads(); /* 4: dump finished */ sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, - client_new_port, &ao1); + client_new_port, &cnt1); trace_ao_event_sne_expect(TCP_AO_SND_SNE_UPDATE, this_ip_addr, this_ip_dest, test_server_port + 1, client_new_port, 1); @@ -136,11 +136,11 @@ static void *server_fn(void *arg) } synchronize_threads(); /* 6: verify counters after SEQ-number rollover */ - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); - test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); + test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD); if (after_good <= before_good) { test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, @@ -173,7 +173,7 @@ out: static void *client_fn(void *arg) { uint64_t before_good, after_good, after_bad; - struct tcp_ao_counters ao1, ao2; + struct tcp_counters cnt1, cnt2; struct tcp_sock_state img; struct tcp_ao_repair ao_img; sockaddr_af saddr; @@ -191,7 +191,7 @@ static void *client_fn(void *arg) test_error("failed to connect()"); synchronize_threads(); /* 2: accepted => send data */ - if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) { + if (test_client_verify(sk, msg_len, nr_packets)) { test_fail("pre-migrate verify failed"); return NULL; } @@ -213,20 +213,20 @@ static void *client_fn(void *arg) test_adjust_seqs(&img, &ao_img, false); synchronize_threads(); /* 4: dump finished */ sk = test_sk_restore(&img, &ao_img, &saddr, this_ip_dest, - test_server_port + 1, &ao1); + test_server_port + 1, &cnt1); synchronize_threads(); /* 5: verify the connection during SEQ-number rollover */ - if (test_client_verify(sk, msg_len, nr_packets, TEST_TIMEOUT_SEC)) + if (test_client_verify(sk, msg_len, nr_packets)) test_fail("post-migrate verify failed"); else test_ok("post-migrate connection alive"); synchronize_threads(); /* 5: verify counters after SEQ-number rollover */ - if (test_get_tcp_ao_counters(sk, &ao2)) - test_error("test_get_tcp_ao_counters()"); + if (test_get_tcp_counters(sk, &cnt2)) + test_error("test_get_tcp_counters()"); after_good = netstat_get_one("TCPAOGood", NULL); - test_tcp_ao_counters_cmp(NULL, &ao1, &ao2, TEST_CNT_GOOD); + test_assert_counters(NULL, &cnt1, &cnt2, TEST_CNT_GOOD); if (after_good <= before_good) { test_fail("TCPAOGood counter did not increase: %" PRIu64 " <= %" PRIu64, diff --git a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c index f779e5892bc1..a1467b64390a 100644 --- a/tools/testing/selftests/net/tcp_ao/unsigned-md5.c +++ b/tools/testing/selftests/net/tcp_ao/unsigned-md5.c @@ -6,6 +6,7 @@ #define fault(type) (inj == FAULT_ ## type) static const char *md5_password = "Some evil genius, enemy to mankind, must have been the first contriver."; static const char *ao_password = DEFAULT_TEST_PASSWORD; +static volatile int sk_pair; static union tcp_addr client2; static union tcp_addr client3; @@ -41,10 +42,10 @@ static void try_accept(const char *tst_name, unsigned int port, const char *cnt_name, test_cnt cnt_expected, int needs_tcp_md5, fault_t inj) { - struct tcp_ao_counters ao_cnt1, ao_cnt2; + struct tcp_counters cnt1, cnt2; uint64_t before_cnt = 0, after_cnt = 0; /* silence GCC */ - int lsk, err, sk = 0; - time_t timeout; + test_cnt poll_cnt = (cnt_expected == TEST_CNT_GOOD) ? 0 : cnt_expected; + int lsk, err, sk = -1; if (needs_tcp_md5 && should_skip_test(tst_name, KCONFIG_TCP_MD5)) return; @@ -63,22 +64,25 @@ static void try_accept(const char *tst_name, unsigned int port, if (cnt_name) before_cnt = netstat_get_one(cnt_name, NULL); - if (ao_addr && test_get_tcp_ao_counters(lsk, &ao_cnt1)) - test_error("test_get_tcp_ao_counters()"); + if (ao_addr && test_get_tcp_counters(lsk, &cnt1)) + test_error("test_get_tcp_counters()"); synchronize_threads(); /* preparations done */ - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - err = test_wait_fd(lsk, timeout, 0); + err = test_skpair_wait_poll(lsk, 0, poll_cnt, &sk_pair); synchronize_threads(); /* connect()/accept() timeouts */ if (err == -ETIMEDOUT) { + sk_pair = err; if (!fault(TIMEOUT)) - test_fail("timed out for accept()"); + test_fail("%s: timed out for accept()", tst_name); + } else if (err == -EKEYREJECTED) { + if (!fault(KEYREJECT)) + test_fail("%s: key was rejected", tst_name); } else if (err < 0) { - test_error("test_wait_fd()"); + test_error("test_skpair_wait_poll()"); } else { if (fault(TIMEOUT)) - test_fail("ready to accept"); + test_fail("%s: ready to accept", tst_name); sk = accept(lsk, NULL, NULL); if (sk < 0) { @@ -89,8 +93,8 @@ static void try_accept(const char *tst_name, unsigned int port, } } - if (ao_addr && test_get_tcp_ao_counters(lsk, &ao_cnt2)) - test_error("test_get_tcp_ao_counters()"); + if (ao_addr && test_get_tcp_counters(lsk, &cnt2)) + test_error("test_get_tcp_counters()"); close(lsk); if (!cnt_name) { @@ -108,11 +112,11 @@ static void try_accept(const char *tst_name, unsigned int port, tst_name, cnt_name, before_cnt, after_cnt); } if (ao_addr) - test_tcp_ao_counters_cmp(tst_name, &ao_cnt1, &ao_cnt2, cnt_expected); + test_assert_counters(tst_name, &cnt1, &cnt2, cnt_expected); out: synchronize_threads(); /* test_kill_sk() */ - if (sk > 0) + if (sk >= 0) test_kill_sk(sk); } @@ -153,78 +157,82 @@ static void *server_fn(void *arg) server_add_routes(); - try_accept("AO server (INADDR_ANY): AO client", port++, NULL, 0, + try_accept("[server] AO server (INADDR_ANY): AO client", port++, NULL, 0, &addr_any, 0, 0, 100, 100, 0, "TCPAOGood", TEST_CNT_GOOD, 0, 0); - try_accept("AO server (INADDR_ANY): MD5 client", port++, NULL, 0, + try_accept("[server] AO server (INADDR_ANY): MD5 client", port++, NULL, 0, &addr_any, 0, 0, 100, 100, 0, "TCPMD5Unexpected", - 0, 1, FAULT_TIMEOUT); - try_accept("AO server (INADDR_ANY): no sign client", port++, NULL, 0, + TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT); + try_accept("[server] AO server (INADDR_ANY): no sign client", port++, NULL, 0, &addr_any, 0, 0, 100, 100, 0, "TCPAORequired", TEST_CNT_AO_REQUIRED, 0, FAULT_TIMEOUT); - try_accept("AO server (AO_REQUIRED): AO client", port++, NULL, 0, + try_accept("[server] AO server (AO_REQUIRED): AO client", port++, NULL, 0, &this_ip_dest, TEST_PREFIX, true, 100, 100, 0, "TCPAOGood", TEST_CNT_GOOD, 0, 0); - try_accept("AO server (AO_REQUIRED): unsigned client", port++, NULL, 0, + try_accept("[server] AO server (AO_REQUIRED): unsigned client", port++, NULL, 0, &this_ip_dest, TEST_PREFIX, true, 100, 100, 0, "TCPAORequired", TEST_CNT_AO_REQUIRED, 0, FAULT_TIMEOUT); - try_accept("MD5 server (INADDR_ANY): AO client", port++, &addr_any, 0, + try_accept("[server] MD5 server (INADDR_ANY): AO client", port++, &addr_any, 0, NULL, 0, 0, 0, 0, 0, "TCPAOKeyNotFound", - 0, 1, FAULT_TIMEOUT); - try_accept("MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0, + TEST_CNT_NS_KEY_NOT_FOUND, 1, FAULT_TIMEOUT); + try_accept("[server] MD5 server (INADDR_ANY): MD5 client", port++, &addr_any, 0, NULL, 0, 0, 0, 0, 0, NULL, 0, 1, 0); - try_accept("MD5 server (INADDR_ANY): no sign client", port++, &addr_any, + try_accept("[server] MD5 server (INADDR_ANY): no sign client", port++, &addr_any, 0, NULL, 0, 0, 0, 0, 0, "TCPMD5NotFound", - 0, 1, FAULT_TIMEOUT); + TEST_CNT_NS_MD5_NOT_FOUND, 1, FAULT_TIMEOUT); - try_accept("no sign server: AO client", port++, NULL, 0, + try_accept("[server] no sign server: AO client", port++, NULL, 0, NULL, 0, 0, 0, 0, 0, "TCPAOKeyNotFound", - TEST_CNT_AO_KEY_NOT_FOUND, 0, FAULT_TIMEOUT); - try_accept("no sign server: MD5 client", port++, NULL, 0, + TEST_CNT_NS_KEY_NOT_FOUND, 0, FAULT_TIMEOUT); + try_accept("[server] no sign server: MD5 client", port++, NULL, 0, NULL, 0, 0, 0, 0, 0, "TCPMD5Unexpected", - 0, 1, FAULT_TIMEOUT); - try_accept("no sign server: no sign client", port++, NULL, 0, + TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT); + try_accept("[server] no sign server: no sign client", port++, NULL, 0, NULL, 0, 0, 0, 0, 0, "CurrEstab", 0, 0, 0); - try_accept("AO+MD5 server: AO client (matching)", port++, + try_accept("[server] AO+MD5 server: AO client (matching)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, 100, 100, 0, "TCPAOGood", TEST_CNT_GOOD, 1, 0); - try_accept("AO+MD5 server: AO client (misconfig, matching MD5)", port++, + try_accept("[server] AO+MD5 server: AO client (misconfig, matching MD5)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, 100, 100, 0, "TCPAOKeyNotFound", TEST_CNT_AO_KEY_NOT_FOUND, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: AO client (misconfig, non-matching)", port++, + try_accept("[server] AO+MD5 server: AO client (misconfig, non-matching)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, 100, 100, 0, "TCPAOKeyNotFound", TEST_CNT_AO_KEY_NOT_FOUND, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: MD5 client (matching)", port++, + try_accept("[server] AO+MD5 server: MD5 client (matching)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, 100, 100, 0, NULL, 0, 1, 0); - try_accept("AO+MD5 server: MD5 client (misconfig, matching AO)", port++, + try_accept("[server] AO+MD5 server: MD5 client (misconfig, matching AO)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, - 100, 100, 0, "TCPMD5Unexpected", 0, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: MD5 client (misconfig, non-matching)", port++, + 100, 100, 0, "TCPMD5Unexpected", + TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT); + try_accept("[server] AO+MD5 server: MD5 client (misconfig, non-matching)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, - 100, 100, 0, "TCPMD5Unexpected", 0, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: no sign client (unmatched)", port++, + 100, 100, 0, "TCPMD5Unexpected", + TEST_CNT_NS_MD5_UNEXPECTED, 1, FAULT_TIMEOUT); + try_accept("[server] AO+MD5 server: no sign client (unmatched)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, 100, 100, 0, "CurrEstab", 0, 1, 0); - try_accept("AO+MD5 server: no sign client (misconfig, matching AO)", + try_accept("[server] AO+MD5 server: no sign client (misconfig, matching AO)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, 100, 100, 0, "TCPAORequired", TEST_CNT_AO_REQUIRED, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: no sign client (misconfig, matching MD5)", + try_accept("[server] AO+MD5 server: no sign client (misconfig, matching MD5)", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, - 100, 100, 0, "TCPMD5NotFound", 0, 1, FAULT_TIMEOUT); + 100, 100, 0, "TCPMD5NotFound", + TEST_CNT_NS_MD5_NOT_FOUND, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: client with both [TCP-MD5] and TCP-AO keys", + /* Key rejected by the other side, failing short through skpair */ + try_accept("[server] AO+MD5 server: client with both [TCP-MD5] and TCP-AO keys", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, - 100, 100, 0, NULL, 0, 1, FAULT_TIMEOUT); - try_accept("AO+MD5 server: client with both TCP-MD5 and [TCP-AO] keys", + 100, 100, 0, NULL, 0, 1, FAULT_KEYREJECT); + try_accept("[server] AO+MD5 server: client with both TCP-MD5 and [TCP-AO] keys", port++, &this_ip_dest, TEST_PREFIX, &client2, TEST_PREFIX, 0, - 100, 100, 0, NULL, 0, 1, FAULT_TIMEOUT); + 100, 100, 0, NULL, 0, 1, FAULT_KEYREJECT); server_add_fail_tests(&port); @@ -259,7 +267,6 @@ static void try_connect(const char *tst_name, unsigned int port, uint8_t sndid, uint8_t rcvid, uint8_t vrf, fault_t inj, int needs_tcp_md5, union tcp_addr *bind_addr) { - time_t timeout; int sk, ret; if (needs_tcp_md5 && should_skip_test(tst_name, KCONFIG_TCP_MD5)) @@ -281,11 +288,10 @@ static void try_connect(const char *tst_name, unsigned int port, synchronize_threads(); /* preparations done */ - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - ret = _test_connect_socket(sk, this_ip_dest, port, timeout); - + ret = test_skpair_connect_poll(sk, this_ip_dest, port, 0, &sk_pair); synchronize_threads(); /* connect()/accept() timeouts */ if (ret < 0) { + sk_pair = ret; if (fault(KEYREJECT) && ret == -EKEYREJECTED) test_ok("%s: connect() was prevented", tst_name); else if (ret == -ETIMEDOUT && fault(TIMEOUT)) @@ -305,8 +311,7 @@ static void try_connect(const char *tst_name, unsigned int port, out: synchronize_threads(); /* test_kill_sk() */ - /* _test_connect_socket() cleans up on failure */ - if (ret > 0) + if (ret > 0) /* test_skpair_connect_poll() cleans up on failure */ test_kill_sk(sk); } @@ -437,7 +442,6 @@ static void try_to_add(const char *tst_name, unsigned int port, int ao_vrf, uint8_t sndid, uint8_t rcvid, int needs_tcp_md5, fault_t inj) { - time_t timeout; int sk, ret; if (needs_tcp_md5 && should_skip_test(tst_name, KCONFIG_TCP_MD5)) @@ -450,11 +454,10 @@ static void try_to_add(const char *tst_name, unsigned int port, synchronize_threads(); /* preparations done */ - timeout = fault(TIMEOUT) ? TEST_RETRANSMIT_SEC : TEST_TIMEOUT_SEC; - ret = _test_connect_socket(sk, this_ip_dest, port, timeout); + ret = test_skpair_connect_poll(sk, this_ip_dest, port, 0, &sk_pair); synchronize_threads(); /* connect()/accept() timeouts */ - if (ret <= 0) { + if (ret < 0) { test_error("%s: connect() returned %d", tst_name, ret); goto out; } @@ -490,8 +493,7 @@ static void try_to_add(const char *tst_name, unsigned int port, out: synchronize_threads(); /* test_kill_sk() */ - /* _test_connect_socket() cleans up on failure */ - if (ret > 0) + if (ret > 0) /* test_skpair_connect_poll() cleans up on failure */ test_kill_sk(sk); } diff --git a/tools/testing/selftests/net/test_blackhole_dev.sh b/tools/testing/selftests/net/test_blackhole_dev.sh deleted file mode 100755 index 3119b80e711f..000000000000 --- a/tools/testing/selftests/net/test_blackhole_dev.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# Runs blackhole-dev test using blackhole-dev kernel module - -if /sbin/modprobe -q test_blackhole_dev ; then - /sbin/modprobe -q -r test_blackhole_dev; - echo "test_blackhole_dev: ok"; -else - echo "test_blackhole_dev: [FAIL]"; - exit 1; -fi diff --git a/tools/testing/selftests/net/test_so_rcv.sh b/tools/testing/selftests/net/test_so_rcv.sh new file mode 100755 index 000000000000..d8aa4362879d --- /dev/null +++ b/tools/testing/selftests/net/test_so_rcv.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +HOSTS=("127.0.0.1" "::1") +PORT=1234 +TOTAL_TESTS=0 +FAILED_TESTS=0 + +declare -A TESTS=( + ["SO_RCVPRIORITY"]="-P 2" + ["SO_RCVMARK"]="-M 3" +) + +check_result() { + ((TOTAL_TESTS++)) + if [ "$1" -ne 0 ]; then + ((FAILED_TESTS++)) + fi +} + +cleanup() +{ + cleanup_ns $NS +} + +trap cleanup EXIT + +setup_ns NS + +for HOST in "${HOSTS[@]}"; do + PROTOCOL="IPv4" + if [[ "$HOST" == "::1" ]]; then + PROTOCOL="IPv6" + fi + for test_name in "${!TESTS[@]}"; do + echo "Running $test_name test, $PROTOCOL" + arg=${TESTS[$test_name]} + + ip netns exec $NS ./so_rcv_listener $arg $HOST $PORT & + LISTENER_PID=$! + + sleep 0.5 + + if ! ip netns exec $NS ./cmsg_sender $arg $HOST $PORT; then + echo "Sender failed for $test_name, $PROTOCOL" + kill "$LISTENER_PID" 2>/dev/null + wait "$LISTENER_PID" + check_result 1 + continue + fi + + wait "$LISTENER_PID" + LISTENER_EXIT_CODE=$? + + if [ "$LISTENER_EXIT_CODE" -eq 0 ]; then + echo "Rcv test OK for $test_name, $PROTOCOL" + check_result 0 + else + echo "Rcv test FAILED for $test_name, $PROTOCOL" + check_result 1 + fi + done +done + +if [ "$FAILED_TESTS" -ne 0 ]; then + echo "FAIL - $FAILED_TESTS/$TOTAL_TESTS tests failed" + exit ${KSFT_FAIL} +else + echo "OK - All $TOTAL_TESTS tests passed" + exit ${KSFT_PASS} +fi diff --git a/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh b/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh index 2d442cdab11e..062f957950af 100755 --- a/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh +++ b/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh @@ -1,29 +1,114 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -# Check FDB default-remote handling across "ip link set". +ALL_TESTS=" + test_set_remote + test_change_mc_remote +" +source lib.sh check_remotes() { local what=$1; shift local N=$(bridge fdb sh dev vx | grep 00:00:00:00:00:00 | wc -l) - echo -ne "expected two remotes after $what\t" - if [[ $N != 2 ]]; then - echo "[FAIL]" - EXIT_STATUS=1 + ((N == 2)) + check_err $? "expected 2 remotes after $what, got $N" +} + +# Check FDB default-remote handling across "ip link set". +test_set_remote() +{ + RET=0 + + ip_link_add vx up type vxlan id 2000 dstport 4789 + bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.20 self permanent + bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.30 self permanent + check_remotes "fdb append" + + ip link set dev vx type vxlan remote 192.0.2.30 + check_remotes "link set" + + log_test 'FDB default-remote handling across "ip link set"' +} + +fmt_remote() +{ + local addr=$1; shift + + if [[ $addr == 224.* ]]; then + echo "group $addr" else - echo "[ OK ]" + echo "remote $addr" fi } -ip link add name vx up type vxlan id 2000 dstport 4789 -bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.20 self permanent -bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.30 self permanent -check_remotes "fdb append" +change_remote() +{ + local remote=$1; shift + + ip link set dev vx type vxlan $(fmt_remote $remote) dev v1 +} + +check_membership() +{ + local check_vec=("$@") + + local memberships + memberships=$( + netstat -n --groups | + sed -n '/^v1\b/p' | + grep -o '[^ ]*$' + ) + check_err $? "Couldn't obtain group memberships" + + local item + for item in "${check_vec[@]}"; do + eval "local $item" + echo "$memberships" | grep -q "\b$group\b" + check_err_fail $fail $? "$group is_ex reported in IGMP query response" + done +} + +test_change_mc_remote() +{ + check_command netstat || return + + ip_link_add v1 up type veth peer name v2 + ip_link_set_up v2 + + RET=0 + + ip_link_add vx up type vxlan dstport 4789 \ + local 192.0.2.1 $(fmt_remote 224.1.1.1) dev v1 vni 1000 + + check_membership "group=224.1.1.1 fail=0" \ + "group=224.1.1.2 fail=1" \ + "group=224.1.1.3 fail=1" + + log_test "MC group report after VXLAN creation" + + RET=0 + + change_remote 224.1.1.2 + check_membership "group=224.1.1.1 fail=1" \ + "group=224.1.1.2 fail=0" \ + "group=224.1.1.3 fail=1" + + log_test "MC group report after changing VXLAN remote MC->MC" + + RET=0 + + change_remote 192.0.2.2 + check_membership "group=224.1.1.1 fail=1" \ + "group=224.1.1.2 fail=1" \ + "group=224.1.1.3 fail=1" + + log_test "MC group report after changing VXLAN remote MC->UC" +} + +trap defer_scopes_cleanup EXIT -ip link set dev vx type vxlan remote 192.0.2.30 -check_remotes "link set" +tests_run -ip link del dev vx exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/ynl.mk b/tools/testing/selftests/net/ynl.mk index 12e7cae251be..e907c2751956 100644 --- a/tools/testing/selftests/net/ynl.mk +++ b/tools/testing/selftests/net/ynl.mk @@ -27,7 +27,8 @@ $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig: $(OUTPUT)/libynl.a: $(YNL_SPECS) $(OUTPUT)/.libynl-$(YNL_GENS_HASH).sig $(Q)rm -f $(top_srcdir)/tools/net/ynl/libynl.a - $(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl GENS="$(YNL_GENS)" libynl.a + $(Q)$(MAKE) -C $(top_srcdir)/tools/net/ynl \ + GENS="$(YNL_GENS)" RSTS="" libynl.a $(Q)cp $(top_srcdir)/tools/net/ynl/libynl.a $(OUTPUT)/libynl.a EXTRA_CLEAN += \ diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7d14a7c0cb62..58bcbbd029bc 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -47,6 +47,7 @@ XARCH_riscv = riscv64 XARCH = $(or $(XARCH_$(ARCH)),$(ARCH)) # map from user input variants to their kernel supported architectures +ARCH_armthumb = arm ARCH_ppc = powerpc ARCH_ppc64 = powerpc ARCH_ppc64le = powerpc @@ -54,6 +55,7 @@ ARCH_mips32le = mips ARCH_mips32be = mips ARCH_riscv32 = riscv ARCH_riscv64 = riscv +ARCH_s390x = s390 ARCH := $(or $(ARCH_$(XARCH)),$(XARCH)) # kernel image names by architecture @@ -62,6 +64,7 @@ IMAGE_x86_64 = arch/x86/boot/bzImage IMAGE_x86 = arch/x86/boot/bzImage IMAGE_arm64 = arch/arm64/boot/Image IMAGE_arm = arch/arm/boot/zImage +IMAGE_armthumb = arch/arm/boot/zImage IMAGE_mips32le = vmlinuz IMAGE_mips32be = vmlinuz IMAGE_ppc = vmlinux @@ -70,6 +73,7 @@ IMAGE_ppc64le = arch/powerpc/boot/zImage IMAGE_riscv = arch/riscv/boot/Image IMAGE_riscv32 = arch/riscv/boot/Image IMAGE_riscv64 = arch/riscv/boot/Image +IMAGE_s390x = arch/s390/boot/bzImage IMAGE_s390 = arch/s390/boot/bzImage IMAGE_loongarch = arch/loongarch/boot/vmlinuz.efi IMAGE = $(objtree)/$(IMAGE_$(XARCH)) @@ -81,19 +85,20 @@ DEFCONFIG_x86_64 = defconfig DEFCONFIG_x86 = defconfig DEFCONFIG_arm64 = defconfig DEFCONFIG_arm = multi_v7_defconfig +DEFCONFIG_armthumb = multi_v7_defconfig DEFCONFIG_mips32le = malta_defconfig -DEFCONFIG_mips32be = malta_defconfig +DEFCONFIG_mips32be = malta_defconfig generic/eb.config DEFCONFIG_ppc = pmac32_defconfig DEFCONFIG_ppc64 = powernv_be_defconfig DEFCONFIG_ppc64le = powernv_defconfig DEFCONFIG_riscv = defconfig DEFCONFIG_riscv32 = rv32_defconfig DEFCONFIG_riscv64 = defconfig -DEFCONFIG_s390 = defconfig +DEFCONFIG_s390x = defconfig +DEFCONFIG_s390 = defconfig compat.config DEFCONFIG_loongarch = defconfig DEFCONFIG = $(DEFCONFIG_$(XARCH)) -EXTRACONFIG_mips32be = -d CONFIG_CPU_LITTLE_ENDIAN -e CONFIG_CPU_BIG_ENDIAN EXTRACONFIG = $(EXTRACONFIG_$(XARCH)) # optional tests to run (default = all) @@ -105,6 +110,7 @@ QEMU_ARCH_x86_64 = x86_64 QEMU_ARCH_x86 = x86_64 QEMU_ARCH_arm64 = aarch64 QEMU_ARCH_arm = arm +QEMU_ARCH_armthumb = arm QEMU_ARCH_mips32le = mipsel # works with malta_defconfig QEMU_ARCH_mips32be = mips QEMU_ARCH_ppc = ppc @@ -113,6 +119,7 @@ QEMU_ARCH_ppc64le = ppc64 QEMU_ARCH_riscv = riscv64 QEMU_ARCH_riscv32 = riscv32 QEMU_ARCH_riscv64 = riscv64 +QEMU_ARCH_s390x = s390x QEMU_ARCH_s390 = s390x QEMU_ARCH_loongarch = loongarch64 QEMU_ARCH = $(QEMU_ARCH_$(XARCH)) @@ -133,6 +140,7 @@ QEMU_ARGS_x86_64 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $( QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_armthumb = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_mips32le = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_mips32be = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_ppc = -M g3beige -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" @@ -141,6 +149,7 @@ QEMU_ARGS_ppc64le = -M powernv -append "console=hvc0 panic=-1 $(TEST:%=NOLIBC QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv32 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_riscv64 = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_s390x = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_s390 = -M s390-ccw-virtio -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS_loongarch = -M virt -append "console=ttyS0,115200 panic=-1 $(TEST:%=NOLIBC_TEST=%)" QEMU_ARGS = -m 1G $(QEMU_ARGS_$(XARCH)) $(QEMU_ARGS_BIOS) $(QEMU_ARGS_EXTRA) @@ -156,15 +165,18 @@ Q=@ endif CFLAGS_i386 = $(call cc-option,-m32) +CFLAGS_arm = -marm +CFLAGS_armthumb = -mthumb -march=armv6t2 CFLAGS_ppc = -m32 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64 = -m64 -mbig-endian -mno-vsx $(call cc-option,-mmultiple) CFLAGS_ppc64le = -m64 -mlittle-endian -mno-vsx $(call cc-option,-mabi=elfv2) -CFLAGS_s390 = -m64 +CFLAGS_s390x = -m64 +CFLAGS_s390 = -m31 CFLAGS_mips32le = -EL -mabi=32 -fPIC CFLAGS_mips32be = -EB -mabi=32 CFLAGS_STACKPROTECTOR ?= $(call cc-option,-mstack-protector-guard=global $(call cc-option,-fstack-protector-all)) CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables -std=c89 -W -Wall -Wextra \ - $(call cc-option,-fno-stack-protector) \ + $(call cc-option,-fno-stack-protector) $(call cc-option,-Wmissing-prototypes) \ $(CFLAGS_$(XARCH)) $(CFLAGS_STACKPROTECTOR) $(CFLAGS_EXTRA) LDFLAGS := @@ -220,7 +232,7 @@ all: run sysroot: sysroot/$(ARCH)/include -sysroot/$(ARCH)/include: +sysroot/$(ARCH)/include: | defconfig $(Q)rm -rf sysroot/$(ARCH) sysroot/sysroot $(QUIET_MKDIR)mkdir -p sysroot $(Q)$(MAKE) -C $(srctree) outputmakefile @@ -264,16 +276,16 @@ initramfs: nolibc-test $(Q)cp nolibc-test initramfs/init defconfig: - $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(DEFCONFIG) $(Q)if [ -n "$(EXTRACONFIG)" ]; then \ $(srctree)/scripts/config --file $(objtree)/.config $(EXTRACONFIG); \ $(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) olddefconfig < /dev/null; \ fi -kernel: +kernel: | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) < /dev/null -kernel-standalone: initramfs +kernel-standalone: initramfs | defconfig $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs < /dev/null # run the tests after building the kernel diff --git a/tools/testing/selftests/nolibc/nolibc-test-linkage.c b/tools/testing/selftests/nolibc/nolibc-test-linkage.c index 5ff4c8a1db2a..a7ca8325863f 100644 --- a/tools/testing/selftests/nolibc/nolibc-test-linkage.c +++ b/tools/testing/selftests/nolibc/nolibc-test-linkage.c @@ -11,16 +11,16 @@ void *linkage_test_errno_addr(void) return &errno; } -int linkage_test_constructor_test_value; +int linkage_test_constructor_test_value = 0; __attribute__((constructor)) static void constructor1(void) { - linkage_test_constructor_test_value = 2; + linkage_test_constructor_test_value |= 1 << 0; } __attribute__((constructor)) static void constructor2(void) { - linkage_test_constructor_test_value *= 3; + linkage_test_constructor_test_value |= 1 << 1; } diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 0e0e3b48a8c3..5884a891c491 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -43,6 +43,8 @@ #endif #endif +#pragma GCC diagnostic ignored "-Wmissing-prototypes" + #include "nolibc-test-linkage.h" /* for the type of int_fast16_t and int_fast32_t, musl differs from glibc and nolibc */ @@ -690,14 +692,14 @@ int expect_strtox(int llen, void *func, const char *input, int base, intmax_t ex __attribute__((constructor)) static void constructor1(void) { - constructor_test_value = 1; + constructor_test_value |= 1 << 0; } __attribute__((constructor)) static void constructor2(int argc, char **argv, char **envp) { if (argc && argv && envp) - constructor_test_value *= 2; + constructor_test_value |= 1 << 1; } int run_startup(int min, int max) @@ -736,9 +738,9 @@ int run_startup(int min, int max) CASE_TEST(environ_HOME); EXPECT_PTRNZ(1, getenv("HOME")); break; CASE_TEST(auxv_addr); EXPECT_PTRGT(test_auxv != (void *)-1, test_auxv, brk); break; CASE_TEST(auxv_AT_UID); EXPECT_EQ(1, getauxval(AT_UID), getuid()); break; - CASE_TEST(constructor); EXPECT_EQ(1, constructor_test_value, 2); break; + CASE_TEST(constructor); EXPECT_EQ(is_nolibc, constructor_test_value, 0x3); break; CASE_TEST(linkage_errno); EXPECT_PTREQ(1, linkage_test_errno_addr(), &errno); break; - CASE_TEST(linkage_constr); EXPECT_EQ(1, linkage_test_constructor_test_value, 6); break; + CASE_TEST(linkage_constr); EXPECT_EQ(1, linkage_test_constructor_test_value, 0x3); break; case __LINE__: return ret; /* must be last */ /* note: do not set any defaults so as to permit holes above */ @@ -767,6 +769,44 @@ int test_getdents64(const char *dir) return ret; } +static int test_dirent(void) +{ + int comm = 0, cmdline = 0; + struct dirent dirent, *result; + DIR *dir; + int ret; + + dir = opendir("/proc/self"); + if (!dir) + return 1; + + while (1) { + errno = 0; + ret = readdir_r(dir, &dirent, &result); + if (ret != 0) + return 1; + if (!result) + break; + + if (strcmp(dirent.d_name, "comm") == 0) + comm++; + else if (strcmp(dirent.d_name, "cmdline") == 0) + cmdline++; + } + + if (errno) + return 1; + + ret = closedir(dir); + if (ret) + return 1; + + if (comm != 1 || cmdline != 1) + return 1; + + return 0; +} + int test_getpagesize(void) { int x = getpagesize(); @@ -988,6 +1028,22 @@ int test_rlimit(void) return 0; } +int test_openat(void) +{ + int dev, null; + + dev = openat(AT_FDCWD, "/dev", O_DIRECTORY); + if (dev < 0) + return -1; + + null = openat(dev, "null", O_RDONLY); + close(dev); + if (null < 0) + return -1; + + close(null); + return 0; +} /* Run syscall tests between IDs <min> and <max>. * Return 0 on success, non-zero on failure. @@ -1059,6 +1115,7 @@ int run_syscall(int min, int max) CASE_TEST(fork); EXPECT_SYSZR(1, test_fork()); break; CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; + CASE_TEST(directories); EXPECT_SYSZR(proc, test_dirent()); break; CASE_TEST(gettimeofday_tv); EXPECT_SYSZR(1, gettimeofday(&tv, NULL)); break; CASE_TEST(gettimeofday_tv_tz);EXPECT_SYSZR(1, gettimeofday(&tv, &tz)); break; CASE_TEST(getpagesize); EXPECT_SYSZR(1, test_getpagesize()); break; @@ -1073,8 +1130,9 @@ int run_syscall(int min, int max) CASE_TEST(mmap_bad); EXPECT_PTRER(1, mmap(NULL, 0, PROT_READ, MAP_PRIVATE, 0, 0), MAP_FAILED, EINVAL); break; CASE_TEST(munmap_bad); EXPECT_SYSER(1, munmap(NULL, 0), -1, EINVAL); break; CASE_TEST(mmap_munmap_good); EXPECT_SYSZR(1, test_mmap_munmap()); break; - CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", 0), -1); if (tmp != -1) close(tmp); break; - CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", 0), -1, ENOENT); if (tmp != -1) close(tmp); break; + CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", O_RDONLY), -1); if (tmp != -1) close(tmp); break; + CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", O_RDONLY), -1, ENOENT); if (tmp != -1) close(tmp); break; + CASE_TEST(openat_dir); EXPECT_SYSZR(1, test_openat()); break; CASE_TEST(pipe); EXPECT_SYSZR(1, test_pipe()); break; CASE_TEST(poll_null); EXPECT_SYSZR(1, poll(NULL, 0, 0)); break; CASE_TEST(poll_stdout); EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break; @@ -1284,6 +1342,73 @@ static int expect_vfprintf(int llen, int c, const char *expected, const char *fm return ret; } +static int test_scanf(void) +{ + unsigned long long ull; + unsigned long ul; + unsigned int u; + long long ll; + long l; + void *p; + int i; + + /* return __LINE__ to point to the specific failure */ + + /* test EOF */ + if (sscanf("", "foo") != EOF) + return __LINE__; + + /* test simple literal without placeholder */ + if (sscanf("foo", "foo") != 0) + return __LINE__; + + /* test single placeholder */ + if (sscanf("123", "%d", &i) != 1) + return __LINE__; + + if (i != 123) + return __LINE__; + + /* test multiple place holders and separators */ + if (sscanf("a123b456c0x90", "a%db%uc%p", &i, &u, &p) != 3) + return __LINE__; + + if (i != 123) + return __LINE__; + + if (u != 456) + return __LINE__; + + if (p != (void *)0x90) + return __LINE__; + + /* test space handling */ + if (sscanf("a b1", "a b%d", &i) != 1) + return __LINE__; + + if (i != 1) + return __LINE__; + + /* test literal percent */ + if (sscanf("a%1", "a%%%d", &i) != 1) + return __LINE__; + + if (i != 1) + return __LINE__; + + /* test stdint.h types */ + if (sscanf("1|2|3|4|5|6", + "%d|%ld|%lld|%u|%lu|%llu", + &i, &l, &ll, &u, &ul, &ull) != 6) + return __LINE__; + + if (i != 1 || l != 2 || ll != 3 || + u != 4 || ul != 5 || ull != 6) + return __LINE__; + + return 0; +} + static int run_vfprintf(int min, int max) { int test; @@ -1305,6 +1430,7 @@ static int run_vfprintf(int min, int max) CASE_TEST(char); EXPECT_VFPRINTF(1, "c", "%c", 'c'); break; CASE_TEST(hex); EXPECT_VFPRINTF(1, "f", "%x", 0xf); break; CASE_TEST(pointer); EXPECT_VFPRINTF(3, "0x1", "%p", (void *) 0x1); break; + CASE_TEST(scanf); EXPECT_ZR(1, test_scanf()); break; case __LINE__: return ret; /* must be last */ /* note: do not set any defaults so as to permit holes above */ diff --git a/tools/testing/selftests/nolibc/run-tests.sh b/tools/testing/selftests/nolibc/run-tests.sh index 9c5160c53881..0299a0912d40 100755 --- a/tools/testing/selftests/nolibc/run-tests.sh +++ b/tools/testing/selftests/nolibc/run-tests.sh @@ -17,7 +17,16 @@ perform_download=0 test_mode=system werror=1 llvm= -archs="i386 x86_64 arm64 arm mips32le mips32be ppc ppc64 ppc64le riscv32 riscv64 s390 loongarch" +all_archs=( + i386 x86_64 + arm64 arm armthumb + mips32le mips32be + ppc ppc64 ppc64le + riscv32 riscv64 + s390x s390 + loongarch +) +archs="${all_archs[@]}" TEMP=$(getopt -o 'j:d:c:b:a:m:pelh' -n "$0" -- "$@") @@ -94,19 +103,21 @@ fi crosstool_arch() { case "$1" in arm64) echo aarch64;; + armthumb) echo arm;; ppc) echo powerpc;; ppc64) echo powerpc64;; ppc64le) echo powerpc64;; riscv) echo riscv64;; loongarch) echo loongarch64;; mips*) echo mips;; + s390*) echo s390;; *) echo "$1";; esac } crosstool_abi() { case "$1" in - arm) echo linux-gnueabi;; + arm | armthumb) echo linux-gnueabi;; *) echo linux;; esac } @@ -157,10 +168,6 @@ test_arch() { fi MAKE=(make -j"${nproc}" XARCH="${arch}" CROSS_COMPILE="${cross_compile}" LLVM="${llvm}" O="${build_dir}") - mkdir -p "$build_dir" - if [ "$test_mode" = "system" ] && [ ! -f "${build_dir}/.config" ]; then - swallow_output "${MAKE[@]}" defconfig - fi case "$test_mode" in 'system') test_target=run @@ -173,6 +180,13 @@ test_arch() { exit 1 esac printf '%-15s' "$arch:" + if [ "$arch" = "s390" ] && ([ "$llvm" = "1" ] || [ "$test_mode" = "user" ]); then + echo "Unsupported configuration" + return + fi + + mkdir -p "$build_dir" + swallow_output "${MAKE[@]}" defconfig swallow_output "${MAKE[@]}" CFLAGS_EXTRA="$CFLAGS_EXTRA" "$test_target" V=1 cp run.out run.out."${arch}" "${MAKE[@]}" report | grep passed diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index bf92481f925c..0406a065deb4 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -8,3 +8,5 @@ pidfd_getfd_test pidfd_setns_test pidfd_file_handle_test pidfd_bind_mount +pidfd_info_test +pidfd_exec_helper diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 301343a11b62..fcbefc0d77f6 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -3,7 +3,9 @@ CFLAGS += -g $(KHDR_INCLUDES) -pthread -Wall TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \ - pidfd_file_handle_test pidfd_bind_mount + pidfd_file_handle_test pidfd_bind_mount pidfd_info_test + +TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 0b96ac4b8ce5..cec22aa11cdf 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -12,6 +12,7 @@ #include <stdlib.h> #include <string.h> #include <syscall.h> +#include <sys/ioctl.h> #include <sys/types.h> #include <sys/wait.h> @@ -50,6 +51,107 @@ #define PIDFD_NONBLOCK O_NONBLOCK #endif +#ifndef PIDFD_SELF_THREAD +#define PIDFD_SELF_THREAD -10000 /* Current thread. */ +#endif + +#ifndef PIDFD_SELF_THREAD_GROUP +#define PIDFD_SELF_THREAD_GROUP -20000 /* Current thread group leader. */ +#endif + +#ifndef PIDFD_SELF +#define PIDFD_SELF PIDFD_SELF_THREAD +#endif + +#ifndef PIDFD_SELF_PROCESS +#define PIDFD_SELF_PROCESS PIDFD_SELF_THREAD_GROUP +#endif + +#ifndef PIDFS_IOCTL_MAGIC +#define PIDFS_IOCTL_MAGIC 0xFF +#endif + +#ifndef PIDFD_GET_CGROUP_NAMESPACE +#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) +#endif + +#ifndef PIDFD_GET_IPC_NAMESPACE +#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) +#endif + +#ifndef PIDFD_GET_MNT_NAMESPACE +#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) +#endif + +#ifndef PIDFD_GET_NET_NAMESPACE +#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) +#endif + +#ifndef PIDFD_GET_PID_NAMESPACE +#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) +#endif + +#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) +#endif + +#ifndef PIDFD_GET_TIME_NAMESPACE +#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) +#endif + +#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE +#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) +#endif + +#ifndef PIDFD_GET_USER_NAMESPACE +#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) +#endif + +#ifndef PIDFD_GET_UTS_NAMESPACE +#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) +#endif + +#ifndef PIDFD_GET_INFO +#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info) +#endif + +#ifndef PIDFD_INFO_PID +#define PIDFD_INFO_PID (1UL << 0) /* Always returned, even if not requested */ +#endif + +#ifndef PIDFD_INFO_CREDS +#define PIDFD_INFO_CREDS (1UL << 1) /* Always returned, even if not requested */ +#endif + +#ifndef PIDFD_INFO_CGROUPID +#define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */ +#endif + +#ifndef PIDFD_INFO_EXIT +#define PIDFD_INFO_EXIT (1UL << 3) /* Always returned if available, even if not requested */ +#endif + +#ifndef PIDFD_THREAD +#define PIDFD_THREAD O_EXCL +#endif + +struct pidfd_info { + __u64 mask; + __u64 cgroupid; + __u32 pid; + __u32 tgid; + __u32 ppid; + __u32 ruid; + __u32 rgid; + __u32 euid; + __u32 egid; + __u32 suid; + __u32 sgid; + __u32 fsuid; + __u32 fsgid; + __s32 exit_code; +}; + /* * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c * That means, when it wraps around any pid < 300 will be skipped. @@ -152,4 +254,11 @@ static inline ssize_t write_nointr(int fd, const void *buf, size_t count) return ret; } +static inline int sys_execveat(int dirfd, const char *pathname, + char *const argv[], char *const envp[], + int flags) +{ + return syscall(__NR_execveat, dirfd, pathname, argv, envp, flags); +} + #endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_exec_helper.c b/tools/testing/selftests/pidfd/pidfd_exec_helper.c new file mode 100644 index 000000000000..5516808c95f2 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_exec_helper.c @@ -0,0 +1,12 @@ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +int main(int argc, char *argv[]) +{ + if (pause()) + _exit(EXIT_FAILURE); + + _exit(EXIT_SUCCESS); +} diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index f062a986e382..f718aac75068 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -13,6 +13,7 @@ #include <syscall.h> #include <sys/wait.h> #include <sys/mman.h> +#include <sys/mount.h> #include "pidfd.h" #include "../kselftest.h" diff --git a/tools/testing/selftests/pidfd/pidfd_info_test.c b/tools/testing/selftests/pidfd/pidfd_info_test.c new file mode 100644 index 000000000000..1758a1b0457b --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_info_test.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/types.h> +#include <poll.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> +#include <sys/socket.h> +#include <linux/kcmp.h> +#include <sys/stat.h> + +#include "pidfd.h" +#include "../kselftest_harness.h" + +FIXTURE(pidfd_info) +{ + pid_t child_pid1; + int child_pidfd1; + + pid_t child_pid2; + int child_pidfd2; + + pid_t child_pid3; + int child_pidfd3; + + pid_t child_pid4; + int child_pidfd4; +}; + +FIXTURE_SETUP(pidfd_info) +{ + int ret; + int ipc_sockets[2]; + char c; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid1 = create_child(&self->child_pidfd1, 0); + EXPECT_GE(self->child_pid1, 0); + + if (self->child_pid1 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + /* SIGKILL but don't reap. */ + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0), 0); + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + self->child_pid2 = create_child(&self->child_pidfd2, 0); + EXPECT_GE(self->child_pid2, 0); + + if (self->child_pid2 == 0) { + close(ipc_sockets[0]); + + if (write_nointr(ipc_sockets[1], "1", 1) < 0) + _exit(EXIT_FAILURE); + + close(ipc_sockets[1]); + + pause(); + _exit(EXIT_SUCCESS); + } + + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + /* SIGKILL and reap. */ + EXPECT_EQ(sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0), 0); + EXPECT_EQ(sys_waitid(P_PID, self->child_pid2, NULL, WEXITED), 0); + + self->child_pid3 = create_child(&self->child_pidfd3, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid3, 0); + + if (self->child_pid3 == 0) + _exit(EXIT_SUCCESS); + + self->child_pid4 = create_child(&self->child_pidfd4, CLONE_NEWUSER | CLONE_NEWPID); + EXPECT_GE(self->child_pid4, 0); + + if (self->child_pid4 == 0) + _exit(EXIT_SUCCESS); + + EXPECT_EQ(sys_waitid(P_PID, self->child_pid4, NULL, WEXITED), 0); +} + +FIXTURE_TEARDOWN(pidfd_info) +{ + sys_pidfd_send_signal(self->child_pidfd1, SIGKILL, NULL, 0); + if (self->child_pidfd1 >= 0) + EXPECT_EQ(0, close(self->child_pidfd1)); + + sys_waitid(P_PID, self->child_pid1, NULL, WEXITED); + + sys_pidfd_send_signal(self->child_pidfd2, SIGKILL, NULL, 0); + if (self->child_pidfd2 >= 0) + EXPECT_EQ(0, close(self->child_pidfd2)); + + sys_waitid(P_PID, self->child_pid2, NULL, WEXITED); + sys_waitid(P_PID, self->child_pid3, NULL, WEXITED); + sys_waitid(P_PID, self->child_pid4, NULL, WEXITED); +} + +TEST_F(pidfd_info, sigkill_exit) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has exited but not been reaped so this must work. */ + ASSERT_EQ(ioctl(self->child_pidfd1, PIDFD_GET_INFO, &info), 0); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd1, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); +} + +TEST_F(pidfd_info, sigkill_reaped) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has already been reaped and PIDFD_INFO_EXIT hasn't been set. */ + ASSERT_NE(ioctl(self->child_pidfd2, PIDFD_GET_INFO, &info), 0); + ASSERT_EQ(errno, ESRCH); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd2, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); +} + +TEST_F(pidfd_info, success_exit) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has exited but not been reaped so this must work. */ + ASSERT_EQ(ioctl(self->child_pidfd3, PIDFD_GET_INFO, &info), 0); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd3, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); +} + +TEST_F(pidfd_info, success_reaped) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID, + }; + + /* Process has already been reaped and PIDFD_INFO_EXIT hasn't been set. */ + ASSERT_NE(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); + ASSERT_EQ(errno, ESRCH); + + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); +} + +TEST_F(pidfd_info, success_reaped_poll) +{ + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }; + struct pollfd fds = {}; + int nevents; + + fds.events = POLLIN; + fds.fd = self->child_pidfd2; + + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + ASSERT_EQ(ioctl(self->child_pidfd2, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); +} + +static void *pidfd_info_pause_thread(void *arg) +{ + pid_t pid_thread = gettid(); + int ipc_socket = *(int *)arg; + + /* Inform the grand-parent what the tid of this thread is. */ + if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + close(ipc_socket); + + /* Sleep untill we're killed. */ + pause(); + return NULL; +} + +TEST_F(pidfd_info, thread_group) +{ + pid_t pid_leader, pid_poller, pid_thread; + pthread_t thread; + int nevents, pidfd_leader, pidfd_thread, pidfd_leader_thread, ret; + int ipc_sockets[2]; + struct pollfd fds = {}; + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }, info2; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + pid_leader = create_child(&pidfd_leader, 0); + EXPECT_GE(pid_leader, 0); + + if (pid_leader == 0) { + close(ipc_sockets[0]); + + /* The thread will outlive the thread-group leader. */ + if (pthread_create(&thread, NULL, pidfd_info_pause_thread, &ipc_sockets[1])) + syscall(__NR_exit, EXIT_FAILURE); + + /* Make the thread-group leader exit prematurely. */ + syscall(__NR_exit, EXIT_SUCCESS); + } + + /* + * Opening a PIDFD_THREAD aka thread-specific pidfd based on a + * thread-group leader must succeed. + */ + pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD); + ASSERT_GE(pidfd_leader_thread, 0); + + pid_poller = fork(); + ASSERT_GE(pid_poller, 0); + if (pid_poller == 0) { + /* + * We can't poll and wait for the old thread-group + * leader to exit using a thread-specific pidfd. The + * thread-group leader exited prematurely and + * notification is delayed until all subthreads have + * exited. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, 10000 /* wait 5 seconds */); + if (nevents != 0) + _exit(EXIT_FAILURE); + if (fds.revents & POLLIN) + _exit(EXIT_FAILURE); + if (fds.revents & POLLHUP) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + /* Retrieve the tid of the thread. */ + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + /* Opening a thread as a thread-group leader must fail. */ + pidfd_thread = sys_pidfd_open(pid_thread, 0); + ASSERT_LT(pidfd_thread, 0); + + /* Opening a thread as a PIDFD_THREAD must succeed. */ + pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); + ASSERT_GE(pidfd_thread, 0); + + ASSERT_EQ(wait_for_pid(pid_poller), 0); + + /* + * Note that pidfd_leader is a thread-group pidfd, so polling on it + * would only notify us once all thread in the thread-group have + * exited. So we can't poll before we have taken down the whole + * thread-group. + */ + + /* Get PIDFD_GET_INFO using the thread-group leader pidfd. */ + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_leader); + + /* + * Now retrieve the same info using the thread specific pidfd + * for the thread-group leader. + */ + info2.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader_thread, PIDFD_GET_INFO, &info2), 0); + ASSERT_TRUE(!!(info2.mask & PIDFD_INFO_CREDS)); + /* Process has exited but not been reaped, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info2.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info2.pid, pid_leader); + + /* Now try the thread-specific pidfd. */ + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + /* The thread hasn't exited, so no PIDFD_INFO_EXIT information yet. */ + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_thread); + + /* + * Take down the whole thread-group. The thread-group leader + * exited successfully but the thread will now be SIGKILLed. + * This must be reflected in the recorded exit information. + */ + EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0); + EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0); + + fds.events = POLLIN; + fds.fd = pidfd_leader; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + /* The thread-group leader has been reaped. */ + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + /* + * Retrieve exit information for the thread-group leader via the + * thread-group leader pidfd. + */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + /* + * Retrieve exit information for the thread-group leader via the + * thread-specific pidfd. + */ + info2.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader_thread, PIDFD_GET_INFO, &info2), 0); + ASSERT_FALSE(!!(info2.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info2.mask & PIDFD_INFO_EXIT)); + + /* The thread-group leader exited successfully. Only the specific thread was SIGKILLed. */ + ASSERT_TRUE(WIFEXITED(info2.exit_code)); + ASSERT_EQ(WEXITSTATUS(info2.exit_code), 0); + + /* Retrieve exit information for the thread. */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + + /* The thread got SIGKILLed. */ + ASSERT_TRUE(WIFSIGNALED(info.exit_code)); + ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL); + + EXPECT_EQ(close(pidfd_leader), 0); + EXPECT_EQ(close(pidfd_thread), 0); +} + +static void *pidfd_info_thread_exec(void *arg) +{ + pid_t pid_thread = gettid(); + int ipc_socket = *(int *)arg; + + /* Inform the grand-parent what the tid of this thread is. */ + if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + if (read_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + close(ipc_socket); + + sys_execveat(AT_FDCWD, "pidfd_exec_helper", NULL, NULL, 0); + return NULL; +} + +TEST_F(pidfd_info, thread_group_exec) +{ + pid_t pid_leader, pid_poller, pid_thread; + pthread_t thread; + int nevents, pidfd_leader, pidfd_leader_thread, pidfd_thread, ret; + int ipc_sockets[2]; + struct pollfd fds = {}; + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + pid_leader = create_child(&pidfd_leader, 0); + EXPECT_GE(pid_leader, 0); + + if (pid_leader == 0) { + close(ipc_sockets[0]); + + /* The thread will outlive the thread-group leader. */ + if (pthread_create(&thread, NULL, pidfd_info_thread_exec, &ipc_sockets[1])) + syscall(__NR_exit, EXIT_FAILURE); + + /* Make the thread-group leader exit prematurely. */ + syscall(__NR_exit, EXIT_SUCCESS); + } + + /* Open a thread-specific pidfd for the thread-group leader. */ + pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD); + ASSERT_GE(pidfd_leader_thread, 0); + + pid_poller = fork(); + ASSERT_GE(pid_poller, 0); + if (pid_poller == 0) { + /* + * We can't poll and wait for the old thread-group + * leader to exit using a thread-specific pidfd. The + * thread-group leader exited prematurely and + * notification is delayed until all subthreads have + * exited. + * + * When the thread has execed it will taken over the old + * thread-group leaders struct pid. Calling poll after + * the thread execed will thus block again because a new + * thread-group has started. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, 10000 /* wait 5 seconds */); + if (nevents != 0) + _exit(EXIT_FAILURE); + if (fds.revents & POLLIN) + _exit(EXIT_FAILURE); + if (fds.revents & POLLHUP) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + /* Retrieve the tid of the thread. */ + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + + /* Opening a thread as a PIDFD_THREAD must succeed. */ + pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); + ASSERT_GE(pidfd_thread, 0); + + /* Now that we've opened a thread-specific pidfd the thread can exec. */ + ASSERT_EQ(write_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + EXPECT_EQ(close(ipc_sockets[0]), 0); + + ASSERT_EQ(wait_for_pid(pid_poller), 0); + + /* Wait until the kernel has SIGKILLed the thread. */ + fds.events = POLLHUP; + fds.fd = pidfd_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + /* The thread has been reaped. */ + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + /* Retrieve thread-specific exit info from pidfd. */ + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + /* + * While the kernel will have SIGKILLed the whole thread-group + * during exec it will cause the individual threads to exit + * cleanly. + */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + /* + * The thread-group leader is still alive, the thread has taken + * over its struct pid and thus its pid number. + */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_leader); + + /* Take down the thread-group leader. */ + EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0); + + /* + * Afte the exec we're dealing with an empty thread-group so now + * we must see an exit notification on the thread-specific pidfd + * for the thread-group leader as there's no subthread that can + * revive the struct pid. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + ASSERT_FALSE(!!(fds.revents & POLLHUP)); + + EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0); + + /* Retrieve exit information for the thread-group leader. */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + + EXPECT_EQ(close(pidfd_leader), 0); + EXPECT_EQ(close(pidfd_thread), 0); +} + +static void *pidfd_info_thread_exec_sane(void *arg) +{ + pid_t pid_thread = gettid(); + int ipc_socket = *(int *)arg; + + /* Inform the grand-parent what the tid of this thread is. */ + if (write_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + if (read_nointr(ipc_socket, &pid_thread, sizeof(pid_thread)) != sizeof(pid_thread)) + return NULL; + + close(ipc_socket); + + sys_execveat(AT_FDCWD, "pidfd_exec_helper", NULL, NULL, 0); + return NULL; +} + +TEST_F(pidfd_info, thread_group_exec_thread) +{ + pid_t pid_leader, pid_poller, pid_thread; + pthread_t thread; + int nevents, pidfd_leader, pidfd_leader_thread, pidfd_thread, ret; + int ipc_sockets[2]; + struct pollfd fds = {}; + struct pidfd_info info = { + .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, + }; + + ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets); + EXPECT_EQ(ret, 0); + + pid_leader = create_child(&pidfd_leader, 0); + EXPECT_GE(pid_leader, 0); + + if (pid_leader == 0) { + close(ipc_sockets[0]); + + /* The thread will outlive the thread-group leader. */ + if (pthread_create(&thread, NULL, pidfd_info_thread_exec_sane, &ipc_sockets[1])) + syscall(__NR_exit, EXIT_FAILURE); + + /* + * Pause the thread-group leader. It will be killed once + * the subthread execs. + */ + pause(); + syscall(__NR_exit, EXIT_SUCCESS); + } + + /* Retrieve the tid of the thread. */ + EXPECT_EQ(close(ipc_sockets[1]), 0); + ASSERT_EQ(read_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + + /* Opening a thread as a PIDFD_THREAD must succeed. */ + pidfd_thread = sys_pidfd_open(pid_thread, PIDFD_THREAD); + ASSERT_GE(pidfd_thread, 0); + + /* Open a thread-specific pidfd for the thread-group leader. */ + pidfd_leader_thread = sys_pidfd_open(pid_leader, PIDFD_THREAD); + ASSERT_GE(pidfd_leader_thread, 0); + + pid_poller = fork(); + ASSERT_GE(pid_poller, 0); + if (pid_poller == 0) { + /* + * The subthread will now exec. The struct pid of the old + * thread-group leader will be assumed by the subthread which + * becomes the new thread-group leader. So no exit notification + * must be generated. Wait for 5 seconds and call it a success + * if no notification has been received. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, 10000 /* wait 5 seconds */); + if (nevents != 0) + _exit(EXIT_FAILURE); + if (fds.revents & POLLIN) + _exit(EXIT_FAILURE); + if (fds.revents & POLLHUP) + _exit(EXIT_FAILURE); + _exit(EXIT_SUCCESS); + } + + /* Now that we've opened a thread-specific pidfd the thread can exec. */ + ASSERT_EQ(write_nointr(ipc_sockets[0], &pid_thread, sizeof(pid_thread)), sizeof(pid_thread)); + EXPECT_EQ(close(ipc_sockets[0]), 0); + ASSERT_EQ(wait_for_pid(pid_poller), 0); + + /* Wait until the kernel has SIGKILLed the thread. */ + fds.events = POLLHUP; + fds.fd = pidfd_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + /* The thread has been reaped. */ + ASSERT_TRUE(!!(fds.revents & POLLHUP)); + + /* Retrieve thread-specific exit info from pidfd. */ + ASSERT_EQ(ioctl(pidfd_thread, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + /* + * While the kernel will have SIGKILLed the whole thread-group + * during exec it will cause the individual threads to exit + * cleanly. + */ + ASSERT_TRUE(WIFEXITED(info.exit_code)); + ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); + + /* + * The thread-group leader is still alive, the thread has taken + * over its struct pid and thus its pid number. + */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_EXIT)); + ASSERT_EQ(info.pid, pid_leader); + + /* Take down the thread-group leader. */ + EXPECT_EQ(sys_pidfd_send_signal(pidfd_leader, SIGKILL, NULL, 0), 0); + + /* + * Afte the exec we're dealing with an empty thread-group so now + * we must see an exit notification on the thread-specific pidfd + * for the thread-group leader as there's no subthread that can + * revive the struct pid. + */ + fds.events = POLLIN; + fds.fd = pidfd_leader_thread; + nevents = poll(&fds, 1, -1); + ASSERT_EQ(nevents, 1); + ASSERT_TRUE(!!(fds.revents & POLLIN)); + ASSERT_FALSE(!!(fds.revents & POLLHUP)); + + EXPECT_EQ(sys_waitid(P_PIDFD, pidfd_leader, NULL, WEXITED), 0); + + /* Retrieve exit information for the thread-group leader. */ + info.mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT; + ASSERT_EQ(ioctl(pidfd_leader, PIDFD_GET_INFO, &info), 0); + ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); + ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); + + EXPECT_EQ(close(pidfd_leader), 0); + EXPECT_EQ(close(pidfd_thread), 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c index ce413a221bac..cd3de40e4977 100644 --- a/tools/testing/selftests/pidfd/pidfd_open_test.c +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -22,32 +22,6 @@ #include "pidfd.h" #include "../kselftest.h" -#ifndef PIDFS_IOCTL_MAGIC -#define PIDFS_IOCTL_MAGIC 0xFF -#endif - -#ifndef PIDFD_GET_INFO -#define PIDFD_GET_INFO _IOWR(PIDFS_IOCTL_MAGIC, 11, struct pidfd_info) -#define PIDFD_INFO_CGROUPID (1UL << 0) - -struct pidfd_info { - __u64 request_mask; - __u64 cgroupid; - __u32 pid; - __u32 tgid; - __u32 ppid; - __u32 ruid; - __u32 rgid; - __u32 euid; - __u32 egid; - __u32 suid; - __u32 sgid; - __u32 fsuid; - __u32 fsgid; - __u32 spare0[1]; -}; -#endif - static int safe_int(const char *numstr, int *converted) { char *err = NULL; @@ -148,7 +122,7 @@ out: int main(int argc, char **argv) { struct pidfd_info info = { - .request_mask = PIDFD_INFO_CGROUPID, + .mask = PIDFD_INFO_CGROUPID, }; int pidfd = -1, ret = 1; pid_t pid; @@ -227,7 +201,7 @@ int main(int argc, char **argv) getegid(), info.sgid); goto on_error; } - if ((info.request_mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) { + if ((info.mask & PIDFD_INFO_CGROUPID) && info.cgroupid == 0) { ksft_print_msg("cgroupid should not be 0 when PIDFD_INFO_CGROUPID is set\n"); goto on_error; } diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c index 222f8131283b..e6a079b3d5e2 100644 --- a/tools/testing/selftests/pidfd/pidfd_setns_test.c +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -16,55 +16,10 @@ #include <unistd.h> #include <sys/socket.h> #include <sys/stat.h> -#include <linux/ioctl.h> #include "pidfd.h" #include "../kselftest_harness.h" -#ifndef PIDFS_IOCTL_MAGIC -#define PIDFS_IOCTL_MAGIC 0xFF -#endif - -#ifndef PIDFD_GET_CGROUP_NAMESPACE -#define PIDFD_GET_CGROUP_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 1) -#endif - -#ifndef PIDFD_GET_IPC_NAMESPACE -#define PIDFD_GET_IPC_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 2) -#endif - -#ifndef PIDFD_GET_MNT_NAMESPACE -#define PIDFD_GET_MNT_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 3) -#endif - -#ifndef PIDFD_GET_NET_NAMESPACE -#define PIDFD_GET_NET_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 4) -#endif - -#ifndef PIDFD_GET_PID_NAMESPACE -#define PIDFD_GET_PID_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 5) -#endif - -#ifndef PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE -#define PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 6) -#endif - -#ifndef PIDFD_GET_TIME_NAMESPACE -#define PIDFD_GET_TIME_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 7) -#endif - -#ifndef PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE -#define PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 8) -#endif - -#ifndef PIDFD_GET_USER_NAMESPACE -#define PIDFD_GET_USER_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 9) -#endif - -#ifndef PIDFD_GET_UTS_NAMESPACE -#define PIDFD_GET_UTS_NAMESPACE _IO(PIDFS_IOCTL_MAGIC, 10) -#endif - enum { PIDFD_NS_USER, PIDFD_NS_MNT, diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c index e9728e86b4f2..fcd85cad9f18 100644 --- a/tools/testing/selftests/pidfd/pidfd_test.c +++ b/tools/testing/selftests/pidfd/pidfd_test.c @@ -42,12 +42,41 @@ static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *)) #endif } -static int signal_received; +static pthread_t signal_received; static void set_signal_received_on_sigusr1(int sig) { if (sig == SIGUSR1) - signal_received = 1; + signal_received = pthread_self(); +} + +static int send_signal(int pidfd) +{ + int ret = 0; + + if (sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0) < 0) { + ret = -EINVAL; + goto exit; + } + + if (signal_received != pthread_self()) { + ret = -EINVAL; + goto exit; + } + +exit: + signal_received = 0; + return ret; +} + +static void *send_signal_worker(void *arg) +{ + int pidfd = (int)(intptr_t)arg; + int ret; + + /* We forward any errors for the caller to handle. */ + ret = send_signal(pidfd); + return (void *)(intptr_t)ret; } /* @@ -56,8 +85,11 @@ static void set_signal_received_on_sigusr1(int sig) */ static int test_pidfd_send_signal_simple_success(void) { - int pidfd, ret; + int pidfd; const char *test_name = "pidfd_send_signal send SIGUSR1"; + pthread_t thread; + void *thread_res; + int err; if (!have_pidfd_send_signal) { ksft_test_result_skip( @@ -66,25 +98,45 @@ static int test_pidfd_send_signal_simple_success(void) return 0; } + signal(SIGUSR1, set_signal_received_on_sigusr1); + + /* Try sending a signal to ourselves via /proc/self. */ pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC); if (pidfd < 0) ksft_exit_fail_msg( "%s test: Failed to open process file descriptor\n", test_name); + err = send_signal(pidfd); + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on sending pidfd signal\n", + test_name, err); + close(pidfd); - signal(SIGUSR1, set_signal_received_on_sigusr1); + /* Now try the same thing only using PIDFD_SELF_THREAD_GROUP. */ + err = send_signal(PIDFD_SELF_THREAD_GROUP); + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on PIDFD_SELF_THREAD_GROUP signal\n", + test_name, err); - ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0); - close(pidfd); - if (ret < 0) - ksft_exit_fail_msg("%s test: Failed to send signal\n", + /* + * Now try the same thing in a thread and assert thread ID is equal to + * worker thread ID. + */ + if (pthread_create(&thread, NULL, send_signal_worker, + (void *)(intptr_t)PIDFD_SELF_THREAD)) + ksft_exit_fail_msg("%s test: Failed to create thread\n", test_name); - - if (signal_received != 1) - ksft_exit_fail_msg("%s test: Failed to receive signal\n", + if (pthread_join(thread, &thread_res)) + ksft_exit_fail_msg("%s test: Failed to join thread\n", test_name); + err = (int)(intptr_t)thread_res; + if (err) + ksft_exit_fail_msg( + "%s test: Error %d on PIDFD_SELF_THREAD signal\n", + test_name, err); - signal_received = 0; ksft_test_result_pass("%s test: Sent signal\n", test_name); return 0; } diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h index 3a0129467de6..d6deb6ffa1b9 100644 --- a/tools/testing/selftests/powerpc/include/pkeys.h +++ b/tools/testing/selftests/powerpc/include/pkeys.h @@ -24,6 +24,9 @@ #undef PKEY_DISABLE_EXECUTE #define PKEY_DISABLE_EXECUTE 0x4 +#undef PKEY_UNRESTRICTED +#define PKEY_UNRESTRICTED 0x0 + /* Older versions of libc do not define this */ #ifndef SEGV_PKUERR #define SEGV_PKUERR 4 @@ -93,7 +96,7 @@ int pkeys_unsupported(void) SKIP_IF(!hash_mmu); /* Check if the system call is supported */ - pkey = sys_pkey_alloc(0, 0); + pkey = sys_pkey_alloc(0, PKEY_UNRESTRICTED); SKIP_IF(pkey < 0); sys_pkey_free(pkey); diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c index 0af4f02669a1..29b91b7456eb 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c @@ -72,7 +72,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) switch (fault_type) { case PKEY_DISABLE_ACCESS: - pkey_set_rights(fault_pkey, 0); + pkey_set_rights(fault_pkey, PKEY_UNRESTRICTED); break; case PKEY_DISABLE_EXECUTE: /* diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c index 2db76e56d4cb..e89a164c686b 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c +++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c @@ -83,7 +83,7 @@ static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) mprotect(pgstart, pgsize, PROT_EXEC)) _exit(1); else - pkey_set_rights(pkey, 0); + pkey_set_rights(pkey, PKEY_UNRESTRICTED); fault_count++; } diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c index f061434af452..7ff53caeb4aa 100644 --- a/tools/testing/selftests/powerpc/ptrace/core-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c @@ -95,16 +95,16 @@ static int child(struct shared_info *info) /* Get some pkeys so that we can change their bits in the AMR. */ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE); if (pkey1 < 0) { - pkey1 = sys_pkey_alloc(0, 0); + pkey1 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey1 < 0); disable_execute = false; } - pkey2 = sys_pkey_alloc(0, 0); + pkey2 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey2 < 0); - pkey3 = sys_pkey_alloc(0, 0); + pkey3 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); FAIL_IF(pkey3 < 0); info->amr |= 3ul << pkeyshift(pkey1) | 2ul << pkeyshift(pkey2); diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c index fc633014424f..10f63042cf91 100644 --- a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c +++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c @@ -57,16 +57,16 @@ static int child(struct shared_info *info) /* Get some pkeys so that we can change their bits in the AMR. */ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE); if (pkey1 < 0) { - pkey1 = sys_pkey_alloc(0, 0); + pkey1 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey1 < 0, &info->child_sync); disable_execute = false; } - pkey2 = sys_pkey_alloc(0, 0); + pkey2 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey2 < 0, &info->child_sync); - pkey3 = sys_pkey_alloc(0, 0); + pkey3 = sys_pkey_alloc(0, PKEY_UNRESTRICTED); CHILD_FAIL_IF(pkey3 < 0, &info->child_sync); info->amr1 |= 3ul << pkeyshift(pkey1); diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c index 58064151f2c8..edc08a4433fd 100644 --- a/tools/testing/selftests/ptp/testptp.c +++ b/tools/testing/selftests/ptp/testptp.c @@ -140,6 +140,7 @@ static void usage(char *progname) " -H val set output phase to 'val' nanoseconds (requires -p)\n" " -w val set output pulse width to 'val' nanoseconds (requires -p)\n" " -P val enable or disable (val=1|0) the system clock PPS\n" + " -r open the ptp clock in readonly mode\n" " -s set the ptp clock time from the system time\n" " -S set the system time from the ptp clock time\n" " -t val shift the ptp clock time by 'val' seconds\n" @@ -188,6 +189,7 @@ int main(int argc, char *argv[]) int pin_index = -1, pin_func; int pps = -1; int seconds = 0; + int readonly = 0; int settime = 0; int channel = -1; clockid_t ext_clockid = CLOCK_REALTIME; @@ -200,7 +202,7 @@ int main(int argc, char *argv[]) progname = strrchr(argv[0], '/'); progname = progname ? 1+progname : argv[0]; - while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:sSt:T:w:x:Xy:z"))) { + while (EOF != (c = getopt(argc, argv, "cd:e:f:F:ghH:i:k:lL:n:o:p:P:rsSt:T:w:x:Xy:z"))) { switch (c) { case 'c': capabilities = 1; @@ -252,6 +254,9 @@ int main(int argc, char *argv[]) case 'P': pps = atoi(optarg); break; + case 'r': + readonly = 1; + break; case 's': settime = 1; break; @@ -308,7 +313,7 @@ int main(int argc, char *argv[]) } } - fd = open(device, O_RDWR); + fd = open(device, readonly ? O_RDONLY : O_RDWR); if (fd < 0) { fprintf(stderr, "opening %s: %s\n", device, strerror(errno)); return -1; @@ -436,14 +441,16 @@ int main(int argc, char *argv[]) } if (extts) { - memset(&extts_request, 0, sizeof(extts_request)); - extts_request.index = index; - extts_request.flags = PTP_ENABLE_FEATURE; - if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) { - perror("PTP_EXTTS_REQUEST"); - extts = 0; - } else { - puts("external time stamp request okay"); + if (!readonly) { + memset(&extts_request, 0, sizeof(extts_request)); + extts_request.index = index; + extts_request.flags = PTP_ENABLE_FEATURE; + if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) { + perror("PTP_EXTTS_REQUEST"); + extts = 0; + } else { + puts("external time stamp request okay"); + } } for (; extts; extts--) { cnt = read(fd, &event, sizeof(event)); @@ -455,10 +462,12 @@ int main(int argc, char *argv[]) event.t.sec, event.t.nsec); fflush(stdout); } - /* Disable the feature again. */ - extts_request.flags = 0; - if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) { - perror("PTP_EXTTS_REQUEST"); + if (!readonly) { + /* Disable the feature again. */ + extts_request.flags = 0; + if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) { + perror("PTP_EXTTS_REQUEST"); + } } } diff --git a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh index 2e63ef009d59..2db12c5cad9c 100755 --- a/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh +++ b/tools/testing/selftests/rcutorture/bin/srcu_lockdep.sh @@ -49,7 +49,7 @@ do do err= val=$((d*1000+t*10+c)) - tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --bootargs "rcutorture.test_srcu_lockdep=$val" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 5s --configs "SRCU-P" --kconfig "CONFIG_FORCE_NEED_SRCU_NMI_SAFE=y" --bootargs "rcutorture.test_srcu_lockdep=$val rcutorture.reader_flavor=0x2" --trust-make --datestamp "$ds/$val" > "$T/kvm.sh.out" 2>&1 ret=$? mv "$T/kvm.sh.out" "$RCUTORTURE/res/$ds/$val" if test "$d" -ne 0 && test "$ret" -eq 0 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot index 2db39f298d18..fb61703690cb 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot @@ -2,3 +2,4 @@ rcutorture.torture_type=srcud rcupdate.rcu_self_test=1 rcutorture.fwd_progress=3 srcutree.big_cpu_lim=5 +rcutorture.reader_flavor=0x8 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot index c419cac233ee..54f5c9053474 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot @@ -2,3 +2,9 @@ rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 rcupdate.rcu_self_test=1 + +# This part is for synchronize_rcu() testing +rcutorture.nfakewriters=-1 +rcutorture.gp_sync=1 +rcupdate.rcu_normal=1 +rcutree.rcu_normal_wake_from_gp=1 diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 index d30922d8c883..352393bc5c56 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07 @@ -1,7 +1,8 @@ CONFIG_SMP=y CONFIG_NR_CPUS=16 -CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT_LAZY=y CONFIG_PREEMPT=n CONFIG_PREEMPT_DYNAMIC=n #CHECK#CONFIG_TREE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 index 759ee51d3ddc..420632b030dc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10 @@ -1,6 +1,7 @@ CONFIG_SMP=y CONFIG_NR_CPUS=74 -CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_LAZY=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n CONFIG_PREEMPT_DYNAMIC=n diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore index 16496de5f6ce..0fda241fa62b 100644 --- a/tools/testing/selftests/rseq/.gitignore +++ b/tools/testing/selftests/rseq/.gitignore @@ -9,3 +9,4 @@ param_test_compare_twice param_test_mm_cid param_test_mm_cid_benchmark param_test_mm_cid_compare_twice +syscall_errors_test diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile index 5a3432fceb58..0d0a5fae5954 100644 --- a/tools/testing/selftests/rseq/Makefile +++ b/tools/testing/selftests/rseq/Makefile @@ -16,11 +16,12 @@ OVERRIDE_TARGETS = 1 TEST_GEN_PROGS = basic_test basic_percpu_ops_test basic_percpu_ops_mm_cid_test param_test \ param_test_benchmark param_test_compare_twice param_test_mm_cid \ - param_test_mm_cid_benchmark param_test_mm_cid_compare_twice + param_test_mm_cid_benchmark param_test_mm_cid_compare_twice \ + syscall_errors_test TEST_GEN_PROGS_EXTENDED = librseq.so -TEST_PROGS = run_param_test.sh +TEST_PROGS = run_param_test.sh run_syscall_errors_test.sh TEST_FILES := settings @@ -54,3 +55,7 @@ $(OUTPUT)/param_test_mm_cid_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \ $(OUTPUT)/param_test_mm_cid_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \ rseq.h rseq-*.h $(CC) $(CFLAGS) -DBUILDOPT_RSEQ_PERCPU_MM_CID -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@ + +$(OUTPUT)/syscall_errors_test: syscall_errors_test.c $(TEST_GEN_PROGS_EXTENDED) \ + rseq.h rseq-*.h + $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@ diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index f6156790c3b4..663a9cef1952 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -71,9 +71,20 @@ static int rseq_ownership; /* Original struct rseq allocation size is 32 bytes. */ #define ORIG_RSEQ_ALLOC_SIZE 32 +/* + * Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an + * rseq registration that is larger than the current rseq ABI. + */ +union rseq_tls { + struct rseq_abi abi; + char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE]; +}; + static -__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = { - .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED, +__thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = { + .abi = { + .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED, + }, }; static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len, @@ -87,7 +98,7 @@ static int sys_getcpu(unsigned *cpu, unsigned *node) return syscall(__NR_getcpu, cpu, node, NULL); } -int rseq_available(void) +bool rseq_available(void) { int rc; @@ -96,9 +107,9 @@ int rseq_available(void) abort(); switch (errno) { case ENOSYS: - return 0; + return false; case EINVAL: - return 1; + return true; default: abort(); } @@ -149,7 +160,7 @@ int rseq_register_current_thread(void) /* Treat libc's ownership as a successful registration. */ return 0; } - rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); if (rc) { /* * After at least one thread has registered successfully @@ -183,7 +194,7 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. */ return 0; } - rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq.abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; @@ -249,7 +260,7 @@ void rseq_init(void) rseq_ownership = 1; /* Calculate the offset of the rseq area from the thread pointer. */ - rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer(); + rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer(); /* rseq flags are deprecated, always set to 0. */ rseq_flags = 0; diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index ba424ce80a71..f51a5fdb0444 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -160,6 +160,11 @@ int32_t rseq_fallback_current_cpu(void); int32_t rseq_fallback_current_node(void); /* + * Returns true if rseq is supported. + */ +bool rseq_available(void); + +/* * Values returned can be either the current CPU number, -1 (rseq is * uninitialized), or -2 (rseq initialization has failed). */ diff --git a/tools/testing/selftests/rseq/run_syscall_errors_test.sh b/tools/testing/selftests/rseq/run_syscall_errors_test.sh new file mode 100755 index 000000000000..9272246b39f2 --- /dev/null +++ b/tools/testing/selftests/rseq/run_syscall_errors_test.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# SPDX-License-Identifier: MIT +# SPDX-FileCopyrightText: 2024 Michael Jeanson <mjeanson@efficios.com> + +GLIBC_TUNABLES="${GLIBC_TUNABLES:-}:glibc.pthread.rseq=0" ./syscall_errors_test diff --git a/tools/testing/selftests/rseq/syscall_errors_test.c b/tools/testing/selftests/rseq/syscall_errors_test.c new file mode 100644 index 000000000000..a5d9e1f8a2dc --- /dev/null +++ b/tools/testing/selftests/rseq/syscall_errors_test.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2024 Michael Jeanson <mjeanson@efficios.com> + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <assert.h> +#include <stdint.h> +#include <syscall.h> +#include <string.h> +#include <unistd.h> + +#include "rseq.h" + +static int sys_rseq(void *rseq_abi, uint32_t rseq_len, + int flags, uint32_t sig) +{ + return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); +} + +/* + * Check the value of errno on some expected failures of the rseq syscall. + */ + +int main(void) +{ + struct rseq_abi *global_rseq = rseq_get_abi(); + int ret; + int errno_copy; + + if (!rseq_available()) { + fprintf(stderr, "rseq syscall unavailable"); + goto error; + } + + /* The current thread is NOT registered. */ + + /* EINVAL */ + errno = 0; + ret = sys_rseq(global_rseq, 32, -1, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid flag fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + errno = 0; + ret = sys_rseq((char *) global_rseq + 1, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with unaligned rseq_abi fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 31, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid size fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + +#if defined(__LP64__) && (!defined(__s390__) && !defined(__s390x__)) + /* + * We haven't found a reliable way to find an invalid address when + * running a 32bit userspace on a 64bit kernel, so only run this test + * on 64bit builds for the moment. + * + * Also exclude architectures that select + * CONFIG_ALTERNATE_USER_ADDRESS_SPACE where the kernel and userspace + * have their own address space and this failure can't happen. + */ + + /* EFAULT */ + errno = 0; + ret = sys_rseq((void *) -4096UL, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration with invalid address fails with errno set to EFAULT (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EFAULT) + goto error; +#endif + + errno = 0; + ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Registration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret != 0 && errno != 0) + goto error; + + /* The current thread is registered. */ + + /* EBUSY */ + errno = 0; + ret = sys_rseq(global_rseq, 32, 0, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Double registration fails with errno set to EBUSY (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EBUSY) + goto error; + + /* EPERM */ + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG + 1); + errno_copy = errno; + fprintf(stderr, "Unregistration with wrong RSEQ_SIG fails with errno to EPERM (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EPERM) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Unregistration succeeds for the current thread (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret != 0) + goto error; + + errno = 0; + ret = sys_rseq(global_rseq, 32, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + errno_copy = errno; + fprintf(stderr, "Double unregistration fails with errno set to EINVAL (ret = %d, errno = %s)\n", ret, strerrorname_np(errno_copy)); + if (ret == 0 || errno_copy != EINVAL) + goto error; + + return 0; +error: + return -1; +} diff --git a/tools/testing/selftests/sched/config b/tools/testing/selftests/sched/config index e8b09aa7c0c4..1bb8bf6d7fd4 100644 --- a/tools/testing/selftests/sched/config +++ b/tools/testing/selftests/sched/config @@ -1 +1 @@ -CONFIG_SCHED_DEBUG=y +# empty diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 011762224600..f4531327b8e7 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -172,6 +172,7 @@ auto-test-targets := \ maximal \ maybe_null \ minimal \ + numa \ prog_run \ reload_loop \ select_cpu_dfl \ diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config index 0de9b4ee249d..aa901b05c8ad 100644 --- a/tools/testing/selftests/sched_ext/config +++ b/tools/testing/selftests/sched_ext/config @@ -1,4 +1,3 @@ -CONFIG_SCHED_DEBUG=y CONFIG_SCHED_CLASS_EXT=y CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c new file mode 100644 index 000000000000..a79d86ed54a1 --- /dev/null +++ b/tools/testing/selftests/sched_ext/numa.bpf.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that validates the behavior of the NUMA-aware + * functionalities. + * + * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks + * are exclusively processed by CPUs within their respective nodes. Idle + * CPUs are selected only within the same node, so task migration can only + * occurs between CPUs belonging to the same node. + * + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE; + +static bool is_cpu_idle(s32 cpu, int node) +{ + const struct cpumask *idle_cpumask; + bool idle; + + idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node); + idle = bpf_cpumask_test_cpu(cpu, idle_cpumask); + scx_bpf_put_cpumask(idle_cpumask); + + return idle; +} + +s32 BPF_STRUCT_OPS(numa_select_cpu, + struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); + s32 cpu; + + /* + * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here, + * since it already tries to pick an idle CPU within the node + * first, but let's use both functions for better testing coverage. + */ + cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, + __COMPAT_SCX_PICK_IDLE_IN_NODE); + if (cpu < 0) + cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node, + __COMPAT_SCX_PICK_IDLE_IN_NODE); + + if (is_cpu_idle(cpu, node)) + scx_bpf_error("CPU %d should be marked as busy", cpu); + + if (__COMPAT_scx_bpf_cpu_node(cpu) != node) + scx_bpf_error("CPU %d should be in node %d", cpu, node); + + return cpu; +} + +void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags) +{ + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); + + scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev) +{ + int node = __COMPAT_scx_bpf_cpu_node(cpu); + + scx_bpf_dsq_move_to_local(node); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init) +{ + int node, err; + + bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) { + err = scx_bpf_create_dsq(node, node); + if (err) + return err; + } + + return 0; +} + +void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops numa_ops = { + .select_cpu = (void *)numa_select_cpu, + .enqueue = (void *)numa_enqueue, + .dispatch = (void *)numa_dispatch, + .init = (void *)numa_init, + .exit = (void *)numa_exit, + .name = "numa", +}; diff --git a/tools/testing/selftests/sched_ext/numa.c b/tools/testing/selftests/sched_ext/numa.c new file mode 100644 index 000000000000..b060c3b65c82 --- /dev/null +++ b/tools/testing/selftests/sched_ext/numa.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "numa.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct numa *skel; + + skel = numa__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE; + skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE; + SCX_FAIL_IF(numa__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct numa *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.numa_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + /* Just sleeping is fine, plenty of scheduling events happening */ + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct numa *skel = ctx; + + numa__destroy(skel); +} + +struct scx_test numa = { + .name = "numa", + .description = "Verify NUMA-aware functionalities", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&numa) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 14ba51b52095..b2f76a52215a 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -155,6 +155,12 @@ struct seccomp_data { # endif #endif +#ifndef __NR_uretprobe +# if defined(__x86_64__) +# define __NR_uretprobe 335 +# endif +#endif + #ifndef SECCOMP_SET_MODE_STRICT #define SECCOMP_SET_MODE_STRICT 0 #endif diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 84472b436c07..db1616857d89 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -21,7 +21,7 @@ TEST_FILE=$(mktemp) # ENABLED: 1 if enabled, 0 otherwise # TARGET: test target file required on the test_sysctl module # SKIP_NO_TARGET: 1 skip if TARGET not there -# 0 run eventhough TARGET not there +# 0 run even though TARGET not there # # Once these are enabled please leave them as-is. Write your own test, # we have tons of space. @@ -764,7 +764,7 @@ sysctl_test_0007() fi if [ ! -f /proc/cmdline ]; then - echo -e "SKIPPING\nThere is no /proc/cmdline to check for paramter" + echo -e "SKIPPING\nThere is no /proc/cmdline to check for parameter" return $ksft_skip fi @@ -857,7 +857,7 @@ list_tests() echo echo "TEST_ID x NUM_TEST" echo "TEST_ID: Test ID" - echo "NUM_TESTS: Number of recommended times to run the test" + echo "NUM_TESTS: Recommended number of times to run the test" echo echo "0001 x $(get_test_count 0001) - tests proc_dointvec_minmax()" echo "0002 x $(get_test_count 0002) - tests proc_dostring()" @@ -884,7 +884,7 @@ usage() echo "Valid tests: 0001-$MAX_TEST" echo "" echo " all Runs all tests (default)" - echo " -t Run test ID the number amount of times is recommended" + echo " -t Run test ID the recommended number of times" echo " -w Watch test ID run until it runs into an error" echo " -c Run test ID once" echo " -s Run test ID x test-count number of times" @@ -898,7 +898,7 @@ usage() echo Example uses: echo echo "$TEST_NAME.sh -- executes all tests" - echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 number of times is recomended" + echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 the recommended number of times" echo "$TEST_NAME.sh -w 0002 -- Watch test ID 0002 run until an error occurs" echo "$TEST_NAME.sh -s 0002 -- Run test ID 0002 once" echo "$TEST_NAME.sh -c 0002 3 -- Run test ID 0002 three times" diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json index dd8109768f8f..5596f4df0e9f 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json @@ -689,7 +689,7 @@ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m continue index 1", "expExitCode": "0", "verifyCmd": "$TC actions get action police index 1", - "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action continue", + "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst (1024Kb|1Mb) mtu 2Kb action continue", "matchCount": "1", "teardown": [ "$TC actions flush action police" @@ -716,7 +716,7 @@ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m drop index 1", "expExitCode": "0", "verifyCmd": "$TC actions ls action police", - "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action drop", + "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst (1024Kb|1Mb) mtu 2Kb action drop", "matchCount": "1", "teardown": [ "$TC actions flush action police" @@ -743,7 +743,7 @@ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m ok index 1", "expExitCode": "0", "verifyCmd": "$TC actions ls action police", - "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action pass", + "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst (1024Kb|1Mb) mtu 2Kb action pass", "matchCount": "1", "teardown": [ "$TC actions flush action police" @@ -770,7 +770,7 @@ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m reclassify index 1", "expExitCode": "0", "verifyCmd": "$TC actions get action police index 1", - "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action reclassify", + "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst (1024Kb|1Mb) mtu 2Kb action reclassify", "matchCount": "1", "teardown": [ "$TC actions flush action police" @@ -797,7 +797,7 @@ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m pipe index 1", "expExitCode": "0", "verifyCmd": "$TC actions ls action police", - "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action pipe", + "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst (1024Kb|1Mb) mtu 2Kb action pipe", "matchCount": "1", "teardown": [ "$TC actions flush action police" diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 9814b3a1c77d..f0eceb0faf34 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -7,6 +7,7 @@ * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com> */ #define _GNU_SOURCE +#include <sys/prctl.h> #include <sys/time.h> #include <sys/types.h> #include <stdio.h> @@ -599,14 +600,84 @@ static void check_overrun(int which, const char *name) "check_overrun %s\n", name); } +#include <sys/syscall.h> + +static int do_timer_create(int *id) +{ + return syscall(__NR_timer_create, CLOCK_MONOTONIC, NULL, id); +} + +static int do_timer_delete(int id) +{ + return syscall(__NR_timer_delete, id); +} + +#ifndef PR_TIMER_CREATE_RESTORE_IDS +# define PR_TIMER_CREATE_RESTORE_IDS 77 +# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0 +# define PR_TIMER_CREATE_RESTORE_IDS_ON 1 +# define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +#endif + +static void check_timer_create_exact(void) +{ + int id; + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON, 0, 0, 0)) { + switch (errno) { + case EINVAL: + ksft_test_result_skip("check timer create exact, not supported\n"); + return; + default: + ksft_test_result_skip("check timer create exact, errno = %d\n", errno); + return; + } + } + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 1) + fatal_error(NULL, "prctl(GET) failed\n"); + + id = 8; + if (do_timer_create(&id) < 0) + fatal_error(NULL, "timer_create()"); + + if (do_timer_delete(id)) + fatal_error(NULL, "timer_delete()"); + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF, 0, 0, 0)) + fatal_error(NULL, "prctl(OFF)"); + + if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_GET, 0, 0, 0) != 0) + fatal_error(NULL, "prctl(GET) failed\n"); + + if (id != 8) { + ksft_test_result_fail("check timer create exact %d != 8\n", id); + return; + } + + /* Validate that it went back to normal mode and allocates ID 9 */ + if (do_timer_create(&id) < 0) + fatal_error(NULL, "timer_create()"); + + if (do_timer_delete(id)) + fatal_error(NULL, "timer_delete()"); + + if (id == 9) + ksft_test_result_pass("check timer create exact\n"); + else + ksft_test_result_fail("check timer create exact. Disabling failed.\n"); +} + int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(18); + ksft_set_plan(19); ksft_print_msg("Testing posix timers. False negative may happen on CPU execution \n"); ksft_print_msg("based timers if other threads run on the CPU...\n"); + check_timer_create_exact(); + check_itimer(ITIMER_VIRTUAL, "ITIMER_VIRTUAL"); check_itimer(ITIMER_PROF, "ITIMER_PROF"); check_itimer(ITIMER_REAL, "ITIMER_REAL"); diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c index 83450145fe65..46c391d7f45d 100644 --- a/tools/testing/selftests/timers/skew_consistency.c +++ b/tools/testing/selftests/timers/skew_consistency.c @@ -47,7 +47,7 @@ int main(int argc, char **argv) pid = fork(); if (!pid) - return system("./inconsistency-check -c 1 -t 600"); + return system("./inconsistency-check -t 60"); ppm = 500; ret = 0; diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore new file mode 100644 index 000000000000..8b2871ea7751 --- /dev/null +++ b/tools/testing/selftests/ublk/.gitignore @@ -0,0 +1,3 @@ +kublk +/tools +*-verify.state diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile new file mode 100644 index 000000000000..7817afe29005 --- /dev/null +++ b/tools/testing/selftests/ublk/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) +LDLIBS += -lpthread -lm -luring + +TEST_PROGS := test_generic_01.sh + +TEST_PROGS += test_null_01.sh +TEST_PROGS += test_null_02.sh +TEST_PROGS += test_loop_01.sh +TEST_PROGS += test_loop_02.sh +TEST_PROGS += test_loop_03.sh +TEST_PROGS += test_loop_04.sh +TEST_PROGS += test_stripe_01.sh +TEST_PROGS += test_stripe_02.sh + +TEST_PROGS += test_stress_01.sh +TEST_PROGS += test_stress_02.sh + +TEST_GEN_PROGS_EXTENDED = kublk + +include ../lib.mk + +$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c + +check: + shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c new file mode 100644 index 000000000000..01580a6f8519 --- /dev/null +++ b/tools/testing/selftests/ublk/common.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +void backing_file_tgt_deinit(struct ublk_dev *dev) +{ + int i; + + for (i = 1; i < dev->nr_fds; i++) { + fsync(dev->fds[i]); + close(dev->fds[i]); + } +} + +int backing_file_tgt_init(struct ublk_dev *dev) +{ + int fd, i; + + assert(dev->nr_fds == 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + char *file = dev->tgt.backing_file[i]; + unsigned long bytes; + struct stat st; + + ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); + + fd = open(file, O_RDWR | O_DIRECT); + if (fd < 0) { + ublk_err("%s: backing file %s can't be opened: %s\n", + __func__, file, strerror(errno)); + return -EBADF; + } + + if (fstat(fd, &st) < 0) { + close(fd); + return -EBADF; + } + + if (S_ISREG(st.st_mode)) + bytes = st.st_size; + else if (S_ISBLK(st.st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) + return -1; + } else { + return -EINVAL; + } + + dev->tgt.backing_file_size[i] = bytes; + dev->fds[dev->nr_fds] = fd; + dev->nr_fds += 1; + } + + return 0; +} diff --git a/tools/testing/selftests/ublk/config b/tools/testing/selftests/ublk/config new file mode 100644 index 000000000000..592b0ba4d661 --- /dev/null +++ b/tools/testing/selftests/ublk/config @@ -0,0 +1 @@ +CONFIG_BLK_DEV_UBLK=m diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c new file mode 100644 index 000000000000..6f34eabfae97 --- /dev/null +++ b/tools/testing/selftests/ublk/file_backed.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int zc) +{ + unsigned ublk_op = ublksrv_get_op(iod); + + if (ublk_op == UBLK_IO_OP_READ) + return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; + else if (ublk_op == UBLK_IO_OP_WRITE) + return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; + assert(0); +} + +static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + struct io_uring_sqe *sqe[1]; + + ublk_queue_alloc_sqes(q, sqe, 1); + io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); + return 1; +} + +static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + int zc = ublk_queue_use_zc(q); + enum io_uring_op op = ublk_to_uring_op(iod, zc); + struct io_uring_sqe *sqe[3]; + + if (!zc) { + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) + return -ENOMEM; + + io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/, + (void *)iod->addr, + iod->nr_sectors << 9, + iod->start_sector << 9); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); + return 1; + } + + ublk_queue_alloc_sqes(q, sqe, 3); + + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + + io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0, + iod->nr_sectors << 9, + iod->start_sector << 9); + sqe[1]->buf_index = tag; + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); + + return 2; +} + +static int loop_queue_tgt_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + int ret; + + switch (ublk_op) { + case UBLK_IO_OP_FLUSH: + ret = loop_queue_flush_io(q, iod, tag); + break; + case UBLK_IO_OP_WRITE_ZEROES: + case UBLK_IO_OP_DISCARD: + ret = -ENOTSUP; + break; + case UBLK_IO_OP_READ: + case UBLK_IO_OP_WRITE: + ret = loop_queue_tgt_rw_io(q, iod, tag); + break; + default: + ret = -EINVAL; + break; + } + + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u\n", __func__, tag, + iod->op_flags, iod->start_sector, iod->nr_sectors << 9); + return ret; +} + +static int ublk_loop_queue_io(struct ublk_queue *q, int tag) +{ + int queued = loop_queue_tgt_io(q, tag); + + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +static void ublk_loop_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + if (!io->result) + io->result = cqe->res; + if (cqe->res < 0) + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } + + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + + if (ublk_completed_tgt_io(q, tag)) + ublk_complete_io(q, tag, io->result); +} + +static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + unsigned long long bytes; + int ret; + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, + .basic = { + .attrs = UBLK_ATTR_VOLATILE_CACHE, + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, + }, + .dma = { + .alignment = 511, + }, + }; + + ret = backing_file_tgt_init(dev); + if (ret) + return ret; + + if (dev->tgt.nr_backing_files != 1) + return -EINVAL; + + bytes = dev->tgt.backing_file_size[0]; + dev->tgt.dev_size = bytes; + p.basic.dev_sectors = bytes >> 9; + dev->tgt.params = p; + + return 0; +} + +const struct ublk_tgt_ops loop_tgt_ops = { + .name = "loop", + .init_tgt = ublk_loop_tgt_init, + .deinit_tgt = backing_file_tgt_deinit, + .queue_io = ublk_loop_queue_io, + .tgt_io_done = ublk_loop_io_done, +}; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c new file mode 100644 index 000000000000..05147b53c361 --- /dev/null +++ b/tools/testing/selftests/ublk/kublk.c @@ -0,0 +1,1138 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: uring_cmd based ublk + */ + +#include "kublk.h" + +unsigned int ublk_dbg_mask = UBLK_LOG; +static const struct ublk_tgt_ops *tgt_ops_list[] = { + &null_tgt_ops, + &loop_tgt_ops, + &stripe_tgt_ops, +}; + +static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) +{ + const struct ublk_tgt_ops *ops; + int i; + + if (name == NULL) + return NULL; + + for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++) + if (strcmp(tgt_ops_list[i]->name, name) == 0) + return tgt_ops_list[i]; + return NULL; +} + +static inline int ublk_setup_ring(struct io_uring *r, int depth, + int cq_depth, unsigned flags) +{ + struct io_uring_params p; + + memset(&p, 0, sizeof(p)); + p.flags = flags | IORING_SETUP_CQSIZE; + p.cq_entries = cq_depth; + + return io_uring_queue_init_params(depth, r, &p); +} + +static void ublk_ctrl_init_cmd(struct ublk_dev *dev, + struct io_uring_sqe *sqe, + struct ublk_ctrl_cmd_data *data) +{ + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe); + + sqe->fd = dev->ctrl_fd; + sqe->opcode = IORING_OP_URING_CMD; + sqe->ioprio = 0; + + if (data->flags & CTRL_CMD_HAS_BUF) { + cmd->addr = data->addr; + cmd->len = data->len; + } + + if (data->flags & CTRL_CMD_HAS_DATA) + cmd->data[0] = data->data[0]; + + cmd->dev_id = info->dev_id; + cmd->queue_id = -1; + + ublk_set_sqe_cmd_op(sqe, data->cmd_op); + + io_uring_sqe_set_data(sqe, cmd); +} + +static int __ublk_ctrl_cmd(struct ublk_dev *dev, + struct ublk_ctrl_cmd_data *data) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + int ret = -EINVAL; + + sqe = io_uring_get_sqe(&dev->ring); + if (!sqe) { + ublk_err("%s: can't get sqe ret %d\n", __func__, ret); + return ret; + } + + ublk_ctrl_init_cmd(dev, sqe, data); + + ret = io_uring_submit(&dev->ring); + if (ret < 0) { + ublk_err("uring submit ret %d\n", ret); + return ret; + } + + ret = io_uring_wait_cqe(&dev->ring, &cqe); + if (ret < 0) { + ublk_err("wait cqe: %s\n", strerror(-ret)); + return ret; + } + io_uring_cqe_seen(&dev->ring, cqe); + + return cqe->res; +} + +static int ublk_ctrl_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_CMD_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_start_dev(struct ublk_dev *dev, + int daemon_pid) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_START_DEV, + .flags = CTRL_CMD_HAS_DATA, + }; + + dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_add_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_ADD_DEV, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) &dev->dev_info, + .len = sizeof(struct ublksrv_ctrl_dev_info), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_del_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_DEL_DEV, + .flags = 0, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_info(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_DEV_INFO, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) &dev->dev_info, + .len = sizeof(struct ublksrv_ctrl_dev_info), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_set_params(struct ublk_dev *dev, + struct ublk_params *params) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_SET_PARAMS, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) params, + .len = sizeof(*params), + }; + params->len = sizeof(*params); + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_params(struct ublk_dev *dev, + struct ublk_params *params) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_CMD_GET_PARAMS, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64)params, + .len = sizeof(*params), + }; + + params->len = sizeof(*params); + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_features(struct ublk_dev *dev, + __u64 *features) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_FEATURES, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) features, + .len = sizeof(*features), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static const char *ublk_dev_state_desc(struct ublk_dev *dev) +{ + switch (dev->dev_info.state) { + case UBLK_S_DEV_DEAD: + return "DEAD"; + case UBLK_S_DEV_LIVE: + return "LIVE"; + case UBLK_S_DEV_QUIESCED: + return "QUIESCED"; + default: + return "UNKNOWN"; + }; +} + +static void ublk_ctrl_dump(struct ublk_dev *dev) +{ + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + struct ublk_params p; + int ret; + + ret = ublk_ctrl_get_params(dev, &p); + if (ret < 0) { + ublk_err("failed to get params %m\n"); + return; + } + + ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", + info->dev_id, info->nr_hw_queues, info->queue_depth, + 1 << p.basic.logical_bs_shift, p.basic.dev_sectors); + ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", + info->max_io_buf_bytes, info->ublksrv_pid, info->flags, + ublk_dev_state_desc(dev)); + fflush(stdout); +} + +static void ublk_ctrl_deinit(struct ublk_dev *dev) +{ + close(dev->ctrl_fd); + free(dev); +} + +static struct ublk_dev *ublk_ctrl_init(void) +{ + struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev)); + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + int ret; + + dev->ctrl_fd = open(CTRL_DEV, O_RDWR); + if (dev->ctrl_fd < 0) { + free(dev); + return NULL; + } + + info->max_io_buf_bytes = UBLK_IO_MAX_BYTES; + + ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH, + UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128); + if (ret < 0) { + ublk_err("queue_init: %s\n", strerror(-ret)); + free(dev); + return NULL; + } + dev->nr_fds = 1; + + return dev; +} + +static int __ublk_queue_cmd_buf_sz(unsigned depth) +{ + int size = depth * sizeof(struct ublksrv_io_desc); + unsigned int page_sz = getpagesize(); + + return round_up(size, page_sz); +} + +static int ublk_queue_max_cmd_buf_sz(void) +{ + return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH); +} + +static int ublk_queue_cmd_buf_sz(struct ublk_queue *q) +{ + return __ublk_queue_cmd_buf_sz(q->q_depth); +} + +static void ublk_queue_deinit(struct ublk_queue *q) +{ + int i; + int nr_ios = q->q_depth; + + io_uring_unregister_buffers(&q->ring); + + io_uring_unregister_ring_fd(&q->ring); + + if (q->ring.ring_fd > 0) { + io_uring_unregister_files(&q->ring); + close(q->ring.ring_fd); + q->ring.ring_fd = -1; + } + + if (q->io_cmd_buf) + munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); + + for (i = 0; i < nr_ios; i++) + free(q->ios[i].buf_addr); +} + +static int ublk_queue_init(struct ublk_queue *q) +{ + struct ublk_dev *dev = q->dev; + int depth = dev->dev_info.queue_depth; + int i, ret = -1; + int cmd_buf_size, io_buf_size; + unsigned long off; + int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; + + q->tgt_ops = dev->tgt.ops; + q->state = 0; + q->q_depth = depth; + q->cmd_inflight = 0; + q->tid = gettid(); + + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + q->state |= UBLKSRV_NO_BUF; + q->state |= UBLKSRV_ZC; + } + + cmd_buf_size = ublk_queue_cmd_buf_sz(q); + off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); + q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, + MAP_SHARED | MAP_POPULATE, dev->fds[0], off); + if (q->io_cmd_buf == MAP_FAILED) { + ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", + q->dev->dev_info.dev_id, q->q_id); + goto fail; + } + + io_buf_size = dev->dev_info.max_io_buf_bytes; + for (i = 0; i < q->q_depth; i++) { + q->ios[i].buf_addr = NULL; + q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; + + if (q->state & UBLKSRV_NO_BUF) + continue; + + if (posix_memalign((void **)&q->ios[i].buf_addr, + getpagesize(), io_buf_size)) { + ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n", + dev->dev_info.dev_id, q->q_id, i); + goto fail; + } + } + + ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth, + IORING_SETUP_COOP_TASKRUN); + if (ret < 0) { + ublk_err("ublk dev %d queue %d setup io_uring failed %d\n", + q->dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); + if (ret) { + ublk_err("ublk dev %d queue %d register spare buffers failed %d", + dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + } + + io_uring_register_ring_fd(&q->ring); + + ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); + if (ret) { + ublk_err("ublk dev %d queue %d register files failed %d\n", + q->dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + + return 0; + fail: + ublk_queue_deinit(q); + ublk_err("ublk dev %d queue %d failed\n", + dev->dev_info.dev_id, q->q_id); + return -ENOMEM; +} + +#define WAIT_USEC 100000 +#define MAX_WAIT_USEC (3 * 1000000) +static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + int dev_id = dev->dev_info.dev_id; + unsigned int wait_usec = 0; + int ret = 0, fd = -1; + char buf[64]; + + snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); + + while (wait_usec < MAX_WAIT_USEC) { + fd = open(buf, O_RDWR); + if (fd >= 0) + break; + usleep(WAIT_USEC); + wait_usec += WAIT_USEC; + } + if (fd < 0) { + ublk_err("can't open %s %s\n", buf, strerror(errno)); + return -1; + } + + dev->fds[0] = fd; + if (dev->tgt.ops->init_tgt) + ret = dev->tgt.ops->init_tgt(ctx, dev); + if (ret) + close(dev->fds[0]); + return ret; +} + +static void ublk_dev_unprep(struct ublk_dev *dev) +{ + if (dev->tgt.ops->deinit_tgt) + dev->tgt.ops->deinit_tgt(dev); + close(dev->fds[0]); +} + +int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) +{ + struct ublksrv_io_cmd *cmd; + struct io_uring_sqe *sqe[1]; + unsigned int cmd_op = 0; + __u64 user_data; + + /* only freed io can be issued */ + if (!(io->flags & UBLKSRV_IO_FREE)) + return 0; + + /* we issue because we need either fetching or committing */ + if (!(io->flags & + (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP))) + return 0; + + if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP) + cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; + else if (io->flags & UBLKSRV_NEED_FETCH_RQ) + cmd_op = UBLK_U_IO_FETCH_REQ; + + if (io_uring_sq_space_left(&q->ring) < 1) + io_uring_submit(&q->ring); + + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) { + ublk_err("%s: run out of sqe %d, tag %d\n", + __func__, q->q_id, tag); + return -1; + } + + cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]); + + if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) + cmd->result = io->result; + + /* These fields should be written once, never change */ + ublk_set_sqe_cmd_op(sqe[0], cmd_op); + sqe[0]->fd = 0; /* dev->fds[0] */ + sqe[0]->opcode = IORING_OP_URING_CMD; + sqe[0]->flags = IOSQE_FIXED_FILE; + sqe[0]->rw_flags = 0; + cmd->tag = tag; + cmd->q_id = q->q_id; + if (!(q->state & UBLKSRV_NO_BUF)) + cmd->addr = (__u64) (uintptr_t) io->buf_addr; + else + cmd->addr = 0; + + user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); + io_uring_sqe_set_data64(sqe[0], user_data); + + io->flags = 0; + + q->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n", + __func__, q->q_id, tag, cmd_op, + io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING)); + return 1; +} + +static void ublk_submit_fetch_commands(struct ublk_queue *q) +{ + int i = 0; + + for (i = 0; i < q->q_depth; i++) + ublk_queue_io_cmd(q, &q->ios[i], i); +} + +static int ublk_queue_is_idle(struct ublk_queue *q) +{ + return !io_uring_sq_ready(&q->ring) && !q->io_inflight; +} + +static int ublk_queue_is_done(struct ublk_queue *q) +{ + return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q); +} + +static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, + struct io_uring_cqe *cqe) +{ + unsigned tag = user_data_to_tag(cqe->user_data); + + if (cqe->res < 0 && cqe->res != -EAGAIN) + ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op %u\n", + __func__, cqe->res, q->q_id, + user_data_to_tag(cqe->user_data), + user_data_to_op(cqe->user_data)); + + if (q->tgt_ops->tgt_io_done) + q->tgt_ops->tgt_io_done(q, tag, cqe); +} + +static void ublk_handle_cqe(struct io_uring *r, + struct io_uring_cqe *cqe, void *data) +{ + struct ublk_queue *q = container_of(r, struct ublk_queue, ring); + unsigned tag = user_data_to_tag(cqe->user_data); + unsigned cmd_op = user_data_to_op(cqe->user_data); + int fetch = (cqe->res != UBLK_IO_RES_ABORT) && + !(q->state & UBLKSRV_QUEUE_STOPPING); + struct ublk_io *io; + + if (cqe->res < 0 && cqe->res != -ENODEV) + ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, + cqe->res, cqe->user_data, q->state); + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", + __func__, cqe->res, q->q_id, tag, cmd_op, + is_target_io(cqe->user_data), + user_data_to_tgt_data(cqe->user_data), + (q->state & UBLKSRV_QUEUE_STOPPING)); + + /* Don't retrieve io in case of target io */ + if (is_target_io(cqe->user_data)) { + ublksrv_handle_tgt_cqe(q, cqe); + return; + } + + io = &q->ios[tag]; + q->cmd_inflight--; + + if (!fetch) { + q->state |= UBLKSRV_QUEUE_STOPPING; + io->flags &= ~UBLKSRV_NEED_FETCH_RQ; + } + + if (cqe->res == UBLK_IO_RES_OK) { + assert(tag < q->q_depth); + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(q, tag); + } else { + /* + * COMMIT_REQ will be completed immediately since no fetching + * piggyback is required. + * + * Marking IO_FREE only, then this io won't be issued since + * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) + * + * */ + io->flags = UBLKSRV_IO_FREE; + } +} + +static int ublk_reap_events_uring(struct io_uring *r) +{ + struct io_uring_cqe *cqe; + unsigned head; + int count = 0; + + io_uring_for_each_cqe(r, head, cqe) { + ublk_handle_cqe(r, cqe, NULL); + count += 1; + } + io_uring_cq_advance(r, count); + + return count; +} + +static int ublk_process_io(struct ublk_queue *q) +{ + int ret, reapped; + + ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n", + q->dev->dev_info.dev_id, + q->q_id, io_uring_sq_ready(&q->ring), + q->cmd_inflight, + (q->state & UBLKSRV_QUEUE_STOPPING)); + + if (ublk_queue_is_done(q)) + return -ENODEV; + + ret = io_uring_submit_and_wait(&q->ring, 1); + reapped = ublk_reap_events_uring(&q->ring); + + ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n", + ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING), + (q->state & UBLKSRV_QUEUE_IDLE)); + + return reapped; +} + +static void *ublk_io_handler_fn(void *data) +{ + struct ublk_queue *q = data; + int dev_id = q->dev->dev_info.dev_id; + int ret; + + ret = ublk_queue_init(q); + if (ret) { + ublk_err("ublk dev %d queue %d init queue failed\n", + dev_id, q->q_id); + return NULL; + } + ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n", + q->tid, dev_id, q->q_id); + + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(q); + do { + if (ublk_process_io(q) < 0) + break; + } while (1); + + ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id); + ublk_queue_deinit(q); + return NULL; +} + +static void ublk_set_parameters(struct ublk_dev *dev) +{ + int ret; + + ret = ublk_ctrl_set_params(dev, &dev->tgt.params); + if (ret) + ublk_err("dev %d set basic parameter failed %d\n", + dev->dev_info.dev_id, ret); +} + +static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) +{ + uint64_t id; + int evtfd = ctx->_evtfd; + + if (evtfd < 0) + return -EBADF; + + if (dev_id >= 0) + id = dev_id + 1; + else + id = ERROR_EVTFD_DEVID; + + if (write(evtfd, &id, sizeof(id)) != sizeof(id)) + return -EINVAL; + + return 0; +} + + +static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + int ret, i; + void *thread_ret; + const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + + ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); + + ret = ublk_dev_prep(ctx, dev); + if (ret) + return ret; + + for (i = 0; i < dinfo->nr_hw_queues; i++) { + dev->q[i].dev = dev; + dev->q[i].q_id = i; + pthread_create(&dev->q[i].thread, NULL, + ublk_io_handler_fn, + &dev->q[i]); + } + + /* everything is fine now, start us */ + ublk_set_parameters(dev); + ret = ublk_ctrl_start_dev(dev, getpid()); + if (ret < 0) { + ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); + goto fail; + } + + ublk_ctrl_get_info(dev); + if (ctx->fg) + ublk_ctrl_dump(dev); + else + ublk_send_dev_event(ctx, dev->dev_info.dev_id); + + /* wait until we are terminated */ + for (i = 0; i < dinfo->nr_hw_queues; i++) + pthread_join(dev->q[i].thread, &thread_ret); + fail: + ublk_dev_unprep(dev); + ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); + + return ret; +} + +static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout) +{ +#define EV_SIZE (sizeof(struct inotify_event)) +#define EV_BUF_LEN (128 * (EV_SIZE + 16)) + struct pollfd pfd; + int fd, wd; + int ret = -EINVAL; + const char *dev_name = basename(path); + + fd = inotify_init(); + if (fd < 0) { + ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__); + return fd; + } + + wd = inotify_add_watch(fd, "/dev", evt_mask); + if (wd == -1) { + ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__); + goto fail; + } + + pfd.fd = fd; + pfd.events = POLL_IN; + while (1) { + int i = 0; + char buffer[EV_BUF_LEN]; + ret = poll(&pfd, 1, 1000 * timeout); + + if (ret == -1) { + ublk_err("%s: poll inotify failed: %d\n", __func__, ret); + goto rm_watch; + } else if (ret == 0) { + ublk_err("%s: poll inotify timeout\n", __func__); + ret = -ETIMEDOUT; + goto rm_watch; + } + + ret = read(fd, buffer, EV_BUF_LEN); + if (ret < 0) { + ublk_err("%s: read inotify fd failed\n", __func__); + goto rm_watch; + } + + while (i < ret) { + struct inotify_event *event = (struct inotify_event *)&buffer[i]; + + ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n", + __func__, event->mask, event->name); + if (event->mask & evt_mask) { + if (!strcmp(event->name, dev_name)) { + ret = 0; + goto rm_watch; + } + } + i += EV_SIZE + event->len; + } + } +rm_watch: + inotify_rm_watch(fd, wd); +fail: + close(fd); + return ret; +} + +static int ublk_stop_io_daemon(const struct ublk_dev *dev) +{ + int daemon_pid = dev->dev_info.ublksrv_pid; + int dev_id = dev->dev_info.dev_id; + char ublkc[64]; + int ret = 0; + + if (daemon_pid < 0) + return 0; + + /* daemon may be dead already */ + if (kill(daemon_pid, 0) < 0) + goto wait; + + snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id); + + /* ublk char device may be gone already */ + if (access(ublkc, F_OK) != 0) + goto wait; + + /* Wait until ublk char device is closed, when the daemon is shutdown */ + ret = wait_ublk_dev(ublkc, IN_CLOSE, 10); + /* double check and since it may be closed before starting inotify */ + if (ret == -ETIMEDOUT) + ret = kill(daemon_pid, 0) < 0; +wait: + waitpid(daemon_pid, NULL, 0); + ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n", + __func__, daemon_pid, dev_id, ret); + + return ret; +} + +static int __cmd_dev_add(const struct dev_ctx *ctx) +{ + unsigned nr_queues = ctx->nr_hw_queues; + const char *tgt_type = ctx->tgt_type; + unsigned depth = ctx->queue_depth; + __u64 features; + const struct ublk_tgt_ops *ops; + struct ublksrv_ctrl_dev_info *info; + struct ublk_dev *dev; + int dev_id = ctx->dev_id; + int ret, i; + + ops = ublk_find_tgt(tgt_type); + if (!ops) { + ublk_err("%s: no such tgt type, type %s\n", + __func__, tgt_type); + return -ENODEV; + } + + if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { + ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", + __func__, nr_queues, depth); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + if (!dev) { + ublk_err("%s: can't alloc dev id %d, type %s\n", + __func__, dev_id, tgt_type); + return -ENOMEM; + } + + /* kernel doesn't support get_features */ + ret = ublk_ctrl_get_features(dev, &features); + if (ret < 0) + return -EINVAL; + + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) + return -ENOTSUP; + + info = &dev->dev_info; + info->dev_id = ctx->dev_id; + info->nr_hw_queues = nr_queues; + info->queue_depth = depth; + info->flags = ctx->flags; + dev->tgt.ops = ops; + dev->tgt.sq_depth = depth; + dev->tgt.cq_depth = depth; + + for (i = 0; i < MAX_BACK_FILES; i++) { + if (ctx->files[i]) { + strcpy(dev->tgt.backing_file[i], ctx->files[i]); + dev->tgt.nr_backing_files++; + } + } + + ret = ublk_ctrl_add_dev(dev); + if (ret < 0) { + ublk_err("%s: can't add dev id %d, type %s ret %d\n", + __func__, dev_id, tgt_type, ret); + goto fail; + } + + ret = ublk_start_daemon(ctx, dev); + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + if (ret < 0) + ublk_ctrl_del_dev(dev); + +fail: + if (ret < 0) + ublk_send_dev_event(ctx, -1); + ublk_ctrl_deinit(dev); + return ret; +} + +static int __cmd_dev_list(struct dev_ctx *ctx); + +static int cmd_dev_add(struct dev_ctx *ctx) +{ + int res; + + if (ctx->fg) + goto run; + + ctx->_evtfd = eventfd(0, 0); + if (ctx->_evtfd < 0) { + ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); + exit(-1); + } + + setsid(); + res = fork(); + if (res == 0) { +run: + res = __cmd_dev_add(ctx); + return res; + } else if (res > 0) { + uint64_t id; + + res = read(ctx->_evtfd, &id, sizeof(id)); + close(ctx->_evtfd); + if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { + ctx->dev_id = id - 1; + return __cmd_dev_list(ctx); + } + exit(EXIT_FAILURE); + } else { + return res; + } +} + +static int __cmd_dev_del(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret); + + ret = ublk_stop_io_daemon(dev); + if (ret < 0) + ublk_err("%s: stop daemon id %d dev %d, ret %d\n", + __func__, dev->dev_info.ublksrv_pid, number, ret); + ublk_ctrl_del_dev(dev); +fail: + ublk_ctrl_deinit(dev); + + return (ret >= 0) ? 0 : ret; +} + +static int cmd_dev_del(struct dev_ctx *ctx) +{ + int i; + + if (ctx->dev_id >= 0 || !ctx->all) + return __cmd_dev_del(ctx); + + for (i = 0; i < 255; i++) { + ctx->dev_id = i; + __cmd_dev_del(ctx); + } + return 0; +} + +static int __cmd_dev_list(struct dev_ctx *ctx) +{ + struct ublk_dev *dev = ublk_ctrl_init(); + int ret; + + if (!dev) + return -ENODEV; + + dev->dev_info.dev_id = ctx->dev_id; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) { + if (ctx->logging) + ublk_err("%s: can't get dev info from %d: %d\n", + __func__, ctx->dev_id, ret); + } else { + ublk_ctrl_dump(dev); + } + + ublk_ctrl_deinit(dev); + + return ret; +} + +static int cmd_dev_list(struct dev_ctx *ctx) +{ + int i; + + if (ctx->dev_id >= 0 || !ctx->all) + return __cmd_dev_list(ctx); + + ctx->logging = false; + for (i = 0; i < 255; i++) { + ctx->dev_id = i; + __cmd_dev_list(ctx); + } + return 0; +} + +static int cmd_dev_get_features(void) +{ +#define const_ilog2(x) (63 - __builtin_clzll(x)) + static const char *feat_map[] = { + [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", + [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", + [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", + [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", + [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", + [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", + [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", + [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", + [const_ilog2(UBLK_F_ZONED)] = "ZONED", + [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", + }; + struct ublk_dev *dev; + __u64 features = 0; + int ret; + + dev = ublk_ctrl_init(); + if (!dev) { + fprintf(stderr, "ublksrv_ctrl_init failed id\n"); + return -EOPNOTSUPP; + } + + ret = ublk_ctrl_get_features(dev, &features); + if (!ret) { + int i; + + printf("ublk_drv features: 0x%llx\n", features); + + for (i = 0; i < sizeof(features) * 8; i++) { + const char *feat; + + if (!((1ULL << i) & features)) + continue; + if (i < sizeof(feat_map) / sizeof(feat_map[0])) + feat = feat_map[i]; + else + feat = "unknown"; + printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); + } + } + + return ret; +} + +static int cmd_dev_help(char *exe) +{ + printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe); + printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n"); + printf("%s del [-n dev_id] -a \n", exe); + printf("\t -a delete all devices -n delete specified device\n"); + printf("%s list [-n dev_id] -a \n", exe); + printf("\t -a list all devices, -n list specified device, default -a \n"); + printf("%s features\n", exe); + return 0; +} + +int main(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "all", 0, NULL, 'a' }, + { "type", 1, NULL, 't' }, + { "number", 1, NULL, 'n' }, + { "queues", 1, NULL, 'q' }, + { "depth", 1, NULL, 'd' }, + { "debug_mask", 1, NULL, 0 }, + { "quiet", 0, NULL, 0 }, + { "zero_copy", 0, NULL, 'z' }, + { "foreground", 0, NULL, 0 }, + { "chunk_size", 1, NULL, 0 }, + { 0, 0, 0, 0 } + }; + int option_idx, opt; + const char *cmd = argv[1]; + struct dev_ctx ctx = { + .queue_depth = 128, + .nr_hw_queues = 2, + .dev_id = -1, + .tgt_type = "unknown", + .chunk_size = 65536, /* def chunk size is 64K */ + }; + int ret = -EINVAL, i; + + if (argc == 1) + return ret; + + optind = 2; + while ((opt = getopt_long(argc, argv, "t:n:d:q:az", + longopts, &option_idx)) != -1) { + switch (opt) { + case 'a': + ctx.all = 1; + break; + case 'n': + ctx.dev_id = strtol(optarg, NULL, 10); + break; + case 't': + if (strlen(optarg) < sizeof(ctx.tgt_type)) + strcpy(ctx.tgt_type, optarg); + break; + case 'q': + ctx.nr_hw_queues = strtol(optarg, NULL, 10); + break; + case 'd': + ctx.queue_depth = strtol(optarg, NULL, 10); + break; + case 'z': + ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; + break; + case 0: + if (!strcmp(longopts[option_idx].name, "debug_mask")) + ublk_dbg_mask = strtol(optarg, NULL, 16); + if (!strcmp(longopts[option_idx].name, "quiet")) + ublk_dbg_mask = 0; + if (!strcmp(longopts[option_idx].name, "foreground")) + ctx.fg = 1; + if (!strcmp(longopts[option_idx].name, "chunk_size")) + ctx.chunk_size = strtol(optarg, NULL, 10); + } + } + + i = optind; + while (i < argc && ctx.nr_files < MAX_BACK_FILES) { + ctx.files[ctx.nr_files++] = argv[i++]; + } + + if (!strcmp(cmd, "add")) + ret = cmd_dev_add(&ctx); + else if (!strcmp(cmd, "del")) + ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "list")) { + ctx.all = 1; + ret = cmd_dev_list(&ctx); + } else if (!strcmp(cmd, "help")) + ret = cmd_dev_help(argv[0]); + else if (!strcmp(cmd, "features")) + ret = cmd_dev_get_features(); + else + cmd_dev_help(argv[0]); + + return ret; +} diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h new file mode 100644 index 000000000000..f31a5c4d4143 --- /dev/null +++ b/tools/testing/selftests/ublk/kublk.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef KUBLK_INTERNAL_H +#define KUBLK_INTERNAL_H + +#include <unistd.h> +#include <stdlib.h> +#include <assert.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <pthread.h> +#include <getopt.h> +#include <limits.h> +#include <poll.h> +#include <fcntl.h> +#include <sys/syscall.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/inotify.h> +#include <sys/wait.h> +#include <sys/eventfd.h> +#include <sys/uio.h> +#include <liburing.h> +#include <linux/ublk_cmd.h> +#include "ublk_dep.h" + +#define __maybe_unused __attribute__((unused)) +#define MAX_BACK_FILES 4 +#ifndef min +#define min(a, b) ((a) < (b) ? (a) : (b)) +#endif + +/****************** part 1: libublk ********************/ + +#define CTRL_DEV "/dev/ublk-control" +#define UBLKC_DEV "/dev/ublkc" +#define UBLKB_DEV "/dev/ublkb" +#define UBLK_CTRL_RING_DEPTH 32 +#define ERROR_EVTFD_DEVID -2 + +/* queue idle timeout */ +#define UBLKSRV_IO_IDLE_SECS 20 + +#define UBLK_IO_MAX_BYTES (1 << 20) +#define UBLK_MAX_QUEUES 4 +#define UBLK_QUEUE_DEPTH 128 + +#define UBLK_DBG_DEV (1U << 0) +#define UBLK_DBG_QUEUE (1U << 1) +#define UBLK_DBG_IO_CMD (1U << 2) +#define UBLK_DBG_IO (1U << 3) +#define UBLK_DBG_CTRL_CMD (1U << 4) +#define UBLK_LOG (1U << 5) + +struct ublk_dev; +struct ublk_queue; + +struct dev_ctx { + char tgt_type[16]; + unsigned long flags; + unsigned nr_hw_queues; + unsigned queue_depth; + int dev_id; + int nr_files; + char *files[MAX_BACK_FILES]; + unsigned int logging:1; + unsigned int all:1; + unsigned int fg:1; + + /* stripe */ + unsigned int chunk_size; + + int _evtfd; +}; + +struct ublk_ctrl_cmd_data { + __u32 cmd_op; +#define CTRL_CMD_HAS_DATA 1 +#define CTRL_CMD_HAS_BUF 2 + __u32 flags; + + __u64 data[2]; + __u64 addr; + __u32 len; +}; + +struct ublk_io { + char *buf_addr; + +#define UBLKSRV_NEED_FETCH_RQ (1UL << 0) +#define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1) +#define UBLKSRV_IO_FREE (1UL << 2) + unsigned short flags; + unsigned short refs; /* used by target code only */ + + int result; + + unsigned short tgt_ios; + void *private_data; +}; + +struct ublk_tgt_ops { + const char *name; + int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); + void (*deinit_tgt)(struct ublk_dev *); + + int (*queue_io)(struct ublk_queue *, int tag); + void (*tgt_io_done)(struct ublk_queue *, + int tag, const struct io_uring_cqe *); +}; + +struct ublk_tgt { + unsigned long dev_size; + unsigned int sq_depth; + unsigned int cq_depth; + const struct ublk_tgt_ops *ops; + struct ublk_params params; + + int nr_backing_files; + unsigned long backing_file_size[MAX_BACK_FILES]; + char backing_file[MAX_BACK_FILES][PATH_MAX]; +}; + +struct ublk_queue { + int q_id; + int q_depth; + unsigned int cmd_inflight; + unsigned int io_inflight; + struct ublk_dev *dev; + const struct ublk_tgt_ops *tgt_ops; + char *io_cmd_buf; + struct io_uring ring; + struct ublk_io ios[UBLK_QUEUE_DEPTH]; +#define UBLKSRV_QUEUE_STOPPING (1U << 0) +#define UBLKSRV_QUEUE_IDLE (1U << 1) +#define UBLKSRV_NO_BUF (1U << 2) +#define UBLKSRV_ZC (1U << 3) + unsigned state; + pid_t tid; + pthread_t thread; +}; + +struct ublk_dev { + struct ublk_tgt tgt; + struct ublksrv_ctrl_dev_info dev_info; + struct ublk_queue q[UBLK_MAX_QUEUES]; + + int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ + int nr_fds; + int ctrl_fd; + struct io_uring ring; + + void *private_data; +}; + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + unsigned long __mptr = (unsigned long)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); }) +#endif + +#define round_up(val, rnd) \ + (((val) + ((rnd) - 1)) & ~((rnd) - 1)) + + +extern unsigned int ublk_dbg_mask; +extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag); + +static inline int is_target_io(__u64 user_data) +{ + return (user_data & (1ULL << 63)) != 0; +} + +static inline __u64 build_user_data(unsigned tag, unsigned op, + unsigned tgt_data, unsigned is_target_io) +{ + assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16)); + + return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63; +} + +static inline unsigned int user_data_to_tag(__u64 user_data) +{ + return user_data & 0xffff; +} + +static inline unsigned int user_data_to_op(__u64 user_data) +{ + return (user_data >> 16) & 0xff; +} + +static inline unsigned int user_data_to_tgt_data(__u64 user_data) +{ + return (user_data >> 24) & 0xffff; +} + +static inline unsigned short ublk_cmd_op_nr(unsigned int op) +{ + return _IOC_NR(op); +} + +static inline void ublk_err(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); +} + +static inline void ublk_log(const char *fmt, ...) +{ + if (ublk_dbg_mask & UBLK_LOG) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline void ublk_dbg(int level, const char *fmt, ...) +{ + if (level & ublk_dbg_mask) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline int ublk_queue_alloc_sqes(struct ublk_queue *q, + struct io_uring_sqe *sqes[], int nr_sqes) +{ + unsigned left = io_uring_sq_space_left(&q->ring); + int i; + + if (left < nr_sqes) + io_uring_submit(&q->ring); + + for (i = 0; i < nr_sqes; i++) { + sqes[i] = io_uring_get_sqe(&q->ring); + if (!sqes[i]) + return i; + } + + return nr_sqes; +} + +static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + +static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + +static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return (void *)&sqe->cmd; +} + +static inline void ublk_set_io_res(struct ublk_queue *q, int tag, int res) +{ + q->ios[tag].result = res; +} + +static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) +{ + return q->ios[tag].result; +} + +static inline void ublk_mark_io_done(struct ublk_io *io, int res) +{ + io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); + io->result = res; +} + +static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) +{ + return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); +} + +static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) +{ + __u32 *addr = (__u32 *)&sqe->off; + + addr[0] = cmd_op; + addr[1] = 0; +} + +static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) +{ + return &q->ios[tag]; +} + +static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) +{ + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + + return ublk_queue_io_cmd(q, io, tag); +} + +static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + q->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + +static inline int ublk_completed_tgt_io(struct ublk_queue *q, unsigned tag) +{ + struct ublk_io *io = ublk_get_io(q, tag); + + q->io_inflight--; + + return --io->tgt_ios == 0; +} + +static inline int ublk_queue_use_zc(const struct ublk_queue *q) +{ + return q->state & UBLKSRV_ZC; +} + +extern const struct ublk_tgt_ops null_tgt_ops; +extern const struct ublk_tgt_ops loop_tgt_ops; +extern const struct ublk_tgt_ops stripe_tgt_ops; + +void backing_file_tgt_deinit(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev); + +static inline unsigned int ilog2(unsigned int x) +{ + if (x == 0) + return 0; + return (sizeof(x) * 8 - 1) - __builtin_clz(x); +} +#endif diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c new file mode 100644 index 000000000000..899875ff50fe --- /dev/null +++ b/tools/testing/selftests/ublk/null.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "kublk.h" + +#ifndef IORING_NOP_INJECT_RESULT +#define IORING_NOP_INJECT_RESULT (1U << 0) +#endif + +#ifndef IORING_NOP_FIXED_BUFFER +#define IORING_NOP_FIXED_BUFFER (1U << 3) +#endif + +static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + unsigned long dev_size = 250UL << 30; + + dev->tgt.dev_size = dev_size; + dev->tgt.params = (struct ublk_params) { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = info->max_io_buf_bytes >> 9, + .dev_sectors = dev_size >> 9, + }, + }; + + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) + dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; + return 0; +} + +static int null_queue_zc_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + struct io_uring_sqe *sqe[3]; + + ublk_queue_alloc_sqes(q, sqe, 3); + + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + + io_uring_prep_nop(sqe[1]); + sqe[1]->buf_index = tag; + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; + sqe[1]->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; + sqe[1]->len = iod->nr_sectors << 9; /* injected result */ + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); + + // buf register is marked as IOSQE_CQE_SKIP_SUCCESS + return 2; +} + +static void ublk_null_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + if (!io->result) + io->result = cqe->res; + if (cqe->res < 0) + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } + + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + + if (ublk_completed_tgt_io(q, tag)) + ublk_complete_io(q, tag, io->result); +} + +static int ublk_null_queue_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + int zc = ublk_queue_use_zc(q); + int queued; + + if (!zc) { + ublk_complete_io(q, tag, iod->nr_sectors << 9); + return 0; + } + + queued = null_queue_zc_io(q, tag); + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +const struct ublk_tgt_ops null_tgt_ops = { + .name = "null", + .init_tgt = ublk_null_tgt_init, + .queue_io = ublk_null_queue_io, + .tgt_io_done = ublk_null_io_done, +}; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c new file mode 100644 index 000000000000..98c564b12f3c --- /dev/null +++ b/tools/testing/selftests/ublk/stripe.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +#define NR_STRIPE MAX_BACK_FILES + +struct stripe_conf { + unsigned nr_files; + unsigned shift; +}; + +struct stripe { + loff_t start; + unsigned nr_sects; + int seq; + + struct iovec *vec; + unsigned nr_vec; + unsigned cap; +}; + +struct stripe_array { + struct stripe s[NR_STRIPE]; + unsigned nr; + struct iovec _vec[]; +}; + +static inline const struct stripe_conf *get_chunk_shift(const struct ublk_queue *q) +{ + return (struct stripe_conf *)q->dev->private_data; +} + +static inline unsigned calculate_nr_vec(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod) +{ + const unsigned shift = conf->shift - 9; + const unsigned unit_sects = conf->nr_files << shift; + loff_t start = iod->start_sector; + loff_t end = start + iod->nr_sectors; + + return (end / unit_sects) - (start / unit_sects) + 1; +} + +static struct stripe_array *alloc_stripe_array(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod) +{ + unsigned nr_vecs = calculate_nr_vec(conf, iod); + unsigned total = nr_vecs * conf->nr_files; + struct stripe_array *s; + int i; + + s = malloc(sizeof(*s) + total * sizeof(struct iovec)); + + s->nr = 0; + for (i = 0; i < conf->nr_files; i++) { + struct stripe *t = &s->s[i]; + + t->nr_vec = 0; + t->vec = &s->_vec[i * nr_vecs]; + t->nr_sects = 0; + t->cap = nr_vecs; + } + + return s; +} + +static void free_stripe_array(struct stripe_array *s) +{ + free(s); +} + +static void calculate_stripe_array(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod, struct stripe_array *s) +{ + const unsigned shift = conf->shift - 9; + const unsigned chunk_sects = 1 << shift; + const unsigned unit_sects = conf->nr_files << shift; + off64_t start = iod->start_sector; + off64_t end = start + iod->nr_sectors; + unsigned long done = 0; + unsigned idx = 0; + + while (start < end) { + unsigned nr_sects = chunk_sects - (start & (chunk_sects - 1)); + loff_t unit_off = (start / unit_sects) * unit_sects; + unsigned seq = (start - unit_off) >> shift; + struct stripe *this = &s->s[idx]; + loff_t stripe_off = (unit_off / conf->nr_files) + + (start & (chunk_sects - 1)); + + if (nr_sects > end - start) + nr_sects = end - start; + if (this->nr_sects == 0) { + this->nr_sects = nr_sects; + this->start = stripe_off; + this->seq = seq; + s->nr += 1; + } else { + assert(seq == this->seq); + assert(this->start + this->nr_sects == stripe_off); + this->nr_sects += nr_sects; + } + + assert(this->nr_vec < this->cap); + this->vec[this->nr_vec].iov_base = (void *)(iod->addr + done); + this->vec[this->nr_vec++].iov_len = nr_sects << 9; + + start += nr_sects; + done += nr_sects << 9; + idx = (idx + 1) % conf->nr_files; + } +} + +static inline enum io_uring_op stripe_to_uring_op(const struct ublksrv_io_desc *iod) +{ + unsigned ublk_op = ublksrv_get_op(iod); + + if (ublk_op == UBLK_IO_OP_READ) + return IORING_OP_READV; + else if (ublk_op == UBLK_IO_OP_WRITE) + return IORING_OP_WRITEV; + assert(0); +} + +static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + const struct stripe_conf *conf = get_chunk_shift(q); + enum io_uring_op op = stripe_to_uring_op(iod); + struct io_uring_sqe *sqe[NR_STRIPE]; + struct stripe_array *s = alloc_stripe_array(conf, iod); + struct ublk_io *io = ublk_get_io(q, tag); + int i; + + io->private_data = s; + calculate_stripe_array(conf, iod, s); + + ublk_queue_alloc_sqes(q, sqe, s->nr); + for (i = 0; i < s->nr; i++) { + struct stripe *t = &s->s[i]; + + io_uring_prep_rw(op, sqe[i], + t->seq + 1, + (void *)t->vec, + t->nr_vec, + t->start << 9); + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i, 1); + } + return s->nr; +} + +static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + const struct stripe_conf *conf = get_chunk_shift(q); + struct io_uring_sqe *sqe[NR_STRIPE]; + int i; + + ublk_queue_alloc_sqes(q, sqe, conf->nr_files); + for (i = 0; i < conf->nr_files; i++) { + io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1); + } + return conf->nr_files; +} + +static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + int ret = 0; + + switch (ublk_op) { + case UBLK_IO_OP_FLUSH: + ret = handle_flush(q, iod, tag); + break; + case UBLK_IO_OP_WRITE_ZEROES: + case UBLK_IO_OP_DISCARD: + ret = -ENOTSUP; + break; + case UBLK_IO_OP_READ: + case UBLK_IO_OP_WRITE: + ret = stripe_queue_tgt_rw_io(q, iod, tag); + break; + default: + ret = -EINVAL; + break; + } + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u ret %d\n", __func__, tag, + iod->op_flags, iod->start_sector, iod->nr_sectors << 9, ret); + return ret; +} + +static int ublk_stripe_queue_io(struct ublk_queue *q, int tag) +{ + int queued = stripe_queue_tgt_io(q, tag); + + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +static void ublk_stripe_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + int res = cqe->res; + + if (res < 0) { + if (!io->result) + io->result = res; + ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); + } + + /* fail short READ/WRITE simply */ + if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) { + unsigned seq = user_data_to_tgt_data(cqe->user_data); + struct stripe_array *s = io->private_data; + + if (res < s->s[seq].vec->iov_len) + io->result = -EIO; + } + + if (ublk_completed_tgt_io(q, tag)) { + int res = io->result; + + if (!res) + res = iod->nr_sectors << 9; + + ublk_complete_io(q, tag, res); + + free_stripe_array(io->private_data); + io->private_data = NULL; + } +} + +static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .attrs = UBLK_ATTR_VOLATILE_CACHE, + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, + }, + }; + unsigned chunk_size = ctx->chunk_size; + struct stripe_conf *conf; + unsigned chunk_shift; + loff_t bytes = 0; + int ret, i; + + if ((chunk_size & (chunk_size - 1)) || !chunk_size) { + ublk_err("invalid chunk size %u\n", chunk_size); + return -EINVAL; + } + + if (chunk_size < 4096 || chunk_size > 512 * 1024) { + ublk_err("invalid chunk size %u\n", chunk_size); + return -EINVAL; + } + + chunk_shift = ilog2(chunk_size); + + ret = backing_file_tgt_init(dev); + if (ret) + return ret; + + if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) + return -EINVAL; + + assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) + dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + unsigned long size = dev->tgt.backing_file_size[i]; + + if (size != dev->tgt.backing_file_size[0]) + return -EINVAL; + bytes += size; + } + + conf = malloc(sizeof(*conf)); + conf->shift = chunk_shift; + conf->nr_files = dev->tgt.nr_backing_files; + + dev->private_data = conf; + dev->tgt.dev_size = bytes; + p.basic.dev_sectors = bytes >> 9; + dev->tgt.params = p; + dev->tgt.sq_depth = dev->dev_info.queue_depth * conf->nr_files; + dev->tgt.cq_depth = dev->dev_info.queue_depth * conf->nr_files; + + printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files); + + return 0; +} + +static void ublk_stripe_tgt_deinit(struct ublk_dev *dev) +{ + free(dev->private_data); + backing_file_tgt_deinit(dev); +} + +const struct ublk_tgt_ops stripe_tgt_ops = { + .name = "stripe", + .init_tgt = ublk_stripe_tgt_init, + .deinit_tgt = ublk_stripe_tgt_deinit, + .queue_io = ublk_stripe_queue_io, + .tgt_io_done = ublk_stripe_io_done, +}; diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh new file mode 100755 index 000000000000..75f54ac6b1c4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_common.sh @@ -0,0 +1,246 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +UBLK_SKIP_CODE=4 + +_have_program() { + if command -v "$1" >/dev/null 2>&1; then + return 0 + fi + return 1 +} + +_get_disk_dev_t() { + local dev_id=$1 + local dev + local major + local minor + + dev=/dev/ublkb"${dev_id}" + major=$(stat -c '%Hr' "$dev") + minor=$(stat -c '%Lr' "$dev") + + echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) )) +} + +_create_backfile() { + local my_size=$1 + local my_file + + my_file=$(mktemp ublk_file_"${my_size}"_XXXXX) + truncate -s "${my_size}" "${my_file}" + echo "$my_file" +} + +_remove_backfile() { + local file=$1 + + [ -f "$file" ] && rm -f "$file" +} + +_create_tmp_dir() { + local my_file; + + my_file=$(mktemp -d ublk_dir_XXXXX) + echo "$my_file" +} + +_remove_tmp_dir() { + local dir=$1 + + [ -d "$dir" ] && rmdir "$dir" +} + +_mkfs_mount_test() +{ + local dev=$1 + local err_code=0 + local mnt_dir; + + mnt_dir=$(_create_tmp_dir) + mkfs.ext4 -F "$dev" > /dev/null 2>&1 + err_code=$? + if [ $err_code -ne 0 ]; then + return $err_code + fi + + mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1 + umount "$dev" + err_code=$? + _remove_tmp_dir "$mnt_dir" + if [ $err_code -ne 0 ]; then + return $err_code + fi +} + +_check_root() { + local ksft_skip=4 + + if [ $UID != 0 ]; then + echo please run this as root >&2 + exit $ksft_skip + fi +} + +_remove_ublk_devices() { + ${UBLK_PROG} del -a + modprobe -r ublk_drv > /dev/null 2>&1 +} + +_get_ublk_dev_state() { + ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' +} + +_get_ublk_daemon_pid() { + ${UBLK_PROG} list -n "$1" | grep "pid" | awk '{print $7}' +} + +_prep_test() { + _check_root + local type=$1 + shift 1 + modprobe ublk_drv > /dev/null 2>&1 + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" +} + +_remove_test_files() +{ + local files=$* + + for file in ${files}; do + [ -f "${file}" ] && rm -f "${file}" + done +} + +_show_result() +{ + if [ "$UBLK_TEST_SHOW_RESULT" -ne 0 ]; then + if [ "$2" -eq 0 ]; then + echo "$1 : [PASS]" + elif [ "$2" -eq 4 ]; then + echo "$1 : [SKIP]" + else + echo "$1 : [FAIL]" + fi + fi + [ "$2" -ne 0 ] && exit "$2" + return 0 +} + +# don't call from sub-shell, otherwise can't exit +_check_add_dev() +{ + local tid=$1 + local code=$2 + shift 2 + if [ "${code}" -ne 0 ]; then + _remove_test_files "$@" + _show_result "${tid}" "${code}" + fi +} + +_cleanup_test() { + "${UBLK_PROG}" del -a + rm -f "$UBLK_TMP" +} + +_have_feature() +{ + if $UBLK_PROG "features" | grep "$1" > /dev/null 2>&1; then + return 0 + fi + return 1 +} + +_add_ublk_dev() { + local kublk_temp; + local dev_id; + + if [ ! -c /dev/ublk-control ]; then + return ${UBLK_SKIP_CODE} + fi + if echo "$@" | grep -q "\-z"; then + if ! _have_feature "ZERO_COPY"; then + return ${UBLK_SKIP_CODE} + fi + fi + + kublk_temp=$(mktemp /tmp/kublk-XXXXXX) + if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then + echo "fail to add ublk dev $*" + rm -f "${kublk_temp}" + return 255 + fi + + dev_id=$(grep "dev id" "${kublk_temp}" | awk -F '[ :]' '{print $3}') + udevadm settle + rm -f "${kublk_temp}" + echo "${dev_id}" +} + +# kill the ublk daemon and return ublk device state +__ublk_kill_daemon() +{ + local dev_id=$1 + local exp_state=$2 + local daemon_pid + local state + + daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") + state=$(_get_ublk_dev_state "${dev_id}") + + for ((j=0;j<50;j++)); do + [ "$state" == "$exp_state" ] && break + kill -9 "$daemon_pid" > /dev/null 2>&1 + sleep 1 + state=$(_get_ublk_dev_state "${dev_id}") + done + echo "$state" +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + ${UBLK_PROG} del -n "${dev_id}" + local res=$? + udevadm settle + return ${res} +} + +__run_io_and_remove() +{ + local dev_id=$1 + local size=$2 + local kill_server=$3 + + fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \ + --rw=readwrite --iodepth=64 --size="${size}" --numjobs=4 \ + --runtime=20 --time_based > /dev/null 2>&1 & + sleep 2 + if [ "${kill_server}" = "yes" ]; then + local state + state=$(__ublk_kill_daemon "${dev_id}" "DEAD") + if [ "$state" != "DEAD" ]; then + echo "device isn't dead($state) after killing daemon" + return 255 + fi + fi + if ! __remove_ublk_dev_return "${dev_id}"; then + echo "delete dev ${dev_id} failed" + return 255 + fi + wait +} + +_ublk_test_top_dir() +{ + cd "$(dirname "$0")" && pwd +} + +UBLK_TMP=$(mktemp ublk_test_XXXXX) +UBLK_PROG=$(_ublk_test_top_dir)/kublk +UBLK_TEST_QUIET=1 +UBLK_TEST_SHOW_RESULT=1 +export UBLK_PROG +export UBLK_TEST_QUIET +export UBLK_TEST_SHOW_RESULT diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh new file mode 100755 index 000000000000..9227a208ba53 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_01" +ERR_CODE=0 + +if ! _have_program bpftrace; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "null" "sequential io order" + +dev_id=$(_add_ublk_dev -t null) +_check_add_dev $TID $? + +dev_t=$(_get_disk_dev_t "$dev_id") +bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & +btrace_pid=$! +sleep 2 + +if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# run fio over this ublk disk +fio --name=write_seq \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? +kill "$btrace_pid" +wait +if grep -q "io_out_of_order" "$UBLK_TMP"; then + cat "$UBLK_TMP" + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh new file mode 100755 index 000000000000..c882d2a08e13 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_01" +ERR_CODE=0 + +_prep_test "loop" "write and verify test" + +backfile_0=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -t loop "$backfile_0") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=256M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh new file mode 100755 index 000000000000..03863d825e07 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_02" +ERR_CODE=0 + +_prep_test "loop" "mkfs & mount & umount" + +backfile_0=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t loop "$backfile_0") +_check_add_dev $TID $? "$backfile_0" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh new file mode 100755 index 000000000000..269c96787d7d --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_03" +ERR_CODE=0 + +_prep_test "loop" "write and verify over zero copy" + +backfile_0=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") +_check_add_dev $TID $? "$backfile_0" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=64 \ + --rw=write \ + --size=256M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh new file mode 100755 index 000000000000..1435422c38ec --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_04" +ERR_CODE=0 + +_prep_test "loop" "mkfs & mount & umount with zero copy" + +backfile_0=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") +_check_add_dev $TID $? "$backfile_0" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh new file mode 100755 index 000000000000..a34203f72668 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="null_01" +ERR_CODE=0 + +_prep_test "null" "basic IO test" + +dev_id=$(_add_ublk_dev -t null) +_check_add_dev $TID $? + +# run fio over the two disks +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "null" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh new file mode 100755 index 000000000000..5633ca876655 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="null_02" +ERR_CODE=0 + +_prep_test "null" "basic IO test with zero copy" + +dev_id=$(_add_ublk_dev -t null -z) +_check_add_dev $TID $? + +# run fio over the two disks +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "null" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh new file mode 100755 index 000000000000..7177f6c57bc5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_01" +ERR_CODE=0 +DEV_ID=-1 + +ublk_io_and_remove() +{ + local size=$1 + shift 1 + local backfile="" + if echo "$@" | grep -q "loop"; then + backfile=${*: -1} + fi + DEV_ID=$(_add_ublk_dev "$@") + _check_add_dev $TID $? "${backfile}" + + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)" + if ! __run_io_and_remove "${DEV_ID}" "${size}" "no"; then + echo "/dev/ublkc${DEV_ID} isn't removed" + _remove_backfile "${backfile}" + exit 255 + fi +} + +_prep_test "stress" "run IO and remove device" + +ublk_io_and_remove 8G -t null +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +BACK_FILE=$(_create_backfile 256M) +ublk_io_and_remove 256M -t loop "${BACK_FILE}" +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +ublk_io_and_remove 256M -t loop -z "${BACK_FILE}" +ERR_CODE=$? +_cleanup_test "stress" +_remove_backfile "${BACK_FILE}" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh new file mode 100755 index 000000000000..2a8e60579a06 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_02" +ERR_CODE=0 +DEV_ID=-1 + +ublk_io_and_kill_daemon() +{ + local size=$1 + shift 1 + local backfile="" + if echo "$@" | grep -q "loop"; then + backfile=${*: -1} + fi + DEV_ID=$(_add_ublk_dev "$@") + _check_add_dev $TID $? "${backfile}" + + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs kill ublk server(ublk add $*)" + if ! __run_io_and_remove "${DEV_ID}" "${size}" "yes"; then + echo "/dev/ublkc${DEV_ID} isn't removed res ${res}" + _remove_backfile "${backfile}" + exit 255 + fi +} + +_prep_test "stress" "run IO and kill ublk server" + +ublk_io_and_kill_daemon 8G -t null +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +BACK_FILE=$(_create_backfile 256M) +ublk_io_and_kill_daemon 256M -t loop "${BACK_FILE}" +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +ublk_io_and_kill_daemon 256M -t loop -z "${BACK_FILE}" +ERR_CODE=$? +_cleanup_test "stress" +_remove_backfile "${BACK_FILE}" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh new file mode 100755 index 000000000000..c01f3dc325ab --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_01" +ERR_CODE=0 + +_prep_test "stripe" "write and verify test" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=32 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh new file mode 100755 index 000000000000..e8a45fa82dde --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_02" +ERR_CODE=0 + +_prep_test "stripe" "mkfs & mount & umount" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "$backfile_0" "$backfile_1" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt new file mode 100644 index 000000000000..272ac54c9d5f --- /dev/null +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -0,0 +1,25 @@ +/* + $1: dev_t + $2: RWBS + $3: strlen($2) +*/ +BEGIN { + @last_rw[$1, str($2)] = 0; +} +tracepoint:block:block_rq_complete +{ + $dev = $1; + if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { + $last = @last_rw[$dev, str($2)]; + if ((uint64)args.sector != $last) { + printf("io_out_of_order: exp %llu actual %llu\n", + args.sector, $last); + } + @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + } + @ios = count(); +} + +END { + clear(@last_rw); +} diff --git a/tools/testing/selftests/ublk/ublk_dep.h b/tools/testing/selftests/ublk/ublk_dep.h new file mode 100644 index 000000000000..f68fa7eac939 --- /dev/null +++ b/tools/testing/selftests/ublk/ublk_dep.h @@ -0,0 +1,18 @@ +#ifndef UBLK_DEP_H +#define UBLK_DEP_H + +#ifndef UBLK_U_IO_REGISTER_IO_BUF +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) +#endif + +#ifndef UBLK_F_USER_RECOVERY_FAIL_IO +#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9) +#endif + +#ifndef UBLK_F_ZONED +#define UBLK_F_ZONED (1ULL << 8) +#endif +#endif diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile index 1cf14a8da438..12a0614b9fd4 100644 --- a/tools/testing/selftests/vDSO/Makefile +++ b/tools/testing/selftests/vDSO/Makefile @@ -19,13 +19,20 @@ LDLIBS += -lgcc_s endif include ../lib.mk + +CFLAGS += $(TOOLS_INCLUDES) + +CFLAGS_NOLIBC := -nostdlib -nostdinc -ffreestanding -fno-asynchronous-unwind-tables \ + -fno-stack-protector -include $(top_srcdir)/tools/include/nolibc/nolibc.h \ + -I$(top_srcdir)/tools/include/nolibc/ $(KHDR_INCLUDES) + $(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c $(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c $(OUTPUT)/vdso_test_abi: parse_vdso.c vdso_test_abi.c $(OUTPUT)/vdso_test_clock_getres: vdso_test_clock_getres.c -$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c -$(OUTPUT)/vdso_standalone_test_x86: CFLAGS +=-nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector +$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c | headers +$(OUTPUT)/vdso_standalone_test_x86: CFLAGS:=$(CFLAGS_NOLIBC) $(CFLAGS) $(OUTPUT)/vdso_test_correctness: vdso_test_correctness.c $(OUTPUT)/vdso_test_correctness: LDFLAGS += -ldl diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index f89d052c730e..3ff00fb624a4 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -19,13 +19,14 @@ #include <stdint.h> #include <string.h> #include <limits.h> -#include <elf.h> +#include <linux/auxvec.h> +#include <linux/elf.h> #include "parse_vdso.h" /* And here's the code. */ #ifndef ELF_BITS -# if ULONG_MAX > 0xffffffffUL +# if __SIZEOF_LONG__ >= 8 # define ELF_BITS 64 # else # define ELF_BITS 32 @@ -297,17 +298,3 @@ void *vdso_sym(const char *version, const char *name) return 0; } - -void vdso_init_from_auxv(void *auxv) -{ - ELF(auxv_t) *elf_auxv = auxv; - for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++) - { - if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) { - vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val); - return; - } - } - - vdso_info.valid = false; -} diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h index de0453067d7c..09d068ed11f9 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.h +++ b/tools/testing/selftests/vDSO/parse_vdso.h @@ -26,6 +26,5 @@ */ void *vdso_sym(const char *version, const char *name); void vdso_init_from_sysinfo_ehdr(uintptr_t base); -void vdso_init_from_auxv(void *auxv); #endif diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c index 644915862af8..9ce795b806f0 100644 --- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c +++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c @@ -1,142 +1,58 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vdso_test.c: Sample code to test parse_vdso.c on x86 - * Copyright (c) 2011-2014 Andy Lutomirski + * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and + * vDSO gettimeofday() + * Copyright (c) 2014 Andy Lutomirski * - * You can amuse yourself by compiling with: - * gcc -std=gnu99 -nostdlib - * -Os -fno-asynchronous-unwind-tables -flto -lgcc_s - * vdso_standalone_test_x86.c parse_vdso.c - * to generate a small binary. On x86_64, you can omit -lgcc_s - * if you want the binary to be completely standalone. + * Compile with: + * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso_gettimeofday.c + * + * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ -#include <sys/syscall.h> +#include <stdio.h> +#ifndef NOLIBC +#include <sys/auxv.h> #include <sys/time.h> -#include <unistd.h> -#include <stdint.h> - -#include "parse_vdso.h" - -/* We need some libc functions... */ -int strcmp(const char *a, const char *b) -{ - /* This implementation is buggy: it never returns -1. */ - while (*a || *b) { - if (*a != *b) - return 1; - if (*a == 0 || *b == 0) - return 1; - a++; - b++; - } - - return 0; -} - -/* - * The clang build needs this, although gcc does not. - * Stolen from lib/string.c. - */ -void *memcpy(void *dest, const void *src, size_t count) -{ - char *tmp = dest; - const char *s = src; - - while (count--) - *tmp++ = *s++; - return dest; -} - -/* ...and two syscalls. This is x86-specific. */ -static inline long x86_syscall3(long nr, long a0, long a1, long a2) -{ - long ret; -#ifdef __x86_64__ - asm volatile ("syscall" : "=a" (ret) : "a" (nr), - "D" (a0), "S" (a1), "d" (a2) : - "cc", "memory", "rcx", - "r8", "r9", "r10", "r11" ); -#else - asm volatile ("int $0x80" : "=a" (ret) : "a" (nr), - "b" (a0), "c" (a1), "d" (a2) : - "cc", "memory" ); #endif - return ret; -} -static inline long linux_write(int fd, const void *data, size_t len) -{ - return x86_syscall3(__NR_write, fd, (long)data, (long)len); -} +#include "../kselftest.h" +#include "parse_vdso.h" +#include "vdso_config.h" +#include "vdso_call.h" -static inline void linux_exit(int code) +int main(int argc, char **argv) { - x86_syscall3(__NR_exit, code, 0, 0); -} + const char *version = versions[VDSO_VERSION]; + const char **name = (const char **)&names[VDSO_NAMES]; -void to_base10(char *lastdig, time_t n) -{ - while (n) { - *lastdig = (n % 10) + '0'; - n /= 10; - lastdig--; + unsigned long sysinfo_ehdr = getauxval(AT_SYSINFO_EHDR); + if (!sysinfo_ehdr) { + printf("AT_SYSINFO_EHDR is not present!\n"); + return KSFT_SKIP; } -} - -void c_main(void **stack) -{ - /* Parse the stack */ - long argc = (long)*stack; - stack += argc + 2; - - /* Now we're pointing at the environment. Skip it. */ - while(*stack) - stack++; - stack++; - /* Now we're pointing at auxv. Initialize the vDSO parser. */ - vdso_init_from_auxv((void *)stack); + vdso_init_from_sysinfo_ehdr(getauxval(AT_SYSINFO_EHDR)); /* Find gettimeofday. */ typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); - gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); + gtod_t gtod = (gtod_t)vdso_sym(version, name[0]); - if (!gtod) - linux_exit(1); + if (!gtod) { + printf("Could not find %s\n", name[0]); + return KSFT_SKIP; + } struct timeval tv; - long ret = gtod(&tv, 0); + long ret = VDSO_CALL(gtod, 2, &tv, 0); if (ret == 0) { - char buf[] = "The time is .000000\n"; - to_base10(buf + 31, tv.tv_sec); - to_base10(buf + 38, tv.tv_usec); - linux_write(1, buf, sizeof(buf) - 1); + printf("The time is %lld.%06lld\n", + (long long)tv.tv_sec, (long long)tv.tv_usec); } else { - linux_exit(ret); + printf("%s failed\n", name[0]); + return KSFT_FAIL; } - linux_exit(0); + return 0; } - -/* - * This is the real entry point. It passes the initial stack into - * the C entry point. - */ -asm ( - ".text\n" - ".global _start\n" - ".type _start,@function\n" - "_start:\n\t" -#ifdef __x86_64__ - "mov %rsp,%rdi\n\t" - "and $-16,%rsp\n\t" - "sub $8,%rsp\n\t" - "jmp c_main" -#else - "push %esp\n\t" - "call c_main\n\t" - "int $3" -#endif - ); diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c index e31b18ffae33..9ce795b806f0 100644 --- a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c +++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c @@ -10,11 +10,11 @@ * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too. */ -#include <stdint.h> -#include <elf.h> #include <stdio.h> +#ifndef NOLIBC #include <sys/auxv.h> #include <sys/time.h> +#endif #include "../kselftest.h" #include "parse_vdso.h" diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 139fd9aa8b12..c305d2f613f0 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -27,7 +27,6 @@ CONFIG_DEBUG_KMEMLEAK=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_SHIRQ=y CONFIG_WQ_WATCHDOG=y -CONFIG_SCHED_DEBUG=y CONFIG_SCHED_INFO=y CONFIG_SCHEDSTATS=y CONFIG_SCHED_STACK_END_CHECK=y diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d51249f14e2f..28422c32cc8f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -19,7 +19,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx lam test_shadow_stack + corrupt_xstate_header amx lam test_shadow_stack avx # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall @@ -132,3 +132,7 @@ $(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static $(OUTPUT)/nx_stack_32: CFLAGS += -Wl,-z,noexecstack $(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack + +$(OUTPUT)/avx_64: CFLAGS += -mno-avx -mno-avx512f +$(OUTPUT)/amx_64: EXTRA_FILES += xstate.c +$(OUTPUT)/avx_64: EXTRA_FILES += xstate.c diff --git a/tools/testing/selftests/x86/amx.c b/tools/testing/selftests/x86/amx.c index 1fdf35a4d7f6..40769c16de1b 100644 --- a/tools/testing/selftests/x86/amx.c +++ b/tools/testing/selftests/x86/amx.c @@ -3,7 +3,6 @@ #define _GNU_SOURCE #include <err.h> #include <errno.h> -#include <pthread.h> #include <setjmp.h> #include <stdio.h> #include <string.h> @@ -14,169 +13,27 @@ #include <sys/auxv.h> #include <sys/mman.h> #include <sys/shm.h> -#include <sys/ptrace.h> #include <sys/syscall.h> #include <sys/wait.h> -#include <sys/uio.h> -#include "../kselftest.h" /* For __cpuid_count() */ +#include "helpers.h" +#include "xstate.h" #ifndef __x86_64__ # error This test is 64-bit only #endif -#define XSAVE_HDR_OFFSET 512 -#define XSAVE_HDR_SIZE 64 - -struct xsave_buffer { - union { - struct { - char legacy[XSAVE_HDR_OFFSET]; - char header[XSAVE_HDR_SIZE]; - char extended[0]; - }; - char bytes[0]; - }; -}; - -static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xsave (%%rdi)" - : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) - : "memory"); -} - -static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) -{ - uint32_t rfbm_lo = rfbm; - uint32_t rfbm_hi = rfbm >> 32; - - asm volatile("xrstor (%%rdi)" - : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); -} - /* err() exits and will not return */ #define fatal_error(msg, ...) err(1, "[FAIL]\t" msg, ##__VA_ARGS__) -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - fatal_error("sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - fatal_error("sigaction"); -} - -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 #define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) #define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) #define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) -#define CPUID_LEAF1_ECX_XSAVE_MASK (1 << 26) -#define CPUID_LEAF1_ECX_OSXSAVE_MASK (1 << 27) - -static uint32_t xbuf_size; - -static struct { - uint32_t xbuf_offset; - uint32_t size; -} xtiledata; - -#define CPUID_LEAF_XSTATE 0xd -#define CPUID_SUBLEAF_XSTATE_USER 0x0 -#define TILE_CPUID 0x1d -#define TILE_PALETTE_ID 0x1 - -static void check_cpuid_xtiledata(void) -{ - uint32_t eax, ebx, ecx, edx; - - __cpuid_count(CPUID_LEAF_XSTATE, CPUID_SUBLEAF_XSTATE_USER, - eax, ebx, ecx, edx); - - /* - * EBX enumerates the size (in bytes) required by the XSAVE - * instruction for an XSAVE area containing all the user state - * components corresponding to bits currently set in XCR0. - * - * Stash that off so it can be used to allocate buffers later. - */ - xbuf_size = ebx; - - __cpuid_count(CPUID_LEAF_XSTATE, XFEATURE_XTILEDATA, - eax, ebx, ecx, edx); - /* - * eax: XTILEDATA state component size - * ebx: XTILEDATA state component offset in user buffer - */ - if (!eax || !ebx) - fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", - eax, ebx); - - xtiledata.size = eax; - xtiledata.xbuf_offset = ebx; -} +struct xstate_info xtiledata; /* The helpers for managing XSAVE buffer and tile states: */ -struct xsave_buffer *alloc_xbuf(void) -{ - struct xsave_buffer *xbuf; - - /* XSAVE buffer should be 64B-aligned. */ - xbuf = aligned_alloc(64, xbuf_size); - if (!xbuf) - fatal_error("aligned_alloc()"); - return xbuf; -} - -static inline void clear_xstate_header(struct xsave_buffer *buffer) -{ - memset(&buffer->header, 0, sizeof(buffer->header)); -} - -static inline void set_xstatebv(struct xsave_buffer *buffer, uint64_t bv) -{ - /* XSTATE_BV is at the beginning of the header: */ - *(uint64_t *)(&buffer->header) = bv; -} - -static void set_rand_tiledata(struct xsave_buffer *xbuf) -{ - int *ptr = (int *)&xbuf->bytes[xtiledata.xbuf_offset]; - int data; - int i; - - /* - * Ensure that 'data' is never 0. This ensures that - * the registers are never in their initial configuration - * and thus never tracked as being in the init state. - */ - data = rand() | 1; - - for (i = 0; i < xtiledata.size / sizeof(int); i++, ptr++) - *ptr = data; -} - struct xsave_buffer *stashed_xsave; static void init_stashed_xsave(void) @@ -192,21 +49,6 @@ static void free_stashed_xsave(void) free(stashed_xsave); } -/* See 'struct _fpx_sw_bytes' at sigcontext.h */ -#define SW_BYTES_OFFSET 464 -/* N.B. The struct's field name varies so read from the offset. */ -#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) - -static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *buffer) -{ - return (struct _fpx_sw_bytes *)(buffer + SW_BYTES_OFFSET); -} - -static inline uint64_t get_fpx_sw_bytes_features(void *buffer) -{ - return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); -} - /* Work around printf() being unsafe in signals: */ #define SIGNAL_BUF_LEN 1000 char signal_message_buffer[SIGNAL_BUF_LEN]; @@ -304,17 +146,10 @@ static inline bool load_rand_tiledata(struct xsave_buffer *xbuf) { clear_xstate_header(xbuf); set_xstatebv(xbuf, XFEATURE_MASK_XTILEDATA); - set_rand_tiledata(xbuf); + set_rand_data(&xtiledata, xbuf); return xrstor_safe(xbuf, XFEATURE_MASK_XTILEDATA); } -/* Return XTILEDATA to its initial configuration. */ -static inline void init_xtiledata(void) -{ - clear_xstate_header(stashed_xsave); - xrstor_safe(stashed_xsave, XFEATURE_MASK_XTILEDATA); -} - enum expected_result { FAIL_EXPECTED, SUCCESS_EXPECTED }; /* arch_prctl() and sigaltstack() test */ @@ -587,14 +422,6 @@ static inline bool __validate_tiledata_regs(struct xsave_buffer *xbuf1) return true; } -static inline void validate_tiledata_regs_same(struct xsave_buffer *xbuf) -{ - int ret = __validate_tiledata_regs(xbuf); - - if (ret != 0) - fatal_error("TILEDATA registers changed"); -} - static inline void validate_tiledata_regs_changed(struct xsave_buffer *xbuf) { int ret = __validate_tiledata_regs(xbuf); @@ -651,251 +478,6 @@ static void test_fork(void) _exit(0); } -/* Context switching test */ - -static struct _ctxtswtest_cfg { - unsigned int iterations; - unsigned int num_threads; -} ctxtswtest_config; - -struct futex_info { - pthread_t thread; - int nr; - pthread_mutex_t mutex; - struct futex_info *next; -}; - -static void *check_tiledata(void *info) -{ - struct futex_info *finfo = (struct futex_info *)info; - struct xsave_buffer *xbuf; - int i; - - xbuf = alloc_xbuf(); - if (!xbuf) - fatal_error("unable to allocate XSAVE buffer"); - - /* - * Load random data into 'xbuf' and then restore - * it to the tile registers themselves. - */ - load_rand_tiledata(xbuf); - for (i = 0; i < ctxtswtest_config.iterations; i++) { - pthread_mutex_lock(&finfo->mutex); - - /* - * Ensure the register values have not - * diverged from those recorded in 'xbuf'. - */ - validate_tiledata_regs_same(xbuf); - - /* Load new, random values into xbuf and registers */ - load_rand_tiledata(xbuf); - - /* - * The last thread's last unlock will be for - * thread 0's mutex. However, thread 0 will - * have already exited the loop and the mutex - * will already be unlocked. - * - * Because this is not an ERRORCHECK mutex, - * that inconsistency will be silently ignored. - */ - pthread_mutex_unlock(&finfo->next->mutex); - } - - free(xbuf); - /* - * Return this thread's finfo, which is - * a unique value for this thread. - */ - return finfo; -} - -static int create_threads(int num, struct futex_info *finfo) -{ - int i; - - for (i = 0; i < num; i++) { - int next_nr; - - finfo[i].nr = i; - /* - * Thread 'i' will wait on this mutex to - * be unlocked. Lock it immediately after - * initialization: - */ - pthread_mutex_init(&finfo[i].mutex, NULL); - pthread_mutex_lock(&finfo[i].mutex); - - next_nr = (i + 1) % num; - finfo[i].next = &finfo[next_nr]; - - if (pthread_create(&finfo[i].thread, NULL, check_tiledata, &finfo[i])) - fatal_error("pthread_create()"); - } - return 0; -} - -static void affinitize_cpu0(void) -{ - cpu_set_t cpuset; - - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - - if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) - fatal_error("sched_setaffinity to CPU 0"); -} - -static void test_context_switch(void) -{ - struct futex_info *finfo; - int i; - - /* Affinitize to one CPU to force context switches */ - affinitize_cpu0(); - - req_xtiledata_perm(); - - printf("[RUN]\tCheck tiledata context switches, %d iterations, %d threads.\n", - ctxtswtest_config.iterations, - ctxtswtest_config.num_threads); - - - finfo = malloc(sizeof(*finfo) * ctxtswtest_config.num_threads); - if (!finfo) - fatal_error("malloc()"); - - create_threads(ctxtswtest_config.num_threads, finfo); - - /* - * This thread wakes up thread 0 - * Thread 0 will wake up 1 - * Thread 1 will wake up 2 - * ... - * the last thread will wake up 0 - * - * ... this will repeat for the configured - * number of iterations. - */ - pthread_mutex_unlock(&finfo[0].mutex); - - /* Wait for all the threads to finish: */ - for (i = 0; i < ctxtswtest_config.num_threads; i++) { - void *thread_retval; - int rc; - - rc = pthread_join(finfo[i].thread, &thread_retval); - - if (rc) - fatal_error("pthread_join() failed for thread %d err: %d\n", - i, rc); - - if (thread_retval != &finfo[i]) - fatal_error("unexpected thread retval for thread %d: %p\n", - i, thread_retval); - - } - - printf("[OK]\tNo incorrect case was found.\n"); - - free(finfo); -} - -/* Ptrace test */ - -/* - * Make sure the ptracee has the expanded kernel buffer on the first - * use. Then, initialize the state before performing the state - * injection from the ptracer. - */ -static inline void ptracee_firstuse_tiledata(void) -{ - load_rand_tiledata(stashed_xsave); - init_xtiledata(); -} - -/* - * Ptracer injects the randomized tile data state. It also reads - * before and after that, which will execute the kernel's state copy - * functions. So, the tester is advised to double-check any emitted - * kernel messages. - */ -static void ptracer_inject_tiledata(pid_t target) -{ - struct xsave_buffer *xbuf; - struct iovec iov; - - xbuf = alloc_xbuf(); - if (!xbuf) - fatal_error("unable to allocate XSAVE buffer"); - - printf("\tRead the init'ed tiledata via ptrace().\n"); - - iov.iov_base = xbuf; - iov.iov_len = xbuf_size; - - memset(stashed_xsave, 0, xbuf_size); - - if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_GETREGSET"); - - if (!__compare_tiledata_state(stashed_xsave, xbuf)) - printf("[OK]\tThe init'ed tiledata was read from ptracee.\n"); - else - printf("[FAIL]\tThe init'ed tiledata was not read from ptracee.\n"); - - printf("\tInject tiledata via ptrace().\n"); - - load_rand_tiledata(xbuf); - - memcpy(&stashed_xsave->bytes[xtiledata.xbuf_offset], - &xbuf->bytes[xtiledata.xbuf_offset], - xtiledata.size); - - if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_SETREGSET"); - - if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) - fatal_error("PTRACE_GETREGSET"); - - if (!__compare_tiledata_state(stashed_xsave, xbuf)) - printf("[OK]\tTiledata was correctly written to ptracee.\n"); - else - printf("[FAIL]\tTiledata was not correctly written to ptracee.\n"); -} - -static void test_ptrace(void) -{ - pid_t child; - int status; - - child = fork(); - if (child < 0) { - err(1, "fork"); - } else if (!child) { - if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) - err(1, "PTRACE_TRACEME"); - - ptracee_firstuse_tiledata(); - - raise(SIGTRAP); - _exit(0); - } - - do { - wait(&status); - } while (WSTOPSIG(status) != SIGTRAP); - - ptracer_inject_tiledata(child); - - ptrace(PTRACE_DETACH, child, NULL, NULL); - wait(&status); - if (!WIFEXITED(status) || WEXITSTATUS(status)) - err(1, "ptrace test"); -} - int main(void) { unsigned long features; @@ -907,7 +489,11 @@ int main(void) return KSFT_SKIP; } - check_cpuid_xtiledata(); + xtiledata = get_xstate_info(XFEATURE_XTILEDATA); + if (!xtiledata.size || !xtiledata.xbuf_offset) { + fatal_error("xstate cpuid: invalid tile data size/offset: %d/%d", + xtiledata.size, xtiledata.xbuf_offset); + } init_stashed_xsave(); sethandler(SIGILL, handle_noperm, 0); @@ -919,11 +505,11 @@ int main(void) test_fork(); - ctxtswtest_config.iterations = 10; - ctxtswtest_config.num_threads = 5; - test_context_switch(); - - test_ptrace(); + /* + * Perform generic xstate tests for context switching, ptrace, + * and signal. + */ + test_xstate(XFEATURE_XTILEDATA); clearhandler(SIGILL); free_stashed_xsave(); diff --git a/tools/testing/selftests/x86/avx.c b/tools/testing/selftests/x86/avx.c new file mode 100644 index 000000000000..11d5367c235f --- /dev/null +++ b/tools/testing/selftests/x86/avx.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE /* Required for inline xstate helpers */ +#include "xstate.h" + +int main(void) +{ + test_xstate(XFEATURE_YMM); + test_xstate(XFEATURE_OPMASK); + test_xstate(XFEATURE_ZMM_Hi256); + test_xstate(XFEATURE_Hi16_ZMM); +} diff --git a/tools/testing/selftests/x86/corrupt_xstate_header.c b/tools/testing/selftests/x86/corrupt_xstate_header.c index cf9ce8fbb656..93a89a5997ca 100644 --- a/tools/testing/selftests/x86/corrupt_xstate_header.c +++ b/tools/testing/selftests/x86/corrupt_xstate_header.c @@ -18,6 +18,7 @@ #include <sys/wait.h> #include "../kselftest.h" /* For __cpuid_count() */ +#include "helpers.h" static inline int xsave_enabled(void) { @@ -29,19 +30,6 @@ static inline int xsave_enabled(void) return ecx & (1U << 27); } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigusr1(int sig, siginfo_t *info, void *uc_void) { ucontext_t *uc = uc_void; diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index d1e919b0c1dc..5cb8393737d0 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -24,31 +24,11 @@ #include <errno.h> #include <sys/vm86.h> +#include "helpers.h" + static unsigned long load_addr = 0x10000; static int nerrs = 0; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static sig_atomic_t got_signal; static void sighandler(int sig, siginfo_t *info, void *ctx_void) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 50cf32de6313..0a75252d31b6 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -28,6 +28,8 @@ #include <sys/wait.h> #include <setjmp.h> +#include "helpers.h" + #ifndef __x86_64__ # error This test is 64-bit only #endif @@ -39,28 +41,6 @@ static unsigned short *shared_scratch; static int nerrs; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigsegv(int sig, siginfo_t *si, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; diff --git a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h index 4ef42c4559a9..6deaad035161 100644 --- a/tools/testing/selftests/x86/helpers.h +++ b/tools/testing/selftests/x86/helpers.h @@ -2,8 +2,13 @@ #ifndef __SELFTESTS_X86_HELPERS_H #define __SELFTESTS_X86_HELPERS_H +#include <signal.h> +#include <string.h> + #include <asm/processor-flags.h> +#include "../kselftest.h" + static inline unsigned long get_eflags(void) { #ifdef __x86_64__ @@ -22,4 +27,27 @@ static inline void set_eflags(unsigned long eflags) #endif } +static inline void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed"); +} + +static inline void clearhandler(int sig) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + ksft_exit_fail_msg("sigaction failed"); +} + #endif /* __SELFTESTS_X86_HELPERS_H */ diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c index 57ec5e99edb9..69d5fb7050c2 100644 --- a/tools/testing/selftests/x86/ioperm.c +++ b/tools/testing/selftests/x86/ioperm.c @@ -20,30 +20,9 @@ #include <sched.h> #include <sys/io.h> -static int nerrs = 0; - -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - -} +#include "helpers.h" -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} +static int nerrs = 0; static jmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/iopl.c b/tools/testing/selftests/x86/iopl.c index 7e3e09c1abac..457b6715542b 100644 --- a/tools/testing/selftests/x86/iopl.c +++ b/tools/testing/selftests/x86/iopl.c @@ -20,30 +20,9 @@ #include <sched.h> #include <sys/io.h> -static int nerrs = 0; - -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - -} +#include "helpers.h" -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} +static int nerrs = 0; static jmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/lam.c b/tools/testing/selftests/x86/lam.c index 4d4a76532dc9..18d736640ece 100644 --- a/tools/testing/selftests/x86/lam.c +++ b/tools/testing/selftests/x86/lam.c @@ -4,6 +4,7 @@ #include <stdlib.h> #include <string.h> #include <sys/syscall.h> +#include <sys/ioctl.h> #include <time.h> #include <signal.h> #include <setjmp.h> @@ -43,7 +44,15 @@ #define FUNC_INHERITE 0x20 #define FUNC_PASID 0x40 +/* get_user() pointer test cases */ +#define GET_USER_USER 0 +#define GET_USER_KERNEL_TOP 1 +#define GET_USER_KERNEL_BOT 2 +#define GET_USER_KERNEL 3 + #define TEST_MASK 0x7f +#define L5_SIGN_EXT_MASK (0xFFUL << 56) +#define L4_SIGN_EXT_MASK (0x1FFFFUL << 47) #define LOW_ADDR (0x1UL << 30) #define HIGH_ADDR (0x3UL << 48) @@ -115,23 +124,42 @@ static void segv_handler(int sig) siglongjmp(segv_env, 1); } -static inline int cpu_has_lam(void) +static inline int lam_is_available(void) { unsigned int cpuinfo[4]; + unsigned long bits = 0; + int ret; __cpuid_count(0x7, 1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); - return (cpuinfo[0] & (1 << 26)); + /* Check if cpu supports LAM */ + if (!(cpuinfo[0] & (1 << 26))) { + ksft_print_msg("LAM is not supported!\n"); + return 0; + } + + /* Return 0 if CONFIG_ADDRESS_MASKING is not set */ + ret = syscall(SYS_arch_prctl, ARCH_GET_MAX_TAG_BITS, &bits); + if (ret) { + ksft_print_msg("LAM is disabled in the kernel!\n"); + return 0; + } + + return 1; } -/* Check 5-level page table feature in CPUID.(EAX=07H, ECX=00H):ECX.[bit 16] */ -static inline int cpu_has_la57(void) +static inline int la57_enabled(void) { - unsigned int cpuinfo[4]; + int ret; + void *p; + + p = mmap((void *)HIGH_ADDR, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); - __cpuid_count(0x7, 0, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); + ret = p == MAP_FAILED ? 0 : 1; - return (cpuinfo[2] & (1 << 16)); + munmap(p, PAGE_SIZE); + return ret; } /* @@ -322,7 +350,7 @@ static int handle_mmap(struct testcases *test) flags, -1, 0); if (ptr == MAP_FAILED) { if (test->addr == HIGH_ADDR) - if (!cpu_has_la57()) + if (!la57_enabled()) return 3; /* unsupport LA57 */ return 1; } @@ -370,6 +398,78 @@ static int handle_syscall(struct testcases *test) return ret; } +static int get_user_syscall(struct testcases *test) +{ + uint64_t ptr_address, bitmask; + int fd, ret = 0; + void *ptr; + + if (la57_enabled()) { + bitmask = L5_SIGN_EXT_MASK; + ptr_address = HIGH_ADDR; + } else { + bitmask = L4_SIGN_EXT_MASK; + ptr_address = LOW_ADDR; + } + + ptr = mmap((void *)ptr_address, PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + + if (ptr == MAP_FAILED) { + perror("failed to map byte to pass into get_user"); + return 1; + } + + if (set_lam(test->lam) != 0) { + ret = 2; + goto error; + } + + fd = memfd_create("lam_ioctl", 0); + if (fd == -1) { + munmap(ptr, PAGE_SIZE); + exit(EXIT_FAILURE); + } + + switch (test->later) { + case GET_USER_USER: + /* Control group - properly tagged user pointer */ + ptr = (void *)set_metadata((uint64_t)ptr, test->lam); + break; + case GET_USER_KERNEL_TOP: + /* Kernel address with top bit cleared */ + bitmask &= (bitmask >> 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL_BOT: + /* Kernel address with bottom sign-extension bit cleared */ + bitmask &= (bitmask << 1); + ptr = (void *)((uint64_t)ptr | bitmask); + break; + case GET_USER_KERNEL: + /* Try to pass a kernel address */ + ptr = (void *)((uint64_t)ptr | bitmask); + break; + default: + printf("Invalid test case value passed!\n"); + break; + } + + /* + * Use FIOASYNC ioctl because it utilizes get_user() internally and is + * very non-invasive to the system. Pass differently tagged pointers to + * get_user() in order to verify that valid user pointers are going + * through and invalid kernel/non-canonical pointers are not. + */ + if (ioctl(fd, FIOASYNC, ptr) != 0) + ret = 1; + + close(fd); +error: + munmap(ptr, PAGE_SIZE); + return ret; +} + int sys_uring_setup(unsigned int entries, struct io_uring_params *p) { return (int)syscall(__NR_io_uring_setup, entries, p); @@ -596,8 +696,10 @@ int do_uring(unsigned long lam) fi->file_fd = file_fd; ring = malloc(sizeof(*ring)); - if (!ring) + if (!ring) { + free(fi); return 1; + } memset(ring, 0, sizeof(struct io_ring)); @@ -883,6 +985,33 @@ static struct testcases syscall_cases[] = { .test_func = handle_syscall, .msg = "SYSCALL:[Negative] Disable LAM. Dereferencing pointer with metadata.\n", }, + { + .later = GET_USER_USER, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER: get_user() and pass a properly tagged user pointer.\n", + }, + { + .later = GET_USER_KERNEL_TOP, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the top bit cleared.\n", + }, + { + .later = GET_USER_KERNEL_BOT, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() with a kernel pointer and the bottom sign-extension bit cleared.\n", + }, + { + .later = GET_USER_KERNEL, + .expected = 1, + .lam = LAM_U57_BITS, + .test_func = get_user_syscall, + .msg = "GET_USER:[Negative] get_user() and pass a kernel pointer.\n", + }, }; static struct testcases mmap_cases[] = { @@ -1181,10 +1310,8 @@ int main(int argc, char **argv) tests_cnt = 0; - if (!cpu_has_lam()) { - ksft_print_msg("Unsupported LAM feature!\n"); + if (!lam_is_available()) return KSFT_SKIP; - } while ((c = getopt(argc, argv, "ht:")) != -1) { switch (c) { diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index 3a29346e1452..bb99a71380a5 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -26,6 +26,8 @@ #include <asm/prctl.h> #include <sys/prctl.h> +#include "helpers.h" + #define AR_ACCESSED (1<<8) #define AR_TYPE_RODATA (0 * (1<<9)) @@ -506,20 +508,6 @@ static void fix_sa_restorer(int sig) } #endif -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); - - fix_sa_restorer(sig); -} - static jmp_buf jmpbuf; static void sigsegv(int sig, siginfo_t *info, void *ctx_void) @@ -549,9 +537,11 @@ static void do_multicpu_tests(void) } sethandler(SIGSEGV, sigsegv, 0); + fix_sa_restorer(SIGSEGV); #ifdef __i386__ /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */ sethandler(SIGILL, sigsegv, 0); + fix_sa_restorer(SIGILL); #endif printf("[RUN]\tCross-CPU LDT invalidation\n"); diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c index cc3de6ff9fba..f22cb6b382f9 100644 --- a/tools/testing/selftests/x86/mov_ss_trap.c +++ b/tools/testing/selftests/x86/mov_ss_trap.c @@ -36,7 +36,7 @@ #include <setjmp.h> #include <sys/prctl.h> -#define X86_EFLAGS_RF (1UL << 16) +#include "helpers.h" #if __x86_64__ # define REG_IP REG_RIP @@ -94,18 +94,6 @@ static void enable_watchpoint(void) } } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static char const * const signames[] = { [SIGSEGV] = "SIGSEGV", [SIGBUS] = "SIBGUS", diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 12aaa063196e..360ec88d5432 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -15,6 +15,8 @@ #include <asm/ptrace-abi.h> #include <sys/auxv.h> +#include "helpers.h" + /* Bitness-agnostic defines for user_regs_struct fields. */ #ifdef __x86_64__ # define user_syscall_nr orig_rax @@ -93,18 +95,6 @@ static siginfo_t wait_trap(pid_t chld) return si; } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void setsigign(int sig, int flags) { struct sigaction sa; @@ -116,16 +106,6 @@ static void setsigign(int sig, int flags) err(1, "sigaction"); } -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - #ifdef __x86_64__ # define REG_BP REG_RBP #else diff --git a/tools/testing/selftests/x86/sigaltstack.c b/tools/testing/selftests/x86/sigaltstack.c index f689af75e979..0ae1b784498c 100644 --- a/tools/testing/selftests/x86/sigaltstack.c +++ b/tools/testing/selftests/x86/sigaltstack.c @@ -14,6 +14,8 @@ #include <sys/resource.h> #include <setjmp.h> +#include "helpers.h" + /* sigaltstack()-enforced minimum stack */ #define ENFORCED_MINSIGSTKSZ 2048 @@ -27,30 +29,6 @@ static bool sigalrm_expected; static unsigned long at_minstack_size; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static int setup_altstack(void *start, unsigned long size) { stack_t ss; diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 0b75b29f794b..26ef562f4232 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -46,6 +46,8 @@ #include <sys/ptrace.h> #include <sys/user.h> +#include "helpers.h" + /* Pull in AR_xyz defines. */ typedef unsigned int u32; typedef unsigned short u16; @@ -138,28 +140,6 @@ static unsigned short LDT3(int idx) return (idx << 3) | 7; } -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void add_ldt(const struct user_desc *desc, unsigned short *var, const char *name) { diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c index 9a30f443e928..280d7a22b9c9 100644 --- a/tools/testing/selftests/x86/single_step_syscall.c +++ b/tools/testing/selftests/x86/single_step_syscall.c @@ -33,28 +33,6 @@ #include "helpers.h" -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static volatile sig_atomic_t sig_traps, sig_eflags; sigjmp_buf jmpbuf; diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c index 48ab065a76f9..f67a2df335ba 100644 --- a/tools/testing/selftests/x86/syscall_arg_fault.c +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -17,18 +17,6 @@ #include "helpers.h" -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static sigjmp_buf jmpbuf; static volatile sig_atomic_t n_errs; diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index a108b80dd082..f9c9814160f0 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -18,18 +18,6 @@ static unsigned int nerrs; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static void sigtrap(int sig, siginfo_t *si, void *ctx_void) { } diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c index 991591718bb0..41c42b7b54a6 100644 --- a/tools/testing/selftests/x86/syscall_numbering.c +++ b/tools/testing/selftests/x86/syscall_numbering.c @@ -25,6 +25,7 @@ #include <sys/mman.h> #include <linux/ptrace.h> +#include "../kselftest.h" /* Common system call numbers */ #define SYS_READ 0 @@ -313,7 +314,7 @@ static void test_syscall_numbering(void) * The MSB is supposed to be ignored, so we loop over a few * to test that out. */ - for (size_t i = 0; i < sizeof(msbs)/sizeof(msbs[0]); i++) { + for (size_t i = 0; i < ARRAY_SIZE(msbs); i++) { int msb = msbs[i]; run("Checking system calls with msb = %d (0x%x)\n", msb, msb); diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c index b30de9aaa6d4..5fb531e3ad7c 100644 --- a/tools/testing/selftests/x86/sysret_rip.c +++ b/tools/testing/selftests/x86/sysret_rip.c @@ -22,6 +22,8 @@ #include <sys/mman.h> #include <assert.h> +#include "helpers.h" + /* * These items are in clang_helpers_64.S, in order to avoid clang inline asm * limitations: @@ -31,28 +33,6 @@ extern const char test_page[]; static void const *current_test_page_addr = test_page; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - -static void clearhandler(int sig) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = SIG_DFL; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - /* State used by our signal handlers. */ static gregset_t initial_regs; diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index 6de11b4df458..05e1e6774fba 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -310,19 +310,6 @@ static void test_getcpu(int cpu) static jmp_buf jmpbuf; static volatile unsigned long segv_err; -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - ksft_exit_fail_msg("sigaction failed\n"); -} - static void sigsegv(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t *)ctx_void; diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c index 4c311e1af4c7..9cc17588d818 100644 --- a/tools/testing/selftests/x86/unwind_vdso.c +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -43,18 +43,6 @@ int main() #include <dlfcn.h> #include <unwind.h> -static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), - int flags) -{ - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_sigaction = handler; - sa.sa_flags = SA_SIGINFO | flags; - sigemptyset(&sa.sa_mask); - if (sigaction(sig, &sa, 0)) - err(1, "sigaction"); -} - static volatile sig_atomic_t nerrs; static unsigned long sysinfo; static bool got_sysinfo = false; diff --git a/tools/testing/selftests/x86/xstate.c b/tools/testing/selftests/x86/xstate.c new file mode 100644 index 000000000000..23c1d6c964ea --- /dev/null +++ b/tools/testing/selftests/x86/xstate.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include <elf.h> +#include <pthread.h> +#include <stdbool.h> + +#include <asm/prctl.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/uio.h> +#include <sys/wait.h> + +#include "helpers.h" +#include "xstate.h" + +/* + * The userspace xstate test suite is designed to be generic and operates + * with randomized xstate data. However, some states require special handling: + * + * - PKRU and XTILECFG need specific adjustments, such as modifying + * randomization behavior or using fixed values. + * - But, PKRU already has a dedicated test suite in /tools/selftests/mm. + * - Legacy states (FP and SSE) are excluded, as they are not considered + * part of extended states (xstates) and their usage is already deeply + * integrated into user-space libraries. + */ +#define XFEATURE_MASK_TEST_SUPPORTED \ + ((1 << XFEATURE_YMM) | \ + (1 << XFEATURE_OPMASK) | \ + (1 << XFEATURE_ZMM_Hi256) | \ + (1 << XFEATURE_Hi16_ZMM) | \ + (1 << XFEATURE_XTILEDATA)) + +static inline uint64_t xgetbv(uint32_t index) +{ + uint32_t eax, edx; + + asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index)); + return eax + ((uint64_t)edx << 32); +} + +static inline uint64_t get_xstatebv(struct xsave_buffer *xbuf) +{ + return *(uint64_t *)(&xbuf->header); +} + +static struct xstate_info xstate; + +struct futex_info { + unsigned int iterations; + struct futex_info *next; + pthread_mutex_t mutex; + pthread_t thread; + bool valid; + int nr; +}; + +static inline void load_rand_xstate(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + set_xstatebv(xbuf, xstate->mask); + set_rand_data(xstate, xbuf); + xrstor(xbuf, xstate->mask); +} + +static inline void load_init_xstate(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + clear_xstate_header(xbuf); + xrstor(xbuf, xstate->mask); +} + +static inline void copy_xstate(struct xsave_buffer *xbuf_dst, struct xsave_buffer *xbuf_src) +{ + memcpy(&xbuf_dst->bytes[xstate.xbuf_offset], + &xbuf_src->bytes[xstate.xbuf_offset], + xstate.size); +} + +static inline bool validate_xstate_same(struct xsave_buffer *xbuf1, struct xsave_buffer *xbuf2) +{ + int ret; + + ret = memcmp(&xbuf1->bytes[xstate.xbuf_offset], + &xbuf2->bytes[xstate.xbuf_offset], + xstate.size); + return ret == 0; +} + +static inline bool validate_xregs_same(struct xsave_buffer *xbuf1) +{ + struct xsave_buffer *xbuf2; + bool ret; + + xbuf2 = alloc_xbuf(); + if (!xbuf2) + ksft_exit_fail_msg("failed to allocate XSAVE buffer\n"); + + xsave(xbuf2, xstate.mask); + ret = validate_xstate_same(xbuf1, xbuf2); + + free(xbuf2); + return ret; +} + +/* Context switching test */ + +static void *check_xstate(void *info) +{ + struct futex_info *finfo = (struct futex_info *)info; + struct xsave_buffer *xbuf; + int i; + + xbuf = alloc_xbuf(); + if (!xbuf) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + /* + * Load random data into 'xbuf' and then restore it to the xstate + * registers. + */ + load_rand_xstate(&xstate, xbuf); + finfo->valid = true; + + for (i = 0; i < finfo->iterations; i++) { + pthread_mutex_lock(&finfo->mutex); + + /* + * Ensure the register values have not diverged from the + * record. Then reload a new random value. If it failed + * ever before, skip it. + */ + if (finfo->valid) { + finfo->valid = validate_xregs_same(xbuf); + load_rand_xstate(&xstate, xbuf); + } + + /* + * The last thread's last unlock will be for thread 0's + * mutex. However, thread 0 will have already exited the + * loop and the mutex will already be unlocked. + * + * Because this is not an ERRORCHECK mutex, that + * inconsistency will be silently ignored. + */ + pthread_mutex_unlock(&finfo->next->mutex); + } + + free(xbuf); + return finfo; +} + +static void create_threads(uint32_t num_threads, uint32_t iterations, struct futex_info *finfo) +{ + int i; + + for (i = 0; i < num_threads; i++) { + int next_nr; + + finfo[i].nr = i; + finfo[i].iterations = iterations; + + /* + * Thread 'i' will wait on this mutex to be unlocked. + * Lock it immediately after initialization: + */ + pthread_mutex_init(&finfo[i].mutex, NULL); + pthread_mutex_lock(&finfo[i].mutex); + + next_nr = (i + 1) % num_threads; + finfo[i].next = &finfo[next_nr]; + + if (pthread_create(&finfo[i].thread, NULL, check_xstate, &finfo[i])) + ksft_exit_fail_msg("pthread_create() failed\n"); + } +} + +static bool checkout_threads(uint32_t num_threads, struct futex_info *finfo) +{ + void *thread_retval; + bool valid = true; + int err, i; + + for (i = 0; i < num_threads; i++) { + err = pthread_join(finfo[i].thread, &thread_retval); + if (err) + ksft_exit_fail_msg("pthread_join() failed for thread %d err: %d\n", i, err); + + if (thread_retval != &finfo[i]) { + ksft_exit_fail_msg("unexpected thread retval for thread %d: %p\n", + i, thread_retval); + } + + valid &= finfo[i].valid; + } + + return valid; +} + +static void affinitize_cpu0(void) +{ + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + ksft_exit_fail_msg("sched_setaffinity to CPU 0 failed\n"); +} + +static void test_context_switch(uint32_t num_threads, uint32_t iterations) +{ + struct futex_info *finfo; + + /* Affinitize to one CPU to force context switches */ + affinitize_cpu0(); + + printf("[RUN]\t%s: check context switches, %d iterations, %d threads.\n", + xstate.name, iterations, num_threads); + + finfo = malloc(sizeof(*finfo) * num_threads); + if (!finfo) + ksft_exit_fail_msg("unable allocate memory\n"); + + create_threads(num_threads, iterations, finfo); + + /* + * This thread wakes up thread 0 + * Thread 0 will wake up 1 + * Thread 1 will wake up 2 + * ... + * The last thread will wake up 0 + * + * This will repeat for the configured + * number of iterations. + */ + pthread_mutex_unlock(&finfo[0].mutex); + + /* Wait for all the threads to finish: */ + if (checkout_threads(num_threads, finfo)) + printf("[OK]\tNo incorrect case was found.\n"); + else + printf("[FAIL]\tFailed with context switching test.\n"); + + free(finfo); +} + +/* + * Ptrace test for the ABI format as described in arch/x86/include/asm/user.h + */ + +/* + * Make sure the ptracee has the expanded kernel buffer on the first use. + * Then, initialize the state before performing the state injection from + * the ptracer. For non-dynamic states, this is benign. + */ +static inline void ptracee_touch_xstate(void) +{ + struct xsave_buffer *xbuf; + + xbuf = alloc_xbuf(); + + load_rand_xstate(&xstate, xbuf); + load_init_xstate(&xstate, xbuf); + + free(xbuf); +} + +/* + * Ptracer injects the randomized xstate data. It also reads before and + * after that, which will execute the kernel's state copy functions. + */ +static void ptracer_inject_xstate(pid_t target) +{ + uint32_t xbuf_size = get_xbuf_size(); + struct xsave_buffer *xbuf1, *xbuf2; + struct iovec iov; + + /* + * Allocate buffers to keep data while ptracer can write the + * other buffer + */ + xbuf1 = alloc_xbuf(); + xbuf2 = alloc_xbuf(); + if (!xbuf1 || !xbuf2) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + iov.iov_base = xbuf1; + iov.iov_len = xbuf_size; + + if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_GETREGSET failed\n"); + + printf("[RUN]\t%s: inject xstate via ptrace().\n", xstate.name); + + load_rand_xstate(&xstate, xbuf1); + copy_xstate(xbuf2, xbuf1); + + if (ptrace(PTRACE_SETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_SETREGSET failed\n"); + + if (ptrace(PTRACE_GETREGSET, target, (uint32_t)NT_X86_XSTATE, &iov)) + ksft_exit_fail_msg("PTRACE_GETREGSET failed\n"); + + if (*(uint64_t *)get_fpx_sw_bytes(xbuf1) == xgetbv(0)) + printf("[OK]\t'xfeatures' in SW reserved area was correctly written\n"); + else + printf("[FAIL]\t'xfeatures' in SW reserved area was not correctly written\n"); + + if (validate_xstate_same(xbuf2, xbuf1)) + printf("[OK]\txstate was correctly updated.\n"); + else + printf("[FAIL]\txstate was not correctly updated.\n"); + + free(xbuf1); + free(xbuf2); +} + +static void test_ptrace(void) +{ + pid_t child; + int status; + + child = fork(); + if (child < 0) { + ksft_exit_fail_msg("fork() failed\n"); + } else if (!child) { + if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) + ksft_exit_fail_msg("PTRACE_TRACEME failed\n"); + + ptracee_touch_xstate(); + + raise(SIGTRAP); + _exit(0); + } + + do { + wait(&status); + } while (WSTOPSIG(status) != SIGTRAP); + + ptracer_inject_xstate(child); + + ptrace(PTRACE_DETACH, child, NULL, NULL); + wait(&status); + if (!WIFEXITED(status) || WEXITSTATUS(status)) + ksft_exit_fail_msg("ptracee exit error\n"); +} + +/* + * Test signal delivery for the ABI compatibility. + * See the ABI format: arch/x86/include/uapi/asm/sigcontext.h + */ + +/* + * Avoid using printf() in signal handlers as it is not + * async-signal-safe. + */ +#define SIGNAL_BUF_LEN 1000 +static char signal_message_buffer[SIGNAL_BUF_LEN]; +static void sig_print(char *msg) +{ + int left = SIGNAL_BUF_LEN - strlen(signal_message_buffer) - 1; + + strncat(signal_message_buffer, msg, left); +} + +static struct xsave_buffer *stashed_xbuf; + +static void validate_sigfpstate(int sig, siginfo_t *si, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t *)ctx_void; + void *xbuf = ctx->uc_mcontext.fpregs; + struct _fpx_sw_bytes *sw_bytes; + uint32_t magic2; + + /* Reset the signal message buffer: */ + signal_message_buffer[0] = '\0'; + + sw_bytes = get_fpx_sw_bytes(xbuf); + if (sw_bytes->magic1 == FP_XSTATE_MAGIC1) + sig_print("[OK]\t'magic1' is valid\n"); + else + sig_print("[FAIL]\t'magic1' is not valid\n"); + + if (get_fpx_sw_bytes_features(xbuf) & xstate.mask) + sig_print("[OK]\t'xfeatures' in SW reserved area is valid\n"); + else + sig_print("[FAIL]\t'xfeatures' in SW reserved area is not valid\n"); + + if (get_xstatebv(xbuf) & xstate.mask) + sig_print("[OK]\t'xfeatures' in XSAVE header is valid\n"); + else + sig_print("[FAIL]\t'xfeatures' in XSAVE header is not valid\n"); + + if (validate_xstate_same(stashed_xbuf, xbuf)) + sig_print("[OK]\txstate delivery was successful\n"); + else + sig_print("[FAIL]\txstate delivery was not successful\n"); + + magic2 = *(uint32_t *)(xbuf + sw_bytes->xstate_size); + if (magic2 == FP_XSTATE_MAGIC2) + sig_print("[OK]\t'magic2' is valid\n"); + else + sig_print("[FAIL]\t'magic2' is not valid\n"); + + set_rand_data(&xstate, xbuf); + copy_xstate(stashed_xbuf, xbuf); +} + +static void test_signal(void) +{ + bool valid_xstate; + + /* + * The signal handler will access this to verify xstate context + * preservation. + */ + stashed_xbuf = alloc_xbuf(); + if (!stashed_xbuf) + ksft_exit_fail_msg("unable to allocate XSAVE buffer\n"); + + printf("[RUN]\t%s: load xstate and raise SIGUSR1\n", xstate.name); + + sethandler(SIGUSR1, validate_sigfpstate, 0); + + load_rand_xstate(&xstate, stashed_xbuf); + + raise(SIGUSR1); + + /* + * Immediately record the test result, deferring printf() to + * prevent unintended state contamination by that. + */ + valid_xstate = validate_xregs_same(stashed_xbuf); + printf("%s", signal_message_buffer); + + printf("[RUN]\t%s: load new xstate from sighandler and check it after sigreturn\n", + xstate.name); + + if (valid_xstate) + printf("[OK]\txstate was restored correctly\n"); + else + printf("[FAIL]\txstate restoration failed\n"); + + clearhandler(SIGUSR1); + free(stashed_xbuf); +} + +void test_xstate(uint32_t feature_num) +{ + const unsigned int ctxtsw_num_threads = 5, ctxtsw_iterations = 10; + unsigned long features; + long rc; + + if (!(XFEATURE_MASK_TEST_SUPPORTED & (1 << feature_num))) { + ksft_print_msg("The xstate test does not fully support the component %u, yet.\n", + feature_num); + return; + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &features); + if (rc || !(features & (1 << feature_num))) { + ksft_print_msg("The kernel does not support feature number: %u\n", feature_num); + return; + } + + xstate = get_xstate_info(feature_num); + if (!xstate.size || !xstate.xbuf_offset) { + ksft_exit_fail_msg("invalid state size/offset (%d/%d)\n", + xstate.size, xstate.xbuf_offset); + } + + test_context_switch(ctxtsw_num_threads, ctxtsw_iterations); + test_ptrace(); + test_signal(); +} diff --git a/tools/testing/selftests/x86/xstate.h b/tools/testing/selftests/x86/xstate.h new file mode 100644 index 000000000000..42af36ec852f --- /dev/null +++ b/tools/testing/selftests/x86/xstate.h @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __SELFTESTS_X86_XSTATE_H +#define __SELFTESTS_X86_XSTATE_H + +#include <stdint.h> + +#include "../kselftest.h" + +#define XSAVE_HDR_OFFSET 512 +#define XSAVE_HDR_SIZE 64 + +/* + * List of XSAVE features Linux knows about. Copied from + * arch/x86/include/asm/fpu/types.h + */ +enum xfeature { + XFEATURE_FP, + XFEATURE_SSE, + XFEATURE_YMM, + XFEATURE_BNDREGS, + XFEATURE_BNDCSR, + XFEATURE_OPMASK, + XFEATURE_ZMM_Hi256, + XFEATURE_Hi16_ZMM, + XFEATURE_PT_UNIMPLEMENTED_SO_FAR, + XFEATURE_PKRU, + XFEATURE_PASID, + XFEATURE_CET_USER, + XFEATURE_CET_KERNEL_UNUSED, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, + XFEATURE_RSRVD_COMP_16, + XFEATURE_XTILECFG, + XFEATURE_XTILEDATA, + + XFEATURE_MAX, +}; + +/* Copied from arch/x86/kernel/fpu/xstate.c */ +static const char *xfeature_names[] = +{ + "x87 floating point registers", + "SSE registers", + "AVX registers", + "MPX bounds registers", + "MPX CSR", + "AVX-512 opmask", + "AVX-512 Hi256", + "AVX-512 ZMM_Hi256", + "Processor Trace (unused)", + "Protection Keys User registers", + "PASID state", + "Control-flow User registers", + "Control-flow Kernel registers (unused)", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "unknown xstate feature", + "AMX Tile config", + "AMX Tile data", + "unknown xstate feature", +}; + +struct xsave_buffer { + union { + struct { + char legacy[XSAVE_HDR_OFFSET]; + char header[XSAVE_HDR_SIZE]; + char extended[0]; + }; + char bytes[0]; + }; +}; + +static inline void xsave(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_hi = rfbm >> 32; + uint32_t rfbm_lo = rfbm; + + asm volatile("xsave (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi) + : "memory"); +} + +static inline void xrstor(struct xsave_buffer *xbuf, uint64_t rfbm) +{ + uint32_t rfbm_hi = rfbm >> 32; + uint32_t rfbm_lo = rfbm; + + asm volatile("xrstor (%%rdi)" + : : "D" (xbuf), "a" (rfbm_lo), "d" (rfbm_hi)); +} + +#define CPUID_LEAF_XSTATE 0xd +#define CPUID_SUBLEAF_XSTATE_USER 0x0 + +static inline uint32_t get_xbuf_size(void) +{ + uint32_t eax, ebx, ecx, edx; + + __cpuid_count(CPUID_LEAF_XSTATE, CPUID_SUBLEAF_XSTATE_USER, + eax, ebx, ecx, edx); + + /* + * EBX enumerates the size (in bytes) required by the XSAVE + * instruction for an XSAVE area containing all the user state + * components corresponding to bits currently set in XCR0. + */ + return ebx; +} + +struct xstate_info { + const char *name; + uint32_t num; + uint32_t mask; + uint32_t xbuf_offset; + uint32_t size; +}; + +static inline struct xstate_info get_xstate_info(uint32_t xfeature_num) +{ + struct xstate_info xstate = { }; + uint32_t eax, ebx, ecx, edx; + + if (xfeature_num >= XFEATURE_MAX) { + ksft_print_msg("unknown state\n"); + return xstate; + } + + xstate.name = xfeature_names[xfeature_num]; + xstate.num = xfeature_num; + xstate.mask = 1 << xfeature_num; + + __cpuid_count(CPUID_LEAF_XSTATE, xfeature_num, + eax, ebx, ecx, edx); + xstate.size = eax; + xstate.xbuf_offset = ebx; + return xstate; +} + +static inline struct xsave_buffer *alloc_xbuf(void) +{ + uint32_t xbuf_size = get_xbuf_size(); + + /* XSAVE buffer should be 64B-aligned. */ + return aligned_alloc(64, xbuf_size); +} + +static inline void clear_xstate_header(struct xsave_buffer *xbuf) +{ + memset(&xbuf->header, 0, sizeof(xbuf->header)); +} + +static inline void set_xstatebv(struct xsave_buffer *xbuf, uint64_t bv) +{ + /* XSTATE_BV is at the beginning of the header: */ + *(uint64_t *)(&xbuf->header) = bv; +} + +/* See 'struct _fpx_sw_bytes' at sigcontext.h */ +#define SW_BYTES_OFFSET 464 +/* N.B. The struct's field name varies so read from the offset. */ +#define SW_BYTES_BV_OFFSET (SW_BYTES_OFFSET + 8) + +static inline struct _fpx_sw_bytes *get_fpx_sw_bytes(void *xbuf) +{ + return xbuf + SW_BYTES_OFFSET; +} + +static inline uint64_t get_fpx_sw_bytes_features(void *buffer) +{ + return *(uint64_t *)(buffer + SW_BYTES_BV_OFFSET); +} + +static inline void set_rand_data(struct xstate_info *xstate, struct xsave_buffer *xbuf) +{ + int *ptr = (int *)&xbuf->bytes[xstate->xbuf_offset]; + int data, i; + + /* + * Ensure that 'data' is never 0. This ensures that + * the registers are never in their initial configuration + * and thus never tracked as being in the init state. + */ + data = rand() | 1; + + for (i = 0; i < xstate->size / sizeof(int); i++, ptr++) + *ptr = data; +} + +/* Testing kernel's context switching and ABI support for the xstate. */ +void test_xstate(uint32_t feature_num); + +#endif /* __SELFTESTS_X86_XSTATE_H */ |