From 1f6f4cb7ba219b00a3fa9afe8049fa16444d8b52 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:53 -0800 Subject: bpf: offload: rename the ifindex field prog_target_ifindex seems long and clunky; rename it to prog_ifindex. We don't want to call this field just ifindex, because maps may need a similar field in the future and bpf_attr members for programs and maps are unnamed. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e880ae6434ee..3f626df42516 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -262,7 +262,7 @@ union bpf_attr { __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; - __u32 prog_target_ifindex; /* ifindex of netdev to prep for */ + __u32 prog_ifindex; /* ifindex of netdev to prep for */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 288b3de55aace830f13280985ec9e6bcbff33b1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:54 -0800 Subject: bpf: offload: move offload device validation out to the drivers With TC shared block changes we can't depend on correct netdev pointer being available in cls_bpf. Move the device validation to the driver. Core will only make sure that offloaded programs are always attached in the driver (or in HW by the driver). We trust that drivers which implement offload callbacks will perform necessary checks. Moving the checks to the driver is generally a useful thing; in practice the check should be against a switchdev instance, not a netdev, given that most ASICs will probably allow using the same program on many ports. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Jiri Pirko Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 10 ++++++++-- include/linux/bpf.h | 4 ++-- kernel/bpf/syscall.c | 23 ++++++++++++----------- net/core/dev.c | 7 ++----- net/sched/cls_bpf.c | 8 +++----- 5 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index b6cee71f49d3..bc879aeb62d4 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -214,8 +214,14 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog, { int err; - if (prog && !prog->aux->offload) - return -EINVAL; + if (prog) { + struct bpf_dev_offload *offload = prog->aux->offload; + + if (!offload) + return -EINVAL; + if (offload->netdev != nn->dp.netdev) + return -EINVAL; + } if (prog && old_prog) { u8 cap; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c397934f91dd..f82be640731e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -336,7 +336,7 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev); + bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); void bpf_prog_sub(struct bpf_prog *prog, int i); struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); @@ -433,7 +433,7 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/kernel/bpf/syscall.c 
b/kernel/bpf/syscall.c index 8e9d065bb7cd..38da55905ab0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1057,22 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_can_attach(struct bpf_prog *prog, - enum bpf_prog_type *attach_type, - struct net_device *netdev) +static bool bpf_prog_get_ok(struct bpf_prog *prog, + enum bpf_prog_type *attach_type, bool attach_drv) { - struct bpf_dev_offload *offload = prog->aux->offload; + /* not an attachment, just a refcount inc, always allow */ + if (!attach_type) + return true; if (prog->type != *attach_type) return false; - if (offload && offload->netdev != netdev) + if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) return false; return true; } static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, - struct net_device *netdev) + bool attach_drv) { struct fd f = fdget(ufd); struct bpf_prog *prog; @@ -1080,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, prog = ____bpf_prog_get(f); if (IS_ERR(prog)) return prog; - if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) { + if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { prog = ERR_PTR(-EINVAL); goto out; } @@ -1093,12 +1094,12 @@ out: struct bpf_prog *bpf_prog_get(u32 ufd) { - return __bpf_prog_get(ufd, NULL, NULL); + return __bpf_prog_get(ufd, NULL, false); } struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); @@ -1107,9 +1108,9 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, - struct net_device *netdev) + bool attach_drv) { - struct bpf_prog *prog = 
__bpf_prog_get(ufd, &type, netdev); + struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv); if (!IS_ERR(prog)) trace_bpf_prog_get_type(prog); diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..09525a27319c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7139,11 +7139,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, __dev_xdp_attached(dev, bpf_op, NULL)) return -EBUSY; - if (bpf_op == ops->ndo_bpf) - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - dev); - else - prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); + prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + bpf_op == ops->ndo_bpf); if (IS_ERR(prog)) return PTR_ERR(prog); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fb680dafac5a..a9f3e317055c 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -382,15 +382,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, { struct bpf_prog *fp; char *name = NULL; + bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; - if (gen_flags & TCA_CLS_FLAGS_SKIP_SW) - fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, - qdisc_dev(tp->q)); - else - fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS); + fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); -- cgit v1.2.3 From 479321e9c31a6c05426790b11888427400f75ac8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:56 -0800 Subject: bpf: turn bpf_prog_get_type() into a wrapper bpf_prog_get_type() is identical to bpf_prog_get_type_dev(), with false passed as attach_drv. Instead of keeping it as an exported symbol turn it into static inline wrapper. 
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 13 ++++++------- kernel/bpf/syscall.c | 10 ---------- 2 files changed, 6 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f82be640731e..37bbab8c0f56 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,7 +334,6 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; extern const struct bpf_verifier_ops xdp_analyzer_ops; struct bpf_prog *bpf_prog_get(u32 ufd); -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv); struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); @@ -425,12 +424,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) return ERR_PTR(-EOPNOTSUPP); } -static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, - enum bpf_prog_type type) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) @@ -514,6 +507,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, } #endif /* CONFIG_BPF_SYSCALL */ +static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, + enum bpf_prog_type type) +{ + return bpf_prog_get_type_dev(ufd, type, false); +} + int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 38da55905ab0..41509cf825d8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1097,16 +1097,6 @@ struct bpf_prog *bpf_prog_get(u32 ufd) return __bpf_prog_get(ufd, NULL, false); } -struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) -{ - struct bpf_prog *prog = __bpf_prog_get(ufd, &type, 
false); - - if (!IS_ERR(prog)) - trace_bpf_prog_get_type(prog); - return prog; -} -EXPORT_SYMBOL_GPL(bpf_prog_get_type); - struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, bool attach_drv) { -- cgit v1.2.3 From 1ee640095f049e7ac4ec36b985abada497b98cc2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:21:59 -0800 Subject: bpf: revert report offload info to user space This reverts commit bd601b6ada11 ("bpf: report offload info to user space"). The ifindex by itself is not sufficient, we should provide information on which network namespace this ifindex belongs to. After considering some options we concluded that it's best to just remove this API for now, and rework it in -next. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 - include/uapi/linux/bpf.h | 6 ------ kernel/bpf/offload.c | 12 ------------ kernel/bpf/syscall.c | 5 ----- 4 files changed, 24 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 37bbab8c0f56..76c577281d78 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -515,7 +515,6 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3f626df42516..4c223ab30293 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -897,10 +897,6 @@ enum sk_action { #define BPF_TAG_SIZE 8 -enum bpf_prog_status { - BPF_PROG_STATUS_DEV_BOUND = (1 << 0), -}; - struct bpf_prog_info { __u32 type; __u32 id; @@ -914,8 +910,6 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u32 status; } 
__attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index d4267c674fec..68ec884440b7 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } -u32 bpf_prog_offload_ifindex(struct bpf_prog *prog) -{ - struct bpf_dev_offload *offload = prog->aux->offload; - u32 ifindex; - - rtnl_lock(); - ifindex = offload->netdev ? offload->netdev->ifindex : 0; - rtnl_unlock(); - - return ifindex; -} - const struct bpf_prog_ops bpf_offload_prog_ops = { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 41509cf825d8..2c4cfeaa8d5e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1616,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } - if (bpf_prog_is_dev_bound(prog->aux)) { - info.status |= BPF_PROG_STATUS_DEV_BOUND; - info.ifindex = bpf_prog_offload_ifindex(prog); - } - done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) -- cgit v1.2.3 From 1438019479349d262b76f8767ace3273d11b6dcb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2017 15:22:00 -0800 Subject: bpf: make bpf_prog_offload_verifier_prep() static inline Header implementation of bpf_prog_offload_verifier_prep() which is used if CONFIG_NET=n should be a static inline. 
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 07b96aaca256..b61482d354a2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -171,7 +171,7 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); #else -int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) +static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) { return -EOPNOTSUPP; } -- cgit v1.2.3 From db1ac4964fa172803a0fea83033cd35d380a8a77 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Wed, 22 Nov 2017 18:32:53 +0000 Subject: bpf: introduce ARG_PTR_TO_MEM_OR_NULL With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, a helper argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO and the verifier can prove the value of this next argument is 0. However, most helpers are just interested in handling the (non-NULL pointer, size 0) pair, so forcing them to deal with the (NULL pointer, size 0) pair makes the implementation of those helpers more complicated for no apparent benefits, requiring them to explicitly handle those corner cases with checks that bpf programs could start relying upon, preventing the possibility of removing them later. Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case. Currently, the only helper that needs this is bpf_csum_diff_proto(), so change arg1 and arg3 to this new type as well. 
Also add a new battery of tests that explicitly test the !ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the various (NULL pointer, size 0) variations are focused on bpf_csum_diff, so cover also other helpers. Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 4 +- net/core/filter.c | 4 +- tools/testing/selftests/bpf/test_verifier.c | 113 ++++++++++++++++++++++++++-- 4 files changed, 112 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 76c577281d78..e55e4255a210 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -78,6 +78,7 @@ enum bpf_arg_type { * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ + ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dd54d20ace2f..308b0638ec5d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1384,13 +1384,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type != expected_type) goto err_type; } else if (arg_type == ARG_PTR_TO_MEM || + arg_type == ARG_PTR_TO_MEM_OR_NULL || arg_type == ARG_PTR_TO_UNINIT_MEM) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. 
*/ - if (register_is_null(*reg)) + if (register_is_null(*reg) && + arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && diff --git a/net/core/filter.c b/net/core/filter.c index 1afa17935954..6a85e67fafce 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1646,9 +1646,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM_OR_NULL, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 2a5267bef160..3c64f30cf63c 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -5631,7 +5631,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, { - "helper access to variable memory: size = 0 allowed on NULL", + "helper access to variable memory: size = 0 allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), @@ -5645,7 +5645,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size > 0 not allowed on NULL", + "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_IMM(BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_2, 0), @@ -5663,7 +5663,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size = 0 allowed on != NULL stack pointer", + "helper access to variable memory: size = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, 
BPF_REG_1, -8), @@ -5680,7 +5680,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size = 0 allowed on != NULL map pointer", + "helper access to variable memory: size = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -5702,7 +5702,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer", + "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -5727,7 +5727,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size possible = 0 allowed on != NULL map pointer", + "helper access to variable memory: size possible = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), @@ -5750,7 +5750,7 @@ static struct bpf_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "helper access to variable memory: size possible = 0 allowed on != NULL packet pointer", + "helper access to variable memory: size possible = 0 allowed on != NULL packet pointer (ARG_PTR_TO_MEM_OR_NULL)", .insns = { BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct __sk_buff, data)), @@ -5771,6 +5771,105 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "R1 type=inv expected=fp", + .result = 
REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size > 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .errstr = "R1 type=inv expected=fp", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), + BPF_MOV64_IMM(BPF_REG_3, 
0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "helper access to variable memory: size possible = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_probe_read), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, { "helper access to variable memory: 8 bytes leak", .insns = { -- cgit v1.2.3 From c131187db2d3fa2f8bf32fdf4e9a4ef805168467 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 22 Nov 2017 16:42:05 -0800 Subject: bpf: fix branch pruning logic when the verifier detects that register contains a runtime constant and it's compared with another constant it will prune exploration of the branch that is guaranteed not to be taken at runtime. This is all correct, but malicious program may be constructed in such a way that it always has a constant comparison and the other branch is never taken under any conditions. In this case such path through the program will not be explored by the verifier. It won't be taken at run-time either, but since all instructions are JITed the malicious program may cause JITs to complain about using reserved fields, etc. To fix the issue we have to track the instructions explored by the verifier and sanitize instructions that are dead at run time with NOPs. We cannot reject such dead code, since llvm generates it for valid C code, since it doesn't do as much data flow analysis as the verifier does. 
Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b61482d354a2..c561b986bab0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,7 +115,7 @@ struct bpf_insn_aux_data { struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int converted_op_size; /* the valid value width after perceived conversion */ + bool seen; /* this insn was processed by the verifier */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 308b0638ec5d..d4593571c404 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3827,6 +3827,7 @@ static int do_check(struct bpf_verifier_env *env) return err; regs = cur_regs(env); + env->insn_aux_data[insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -4022,6 +4023,7 @@ process_bpf_exit: return err; insn_idx++; + env->insn_aux_data[insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -4204,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + int i; if (cnt == 1) return 0; @@ -4213,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + for (i = off; i < off + cnt 
- 1; i++) + new_data[i].seen = true; env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -4231,6 +4236,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } +/* The verifier does more data flow analysis than llvm and will not explore + * branches that are dead at run time. Malicious programs can have dead code + * too. Therefore replace all dead at-run-time code with nops. + */ +static void sanitize_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++) { + if (aux_data[i].seen) + continue; + memcpy(insn + i, &nop, sizeof(nop)); + } +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -4557,6 +4581,9 @@ skip_full_check: while (!pop_stack(env, NULL, NULL)); free_states(env); + if (ret == 0) + sanitize_dead_code(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.3 From 0c19f846d582af919db66a5914a0189f9f92c936 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 21 Nov 2017 10:22:25 -0500 Subject: net: accept UFO datagrams from tuntap and packet Tuntap and similar devices can inject GSO packets. Accept type VIRTIO_NET_HDR_GSO_UDP, even though not generating UFO natively. Processes are expected to use feature negotiation such as TUNSETOFFLOAD to detect supported offload types and refrain from injecting other packets. This process breaks down with live migration: guest kernels do not renegotiate flags, so destination hosts need to expose all features that the source host does. Partially revert the UFO removal from 182e0b6b5846~1..d9d30adf5677. 
This patch introduces nearly(*) no new code to simplify verification. It brings back verbatim tuntap UFO negotiation, VIRTIO_NET_HDR_GSO_UDP insertion and software UFO segmentation. It does not reinstate protocol stack support, hardware offload (NETIF_F_UFO), SKB_GSO_UDP tunneling in SKB_GSO_SOFTWARE or reception of VIRTIO_NET_HDR_GSO_UDP packets in tuntap. To support SKB_GSO_UDP reappearing in the stack, also reinstate logic in act_csum and openvswitch. Achieve equivalence with v4.13 HEAD by squashing in commit 939912216fa8 ("net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx.") and reverting commit 8d63bee643f1 ("net: avoid skb_warn_bad_offload false positives on UFO"). (*) To avoid having to bring back skb_shinfo(skb)->ip6_frag_id, ipv6_proxy_select_ident is changed to return a __be32 and this is assigned directly to the frag_hdr. Also, SKB_GSO_UDP is inserted at the end of the enum to minimize code churn. Tested Booted a v4.13 guest kernel with QEMU. On a host kernel before this patch `ethtool -k eth0` shows UFO disabled. After the patch, it is enabled, same as on a v4.13 host kernel. A UFO packet sent from the guest appears on the tap device: host: nc -l -p -u 8000 & tcpdump -n -i tap0 guest: dd if=/dev/zero of=payload.txt bs=1 count=2000 nc -u 192.16.1.1 8000 < payload.txt Direct tap to tap transmission of VIRTIO_NET_HDR_GSO_UDP succeeds, packets arriving fragmented: ./with_tap_pair.sh ./tap_send_ufo tap0 tap1 (from https://github.com/wdebruij/kerneltools/tree/master/tests) Changes v1 -> v2 - simplified set_offload change (review comment) - documented test procedure Link: http://lkml.kernel.org/r/ Fixes: fb652fdfe837 ("macvlan/macvtap: Remove NETIF_F_UFO advertisement.") Reported-by: Michal Kubecek Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. 
Miller --- drivers/net/tap.c | 2 +- drivers/net/tun.c | 2 + include/linux/netdev_features.h | 4 +- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 + include/linux/virtio_net.h | 5 ++- include/net/ipv6.h | 1 + net/core/dev.c | 3 +- net/ipv4/af_inet.c | 12 +++++- net/ipv4/udp_offload.c | 49 ++++++++++++++++++++++-- net/ipv6/output_core.c | 31 +++++++++++++++ net/ipv6/udp_offload.c | 85 +++++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 14 +++++++ net/openvswitch/flow.c | 6 ++- net/sched/act_csum.c | 6 +++ 15 files changed, 209 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index b13890953ebb..e9489b88407c 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1077,7 +1077,7 @@ static long tap_ioctl(struct file *file, unsigned int cmd, case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | - TUN_F_TSO_ECN)) + TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; rtnl_lock(); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5a2ea78a008f..6a7bde9bc4b2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2370,6 +2370,8 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } + + arg &= ~TUN_F_UFO; } /* This gives the user a way to test for new features in future by diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index dc8b4896b77b..b1b0ca7ccb2b 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -54,8 +54,9 @@ enum { NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ + NETIF_F_GSO_UDP_BIT, /* ... 
UFO, deprecated except tuntap */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_ESP_BIT, + NETIF_F_GSO_UDP_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -132,6 +133,7 @@ enum { #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) #define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) +#define NETIF_F_GSO_UDP __NETIF_F(GSO_UDP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6b274bfe489f..ef789e1d679e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4140,6 +4140,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ed06e1c28fc7..bc486ef23f20 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -568,6 +568,8 @@ enum { SKB_GSO_SCTP = 1 << 14, SKB_GSO_ESP = 1 << 15, + + SKB_GSO_UDP = 1 << 16, }; #if BITS_PER_LONG > 32 diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 210034c896e3..f144216febc6 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -9,7 +9,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { - unsigned short gso_type = 0; + unsigned int gso_type = 0; if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -19,6 
+19,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, case VIRTIO_NET_HDR_GSO_TCPV6: gso_type = SKB_GSO_TCPV6; break; + case VIRTIO_NET_HDR_GSO_UDP: + gso_type = SKB_GSO_UDP; + break; default: return -EINVAL; } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index ec14f0d5a3a1..f73797e2fa60 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -767,6 +767,7 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/core/dev.c b/net/core/dev.c index 8ee29f4f5fa9..bbba19112f02 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2746,7 +2746,8 @@ EXPORT_SYMBOL(skb_mac_gso_segment); static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) - return skb->ip_summed != CHECKSUM_PARTIAL; + return skb->ip_summed != CHECKSUM_PARTIAL && + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ce4aa827be05..f00499a46927 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1223,9 +1223,10 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool fixedid = false, gso_partial, encap; + bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; + unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; @@ -1260,6 +1261,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ @@ 
-1279,7 +1281,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) { + if (udpfrag) { + iph->frag_off = htons(offset >> 3); + if (skb->next) + iph->frag_off |= htons(IP_MF); + offset += skb->len - nhoff - ihl; + tot_len = skb->len - nhoff; + } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e360d55be555..01801b77bd0d 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -187,16 +187,57 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); -static struct sk_buff *udp4_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + __wsum csum; + struct udphdr *uh; + struct iphdr *iph; if (skb->encapsulation && (skb_shinfo(skb)->gso_type & - (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) + (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))) { segs = skb_udp_tunnel_segment(skb, features, false); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + uh = udp_hdr(skb); + iph = ip_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v4_check(skb->len, iph->saddr, iph->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. 
+ */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Fragment the skb. IP headers of the fragments are updated in + * inet_gso_segment() + */ + segs = skb_segment(skb, features); +out: return segs; } @@ -330,7 +371,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv4_offload = { .callbacks = { - .gso_segment = udp4_tunnel_segment, + .gso_segment = udp4_ufo_fragment, .gro_receive = udp4_gro_receive, .gro_complete = udp4_gro_complete, }, diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 4a7e5ffa5108..4fe7c90962dd 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -31,6 +31,37 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd, return id; } +/* This function exists only for tap drivers that must support broken + * clients requesting UFO without specifying an IPv6 fragment ID. + * + * This is similar to ipv6_select_ident() but we use an independent hash + * seed to limit information leakage. + * + * The network header must be set before calling this. 
+ */ +__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) +{ + static u32 ip6_proxy_idents_hashrnd __read_mostly; + struct in6_addr buf[2]; + struct in6_addr *addrs; + u32 id; + + addrs = skb_header_pointer(skb, + skb_network_offset(skb) + + offsetof(struct ipv6hdr, saddr), + sizeof(buf), buf); + if (!addrs) + return 0; + + net_get_random_once(&ip6_proxy_idents_hashrnd, + sizeof(ip6_proxy_idents_hashrnd)); + + id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); + return htonl(id); +} +EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); + __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 455fd4e39333..a0f89ad76f9d 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -17,15 +17,94 @@ #include #include "ip6_offload.h" -static struct sk_buff *udp6_tunnel_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, + netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); + unsigned int mss; + unsigned int unfrag_ip6hlen, unfrag_len; + struct frag_hdr *fptr; + u8 *packet_start, *prevhdr; + u8 nexthdr; + u8 frag_hdr_sz = sizeof(struct frag_hdr); + __wsum csum; + int tnl_hlen; + int err; + + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; if (skb->encapsulation && skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM)) segs = skb_udp_tunnel_segment(skb, features, true); + else { + const struct ipv6hdr *ipv6h; + struct udphdr *uh; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto out; + + /* Do software UFO. Complete and fill in the UDP checksum as HW cannot + * do checksum of UDP packets sent as multiple IP fragments. 
+ */ + + uh = udp_hdr(skb); + ipv6h = ipv6_hdr(skb); + + uh->check = 0; + csum = skb_checksum(skb, 0, skb->len, 0); + uh->check = udp_v6_check(skb->len, &ipv6h->saddr, + &ipv6h->daddr, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + + /* Check if there is enough headroom to insert fragment header. */ + tnl_hlen = skb_tnl_header_len(skb); + if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } + + /* Find the unfragmentable header and shift it left by frag_hdr_sz + * bytes to insert fragment header. + */ + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + return ERR_PTR(err); + unfrag_ip6hlen = err; + nexthdr = *prevhdr; + *prevhdr = NEXTHDR_FRAGMENT; + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; + skb->mac_header -= frag_hdr_sz; + skb->network_header -= frag_hdr_sz; + + fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); + fptr->nexthdr = nexthdr; + fptr->reserved = 0; + fptr->identification = ipv6_proxy_select_ident(dev_net(skb->dev), skb); + + /* Fragment the skb. 
ipv6 header and the remaining fields of the + * fragment header are updated in ipv6_gso_segment() + */ + segs = skb_segment(skb, features); + } +out: return segs; } @@ -75,7 +154,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) static const struct net_offload udpv6_offload = { .callbacks = { - .gso_segment = udp6_tunnel_segment, + .gso_segment = udp6_ufo_fragment, .gro_receive = udp6_gro_receive, .gro_complete = udp6_gro_complete, }, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0dab33fb9844..99cfafc2a139 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -308,6 +308,8 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct dp_upcall_info *upcall_info, uint32_t cutlen) { + unsigned int gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; @@ -318,9 +320,21 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (segs == NULL) return -EINVAL; + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + /* Queue all of the segments.
*/ skb = segs; do { + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 864ddb1e3642..dbe2379329c5 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -631,7 +631,8 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_LATER; return 0; } - if (nh->frag_off & htons(IP_MF)) + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; else key->ip.frag = OVS_FRAG_TYPE_NONE; @@ -747,6 +748,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) if (key->ip.frag == OVS_FRAG_TYPE_LATER) return 0; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { if (tcphdr_ok(skb)) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 1c40caadcff9..d836f998117b 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -229,6 +229,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -282,6 +285,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, -- cgit v1.2.3