diff options
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/Makefile | 2 | ||||
-rw-r--r-- | net/mptcp/ctrl.c | 143 | ||||
-rw-r--r-- | net/mptcp/diag.c | 42 | ||||
-rw-r--r-- | net/mptcp/fastopen.c | 27 | ||||
-rw-r--r-- | net/mptcp/options.c | 7 | ||||
-rw-r--r-- | net/mptcp/pm.c | 666 | ||||
-rw-r--r-- | net/mptcp/pm_kernel.c | 1412 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 1927 | ||||
-rw-r--r-- | net/mptcp/pm_userspace.c | 269 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 332 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 99 | ||||
-rw-r--r-- | net/mptcp/sched.c | 39 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 28 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 36 |
14 files changed, 2586 insertions, 2443 deletions
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index bcf1dbf3a432..89bf6c47c818 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o \ - mptcp_pm_gen.o + mptcp_pm_gen.o pm_kernel.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 2dd81e6c26bd..d9290c5bb6c7 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -39,6 +39,7 @@ struct mptcp_pernet { u8 allow_join_initial_addr_port; u8 pm_type; char scheduler[MPTCP_SCHED_NAME_MAX]; + char path_manager[MPTCP_PM_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -83,6 +84,11 @@ int mptcp_get_pm_type(const struct net *net) return mptcp_get_pernet(net)->pm_type; } +const char *mptcp_get_path_manager(const struct net *net) +{ + return mptcp_get_pernet(net)->path_manager; +} + const char *mptcp_get_scheduler(const struct net *net) { return mptcp_get_pernet(net)->scheduler; @@ -101,6 +107,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler)); + strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager)); } #ifdef CONFIG_SYSCTL @@ -174,6 +181,96 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, return ret; } +static int mptcp_set_path_manager(char *path_manager, const char *name) +{ + struct mptcp_pm_ops *pm_ops; + int ret = 0; + + rcu_read_lock(); + pm_ops = mptcp_pm_find(name); + if (pm_ops) + strscpy(path_manager, name, MPTCP_PM_NAME_MAX); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + +static int proc_path_manager(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ 
+ struct mptcp_pernet *pernet = container_of(ctl->data, + struct mptcp_pernet, + path_manager); + char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data; + char pm_name[MPTCP_PM_NAME_MAX]; + const struct ctl_table tbl = { + .data = pm_name, + .maxlen = MPTCP_PM_NAME_MAX, + }; + int ret; + + strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + ret = mptcp_set_path_manager(*path_manager, pm_name); + if (ret == 0) { + u8 pm_type = __MPTCP_PM_TYPE_NR; + + if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0) + pm_type = MPTCP_PM_TYPE_KERNEL; + else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0) + pm_type = MPTCP_PM_TYPE_USERSPACE; + pernet->pm_type = pm_type; + } + } + + return ret; +} + +static int proc_pm_type(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct mptcp_pernet *pernet = container_of(ctl->data, + struct mptcp_pernet, + pm_type); + int ret; + + ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos); + if (write && ret == 0) { + u8 pm_type = READ_ONCE(*(u8 *)ctl->data); + char *pm_name = ""; + + if (pm_type == MPTCP_PM_TYPE_KERNEL) + pm_name = "kernel"; + else if (pm_type == MPTCP_PM_TYPE_USERSPACE) + pm_name = "userspace"; + mptcp_set_path_manager(pernet->path_manager, pm_name); + } + + return ret; +} + +static int proc_available_path_managers(const struct ctl_table *ctl, + int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + + return ret; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -218,7 +315,7 @@ static struct ctl_table mptcp_sysctl_table[] = { .procname = "pm_type", .maxlen = sizeof(u8), 
.mode = 0644, - .proc_handler = proc_dou8vec_minmax, + .proc_handler = proc_pm_type, .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, @@ -253,6 +350,18 @@ static struct ctl_table mptcp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, + { + .procname = "path_manager", + .maxlen = MPTCP_PM_NAME_MAX, + .mode = 0644, + .proc_handler = proc_path_manager, + }, + { + .procname = "available_path_managers", + .maxlen = MPTCP_PM_BUF_MAX, + .mode = 0444, + .proc_handler = proc_available_path_managers, + }, }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -278,6 +387,8 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[8].data = &pernet->close_timeout; table[9].data = &pernet->blackhole_timeout; table[10].data = &pernet->syn_retrans_before_tcp_fallback; + table[11].data = &pernet->path_manager; + /* table[12] is for available_path_managers which is read-only info */ hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); @@ -401,26 +512,30 @@ void mptcp_active_enable(struct sock *sk) void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) { struct mptcp_subflow_context *subflow; + u8 timeouts, to_max; + struct net *net; - if (!sk_is_mptcp(ssk)) + /* Only check MPTCP SYN ... */ + if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT)) return; subflow = mptcp_subflow_ctx(ssk); - if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) { - struct net *net = sock_net(ssk); - u8 timeouts, to_max; + /* ... 
+ MP_CAPABLE */ + if (!subflow->request_mptcp) { + /* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */ + subflow->mpc_drop = 0; + return; + } - timeouts = inet_csk(ssk)->icsk_retransmits; - to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; + net = sock_net(ssk); + timeouts = inet_csk(ssk)->icsk_retransmits; + to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; - if (timeouts == to_max || (timeouts < to_max && expired)) { - MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); - subflow->mpc_drop = 1; - mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } - } else if (ssk->sk_state == TCP_SYN_SENT) { - subflow->mpc_drop = 0; + if (timeouts == to_max || (timeouts < to_max && expired)) { + MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); + subflow->mpc_drop = 1; + mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); } } diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 02205f7994d7..70cf9ebce833 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -12,7 +12,7 @@ #include <net/netlink.h> #include "protocol.h" -static int subflow_get_info(struct sock *sk, struct sk_buff *skb) +static int subflow_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) { struct mptcp_subflow_context *sf; struct nlattr *start; @@ -56,15 +56,6 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, - sf->rel_write_seq) || - nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, - MPTCP_SUBFLOW_ATTR_PAD) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, - sf->map_subflow_seq) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || - nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, - sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, 
MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) { @@ -72,6 +63,21 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) goto nla_failure; } + /* Only export seq related counters to user with CAP_NET_ADMIN */ + if (net_admin && + (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + sf->rel_write_seq) || + nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, + MPTCP_SUBFLOW_ATTR_PAD) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + sf->map_subflow_seq) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || + nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + sf->map_data_len))) { + err = -EMSGSIZE; + goto nla_failure; + } + rcu_read_unlock(); unlock_sock_fast(sk, slow); nla_nest_end(skb, start); @@ -84,22 +90,26 @@ nla_failure: return err; } -static size_t subflow_get_info_size(const struct sock *sk) +static size_t subflow_get_info_size(const struct sock *sk, bool net_admin) { size_t size = 0; size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ - nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ - nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ 0; + + if (net_admin) + size += nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ + nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ + 0; + return size; } diff --git 
a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index a29ff901df75..b9e451197902 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -40,17 +40,17 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf tp->copied_seq += skb->len; subflow->ssn_offset += skb->len; - /* initialize a dummy sequence number, we will update it at MPC - * completion, if needed - */ + /* Only the sequence delta is relevant */ MPTCP_SKB_CB(skb)->map_seq = -skb->len; MPTCP_SKB_CB(skb)->end_seq = 0; MPTCP_SKB_CB(skb)->offset = 0; MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + MPTCP_SKB_CB(skb)->cant_coalesce = 1; mptcp_data_lock(sk); + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); - mptcp_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); mptcp_sk(sk)->bytes_received += skb->len; @@ -58,22 +58,3 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf mptcp_data_unlock(sk); } - -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt) -{ - struct sock *sk = (struct sock *)msk; - struct sk_buff *skb; - - skb = skb_peek_tail(&sk->sk_receive_queue); - if (skb) { - WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); - pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk, - MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, - MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); - MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; - MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; - } - - pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq); -} diff --git a/net/mptcp/options.c b/net/mptcp/options.c index fd2de185bc93..421ced031289 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -432,7 +432,6 @@ static void clear_3rdack_retransmission(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, 
&icsk->icsk_delack_timer); - icsk->icsk_ack.timeout = 0; icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } @@ -651,6 +650,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; + struct mptcp_addr_info addr; bool echo; int len; @@ -659,7 +659,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; @@ -672,7 +672,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * else if (opts->suboptions & OPTION_MPTCP_DSS) return false; - len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); + len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; @@ -689,6 +689,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = 0; *size -= opt_size; } + opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 16c336c51940..18b19dbccbba 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -5,12 +5,390 @@ */ #define pr_fmt(fmt) "MPTCP: " fmt -#include <linux/kernel.h> -#include <net/mptcp.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> #include "protocol.h" - #include "mib.h" +#define ADD_ADDR_RETRANS_MAX 3 + +struct mptcp_pm_add_entry { + struct list_head list; + struct mptcp_addr_info addr; + u8 retrans_times; + struct timer_list add_timer; + struct mptcp_sock *sock; +}; + +static DEFINE_SPINLOCK(mptcp_pm_list_lock); +static 
LIST_HEAD(mptcp_pm_list); + +/* path manager helpers */ + +/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, + * otherwise allow any matching local/remote pair + */ +bool mptcp_pm_addr_families_match(const struct sock *sk, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *rem) +{ + bool mptcp_is_v4 = sk->sk_family == AF_INET; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); + bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); + + if (mptcp_is_v4) + return loc_is_v4 && rem_is_v4; + + if (ipv6_only_sock(sk)) + return !loc_is_v4 && !rem_is_v4; + + return loc_is_v4 == rem_is_v4; +#else + return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; +#endif +} + +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port) +{ + bool addr_equals = false; + + if (a->family == b->family) { + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); + } else if (a->family == AF_INET) { + if (ipv6_addr_v4mapped(&b->addr6)) + addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; + } else if (b->family == AF_INET) { + if (ipv6_addr_v4mapped(&a->addr6)) + addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; +#endif + } + + if (!addr_equals) + return false; + if (!use_port) + return true; + + return a->port == b->port; +} + +void mptcp_local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = htons(skc->skc_num); + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_rcv_saddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_rcv_saddr; +#endif +} + +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) 
+{ + addr->family = skc->skc_family; + addr->port = skc->skc_dport; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_daddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_daddr; +#endif +} + +static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote) +{ + struct mptcp_addr_info mpc_remote; + + mptcp_remote_address((struct sock_common *)msk, &mpc_remote); + return mptcp_addresses_equal(&mpc_remote, remote, remote->port); +} + +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + mptcp_local_address(skc, &cur); + if (mptcp_addresses_equal(&cur, saddr, saddr->port)) + return true; + } + + return false; +} + +static struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *entry; + + lockdep_assert_held(&msk->pm.lock); + + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (mptcp_addresses_equal(&entry->addr, addr, true)) + return entry; + } + + return NULL; +} + +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *entry; + + entry = mptcp_pm_del_add_timer(msk, addr, false); + kfree(entry); + return entry; +} + +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) +{ + struct mptcp_pm_add_entry *entry; + struct mptcp_addr_info saddr; + bool ret = false; + + mptcp_local_address((struct sock_common *)sk, &saddr); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { + ret = true; + goto out; + 
} + } + +out: + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +static void __mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + pr_debug("send ack for %s\n", + prio ? "mp_prio" : + (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); + + slow = lock_sock_fast(ssk); + if (prio) { + subflow->send_mp_prio = 1; + subflow->request_bkup = backup; + } + + __mptcp_subflow_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + +void mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + spin_unlock_bh(&msk->pm.lock); + __mptcp_pm_send_ack(msk, subflow, prio, backup); + spin_lock_bh(&msk->pm.lock); +} + +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *alt = NULL; + + msk_owned_by_me(msk); + lockdep_assert_held(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal(msk) && + !mptcp_pm_should_rm_signal(msk)) + return; + + mptcp_for_each_subflow(msk, subflow) { + if (__mptcp_subflow_active(subflow)) { + if (!subflow->stale) { + mptcp_pm_send_ack(msk, subflow, false, false); + return; + } + + if (!alt) + alt = subflow; + } + } + + if (alt) + mptcp_pm_send_ack(msk, alt, false, false); +} + +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) +{ + struct mptcp_subflow_context *subflow; + + pr_debug("bkup=%d\n", bkup); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_addr_info local, remote; + + mptcp_local_address((struct sock_common *)ssk, &local); + if (!mptcp_addresses_equal(&local, addr, addr->port)) + continue; + + if (rem && rem->family != AF_UNSPEC) { + mptcp_remote_address((struct sock_common *)ssk, &remote); + if (!mptcp_addresses_equal(&remote, rem, rem->port)) + continue; + } + + 
__mptcp_pm_send_ack(msk, subflow, true, bkup); + return 0; + } + + return -EINVAL; +} + +static void mptcp_pm_add_timer(struct timer_list *timer) +{ + struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); + struct mptcp_sock *msk = entry->sock; + struct sock *sk = (struct sock *)msk; + + pr_debug("msk=%p\n", msk); + + if (!msk) + return; + + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; + + if (!entry->addr.id) + return; + + if (mptcp_pm_should_add_signal_addr(msk)) { + sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); + goto out; + } + + spin_lock_bh(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal_addr(msk)) { + pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); + mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_add_addr_send_ack(msk); + entry->retrans_times++; + } + + if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + + spin_unlock_bh(&msk->pm.lock); + + if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) + mptcp_pm_subflow_established(msk); + +out: + __sock_put(sk); +} + +struct mptcp_pm_add_entry * +mptcp_pm_del_add_timer(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr, bool check_id) +{ + struct mptcp_pm_add_entry *entry; + struct sock *sk = (struct sock *)msk; + struct timer_list *add_timer = NULL; + + spin_lock_bh(&msk->pm.lock); + entry = mptcp_lookup_anno_list_by_saddr(msk, addr); + if (entry && (!check_id || entry->addr.id == addr->id)) { + entry->retrans_times = ADD_ADDR_RETRANS_MAX; + add_timer = &entry->add_timer; + } + if (!check_id && entry) + list_del(&entry->list); + spin_unlock_bh(&msk->pm.lock); + + /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ + if (add_timer) + sk_stop_timer_sync(sk, add_timer); + + return entry; +} + +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *add_entry = NULL; + struct sock *sk = 
(struct sock *)msk; + struct net *net = sock_net(sk); + + lockdep_assert_held(&msk->pm.lock); + + add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); + + if (add_entry) { + if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) + return false; + + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + return true; + } + + add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); + if (!add_entry) + return false; + + list_add(&add_entry->list, &msk->pm.anno_list); + + add_entry->addr = *addr; + add_entry->sock = msk; + add_entry->retrans_times = 0; + + timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + + return true; +} + +static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) +{ + struct mptcp_pm_add_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; + LIST_HEAD(free_list); + + pr_debug("msk=%p\n", msk); + + spin_lock_bh(&msk->pm.lock); + list_splice_init(&msk->pm.anno_list, &free_list); + spin_unlock_bh(&msk->pm.lock); + + list_for_each_entry_safe(entry, tmp, &free_list, list) { + sk_stop_timer_sync(sk, &entry->add_timer); + kfree(entry); + } +} + /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -56,7 +434,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); return 0; } @@ -138,13 +516,13 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) * be sure to serve this event only once. 
*/ if (READ_ONCE(pm->work_pending) && - !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) + !(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); - if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) + if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) announce = true; - msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); + pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); if (announce) @@ -230,7 +608,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } /* id0 should not have a different address */ - } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) || + } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); @@ -250,6 +628,9 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, pr_debug("msk=%p\n", msk); + if (!READ_ONCE(pm->work_pending)) + return; + spin_lock_bh(&pm->lock); if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) @@ -266,6 +647,80 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } +static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = (struct sock *)msk; + u8 i; + + pr_debug("%s rm_list_nr %d\n", + rm_type == MPTCP_MIB_RMADDR ? 
"address" : "subflow", rm_list->nr); + + msk_owned_by_me(msk); + + if (sk->sk_state == TCP_LISTEN) + return; + + if (!rm_list->nr) + return; + + if (list_empty(&msk->conn_list)) + return; + + for (i = 0; i < rm_list->nr; i++) { + u8 rm_id = rm_list->ids[i]; + bool removed = false; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 remote_id = READ_ONCE(subflow->remote_id); + int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + u8 id = subflow_get_local_id(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) + continue; + if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) + continue; + if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) + continue; + + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", + i, rm_id, id, remote_id, msk->mpc_endpoint_id); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, how); + removed |= subflow->request_join; + + /* the following takes care of updating the subflows counter */ + mptcp_close_ssk(sk, ssk, subflow); + spin_lock_bh(&msk->pm.lock); + + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } + + if (rm_type == MPTCP_MIB_RMADDR) { + __MPTCP_INC_STATS(sock_net(sk), rm_type); + if (removed && mptcp_pm_is_kernel(msk)) + mptcp_pm_nl_rm_addr(msk, rm_id); + } + } +} + +static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) +{ + mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); +} + +void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) +{ + mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { @@ -321,8 +776,6 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) } } -/* path manager helpers */ - bool 
mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, @@ -402,7 +855,7 @@ out_unlock: int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { - struct mptcp_addr_info skc_local; + struct mptcp_pm_addr_entry skc_local = { 0 }; struct mptcp_addr_info msk_local; if (WARN_ON_ONCE(!msk)) @@ -412,10 +865,13 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) * addr */ mptcp_local_address((struct sock_common *)msk, &msk_local); - mptcp_local_address((struct sock_common *)skc, &skc_local); - if (mptcp_addresses_equal(&msk_local, &skc_local, false)) + mptcp_local_address((struct sock_common *)skc, &skc_local.addr); + if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false)) return 0; + skc_local.addr.id = 0; + skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_local_id(msk, &skc_local); return mptcp_pm_nl_get_local_id(msk, &skc_local); @@ -433,27 +889,41 @@ bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_is_backup(msk, &skc_local); } -int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info) -{ - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_get_addr(skb, info); - return mptcp_pm_nl_get_addr(skb, info); -} - -int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) +static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { - const struct genl_info *info = genl_info_dump(cb); - - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_dump_addr(msg, cb); - return mptcp_pm_nl_dump_addr(msg, cb); -} + struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = (struct sock *)msk; + unsigned int active_max_loss_cnt; + struct net *net = sock_net(sk); + unsigned int stale_loss_cnt; + bool slow; + + 
stale_loss_cnt = mptcp_stale_loss_cnt(net); + if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) + return; -int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info) -{ - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_set_flags(skb, info); - return mptcp_pm_nl_set_flags(skb, info); + /* look for another available subflow not in loss state */ + active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); + mptcp_for_each_subflow(msk, iter) { + if (iter != subflow && mptcp_subflow_active(iter) && + iter->stale_count < active_max_loss_cnt) { + /* we have some alternatives, try to mark this subflow as idle ...*/ + slow = lock_sock_fast(ssk); + if (!tcp_rtx_and_write_queues_empty(ssk)) { + subflow->stale = 1; + __mptcp_retransmit_pending_data(sk); + MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); + } + unlock_sock_fast(ssk, slow); + + /* always try to push the pending data regardless of re-injections: + * we can possibly use backup subflows now, and subflow selection + * is cheap under the msk socket lock + */ + __mptcp_push_pending(sk, 0); + return; + } + } } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -468,36 +938,44 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; - mptcp_pm_nl_subflow_chk_stale(msk, ssk); + mptcp_pm_subflows_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); } } -/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, - * otherwise allow any matching local/remote pair - */ -bool mptcp_pm_addr_families_match(const struct sock *sk, - const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *rem) +void mptcp_pm_worker(struct mptcp_sock *msk) { - bool mptcp_is_v4 = sk->sk_family == AF_INET; + struct mptcp_pm_data *pm = &msk->pm; -#if 
IS_ENABLED(CONFIG_MPTCP_IPV6) - bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); - bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); + msk_owned_by_me(msk); - if (mptcp_is_v4) - return loc_is_v4 && rem_is_v4; + if (!(pm->status & MPTCP_PM_WORK_MASK)) + return; - if (ipv6_only_sock(sk)) - return !loc_is_v4 && !rem_is_v4; + spin_lock_bh(&msk->pm.lock); - return loc_is_v4 == rem_is_v4; -#else - return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; -#endif + pr_debug("msk=%p status=%x\n", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_addr_send_ack(msk); + } + if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); + mptcp_pm_rm_addr_recv(msk); + } + __mptcp_pm_kernel_worker(msk); + + spin_unlock_bh(&msk->pm.lock); +} + +void mptcp_pm_destroy(struct mptcp_sock *msk) +{ + mptcp_pm_free_anno_list(msk); + + if (mptcp_pm_is_userspace(msk)) + mptcp_userspace_pm_free_local_addr_list(msk); } void mptcp_pm_data_reset(struct mptcp_sock *msk) @@ -505,10 +983,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); struct mptcp_pm_data *pm = &msk->pm; - pm->add_addr_signaled = 0; - pm->add_addr_accepted = 0; - pm->local_addr_used = 0; - pm->subflows = 0; + memset(&pm->reset, 0, sizeof(pm->reset)); pm->rm_list_tx.nr = 0; pm->rm_list_rx.nr = 0; WRITE_ONCE(pm->pm_type, pm_type); @@ -527,16 +1002,9 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); - } else { - WRITE_ONCE(pm->work_pending, 0); - WRITE_ONCE(pm->accept_addr, 0); - WRITE_ONCE(pm->accept_subflow, 0); - } - WRITE_ONCE(pm->addr_signal, 0); - WRITE_ONCE(pm->remote_deny_join_id0, false); - pm->status = 0; - bitmap_fill(msk->pm.id_avail_bitmap, 
MPTCP_PM_MAX_ADDR_ID + 1); + bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + } } void mptcp_pm_data_init(struct mptcp_sock *msk) @@ -549,5 +1017,75 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) void __init mptcp_pm_init(void) { + mptcp_pm_kernel_register(); + mptcp_pm_userspace_register(); mptcp_pm_nl_init(); } + +/* Must be called with rcu read lock held */ +struct mptcp_pm_ops *mptcp_pm_find(const char *name) +{ + struct mptcp_pm_ops *pm_ops; + + list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { + if (!strcmp(pm_ops->name, name)) + return pm_ops; + } + + return NULL; +} + +int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops) +{ + return 0; +} + +int mptcp_pm_register(struct mptcp_pm_ops *pm_ops) +{ + int ret; + + ret = mptcp_pm_validate(pm_ops); + if (ret) + return ret; + + spin_lock(&mptcp_pm_list_lock); + if (mptcp_pm_find(pm_ops->name)) { + spin_unlock(&mptcp_pm_list_lock); + return -EEXIST; + } + list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list); + spin_unlock(&mptcp_pm_list_lock); + + pr_debug("%s registered\n", pm_ops->name); + return 0; +} + +void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops) +{ + /* skip unregistering the default path manager */ + if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel)) + return; + + spin_lock(&mptcp_pm_list_lock); + list_del_rcu(&pm_ops->list); + spin_unlock(&mptcp_pm_list_lock); +} + +/* Build string with list of available path manager values. + * Similar to tcp_get_available_congestion_control() + */ +void mptcp_pm_get_available(char *buf, size_t maxlen) +{ + struct mptcp_pm_ops *pm_ops; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { + offs += snprintf(buf + offs, maxlen - offs, "%s%s", + offs == 0 ? 
"" : " ", pm_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; + } + rcu_read_unlock(); +} diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c new file mode 100644 index 000000000000..d39e7c178460 --- /dev/null +++ b/net/mptcp/pm_kernel.c @@ -0,0 +1,1412 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2025, Matthieu Baerts. + */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <net/netns/generic.h> + +#include "protocol.h" +#include "mib.h" +#include "mptcp_pm_gen.h" + +static int pm_nl_pernet_id; + +struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; + struct list_head local_addr_list; + unsigned int addrs; + unsigned int stale_loss_cnt; + unsigned int add_addr_signal_max; + unsigned int add_addr_accept_max; + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); +}; + +#define MPTCP_PM_ADDR_MAX 8 + +static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) +{ + return net_generic(net, pm_nl_pernet_id); +} + +static struct pm_nl_pernet * +pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) +{ + return pm_nl_get_pernet(sock_net((struct sock *)msk)); +} + +static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) +{ + return pm_nl_get_pernet(genl_info_net(info)); +} + +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +{ + const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->add_addr_signal_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); + +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->add_addr_accept_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); + +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = 
pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->subflows_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); + +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->local_addr_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); + +static bool lookup_subflow_by_daddr(const struct list_head *list, + const struct mptcp_addr_info *daddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + + list_for_each_entry(subflow, list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!((1 << inet_sk_state_load(ssk)) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) + continue; + + mptcp_remote_address((struct sock_common *)ssk, &cur); + if (mptcp_addresses_equal(&cur, daddr, daddr->port)) + return true; + } + + return false; +} + +static bool +select_local_address(const struct pm_nl_pernet *pernet, + const struct mptcp_sock *msk, + struct mptcp_pm_local *new_local) +{ + struct mptcp_pm_addr_entry *entry; + bool found = false; + + msk_owned_by_me(msk); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + + new_local->addr = entry->addr; + new_local->flags = entry->flags; + new_local->ifindex = entry->ifindex; + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +static bool +select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, + struct mptcp_pm_local *new_local) +{ + struct mptcp_pm_addr_entry *entry; + bool found = false; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal + * the address list in order. + * Note: removal from the local address list during the msk life-cycle + * can lead to additional addresses not being announced. 
+ */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; + + new_local->addr = entry->addr; + new_local->flags = entry->flags; + new_local->ifindex = entry->ifindex; + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + bool fullmesh, + struct mptcp_addr_info *addrs) +{ + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info remote = { 0 }; + unsigned int subflows_max; + int i = 0; + + subflows_max = mptcp_pm_get_subflows_max(msk); + mptcp_remote_address((struct sock_common *)sk, &remote); + + /* Non-fullmesh endpoint, fill in the single entry + * corresponding to the primary MPC subflow remote address + */ + if (!fullmesh) { + if (deny_id0) + return 0; + + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; + + msk->pm.subflows++; + addrs[i++] = remote; + } else { + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + + /* Forbid creation of new subflows matching existing + * ones, possibly already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = READ_ONCE(subflow->remote_id); + if (deny_id0 && !addrs[i].id) + continue; + + if (test_bit(addrs[i].id, unavail_id)) + continue; + + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; 
+ + if (msk->pm.subflows < subflows_max) { + /* forbid creating multiple address towards + * this id + */ + __set_bit(addrs[i].id, unavail_id); + msk->pm.subflows++; + i++; + } + } + } + + return i; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { + if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) + return entry; + } + return NULL; +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + unsigned int add_addr_signal_max; + bool signal_and_subflow = false; + unsigned int local_addr_max; + struct pm_nl_pernet *pernet; + struct mptcp_pm_local local; + unsigned int subflows_max; + + pernet = pm_nl_get_pernet(sock_net(sk)); + + add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + local_addr_max = mptcp_pm_get_local_addr_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + /* do lazy endpoint usage accounting for the MPC subflows */ + if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + bool backup = false; + + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = 
!!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + } + rcu_read_unlock(); + + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); + } + + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, local_addr_max, + msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.subflows, subflows_max); + + /* check first for announce */ + if (msk->pm.add_addr_signaled < add_addr_signal_max) { + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + + if (!select_signal_address(pernet, msk, &local)) + goto subflow; + + /* If the alloc fails, we are on memory pressure, not worth + * continuing, and trying to create subflows. 
+ */ + if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) + return; + + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled++; + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + + mptcp_pm_announce_addr(msk, &local.addr, false); + mptcp_pm_addr_send_ack(msk); + + if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = true; + } + +subflow: + /* check if should create a new subflow */ + while (msk->pm.local_addr_used < local_addr_max && + msk->pm.subflows < subflows_max) { + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + bool fullmesh; + int i, nr; + + if (signal_and_subflow) + signal_and_subflow = false; + else if (!select_local_address(pernet, msk, &local)) + break; + + fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + else /* local_addr_used is not decr for ID 0 */ + msk->pm.local_addr_used++; + + nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); + if (nr == 0) + continue; + + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local, &addrs[i]); + spin_lock_bh(&msk->pm.lock); + } + mptcp_pm_nl_check_work_pending(msk); +} + +static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +/* Fill all the local addresses into the array addrs[], + * and return the array size. 
+ */ +static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + struct pm_nl_pernet *pernet; + unsigned int subflows_max; + int i = 0; + + pernet = pm_nl_get_pernet_from_msk(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (msk->pm.subflows < subflows_max) { + locals[i].addr = entry->addr; + locals[i].flags = entry->flags; + locals[i].ifindex = entry->ifindex; + + /* Special case for ID0: set the correct ID */ + if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) + locals[i].addr.id = 0; + + msk->pm.subflows++; + i++; + } + } + rcu_read_unlock(); + + /* If the array is empty, fill in the single + * 'IPADDRANY' local address + */ + if (!i) { + memset(&locals[i], 0, sizeof(locals[i])); + locals[i].addr.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? 
AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) + return 0; + + msk->pm.subflows++; + i++; + } + + return i; +} + +static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +{ + struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; + struct sock *sk = (struct sock *)msk; + unsigned int add_addr_accept_max; + struct mptcp_addr_info remote; + unsigned int subflows_max; + bool sf_created = false; + int i, nr; + + add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + pr_debug("accepted %d:%d remote family %d\n", + msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.remote.family); + + remote = msk->pm.remote; + mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_addr_send_ack(msk); + + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) + return; + + /* pick id 0 port, if none is provided the remote address */ + if (!remote.port) + remote.port = sk->sk_dport; + + /* connect to the specified remote address, using whatever + * local address the routing configuration will pick. + */ + nr = fill_local_addresses_vec(msk, &remote, locals); + if (nr == 0) + return; + + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) + sf_created = true; + spin_lock_bh(&msk->pm.lock); + + if (sf_created) { + /* add_addr_accepted is not decr for ID 0 */ + if (remote.id) + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + } +} + +/* For a non-zero rm_id, decrement add_addr_accepted and re-enable accept_addr + * once the counter is below the accept limit. If the counter is already zero, + * only warn once: decrementing it would underflow. + */ +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) +{ + if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + /* Note: if the subflow has been closed before, this + * add_addr_accepted counter will not be decremented.
+ */ + if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + WRITE_ONCE(msk->pm.accept_addr, true); + } +} + +static bool address_use_port(struct mptcp_pm_addr_entry *entry) +{ + return (entry->flags & + (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == + MPTCP_PM_ADDR_FLAG_SIGNAL; +} + +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + +static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + struct mptcp_pm_addr_entry *entry, + bool needs_id, bool replace) +{ + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; + unsigned int addr_max; + int ret = -EINVAL; + + spin_lock_bh(&pernet->lock); + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ + if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) + pernet->next_id = 1; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + ret = -ERANGE; + goto out; + } + if (test_bit(entry->addr.id, pernet->id_bitmap)) { + ret = -EBUSY; + goto out; + } + + /* do not insert duplicate address, differentiate on port only + * singled addresses + */ + if (!address_use_port(entry)) + entry->addr.port = 0; + list_for_each_entry(cur, &pernet->local_addr_list, list) { + if (mptcp_addresses_equal(&cur->addr, &entry->addr, + cur->addr.port || entry->addr.port)) { + /* allow replacing the exiting endpoint only if such + * endpoint is an implicit one and the user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { + ret = -EEXIST; + goto out; + } + if (entry->addr.id) + goto out; + + /* allow callers that only need to look up the local + * addr's id to skip replacement. This allows them to + * avoid calling synchronize_rcu in the packet recv + * path. 
+ */ + if (!replace) { + kfree(entry); + ret = cur->addr.id; + goto out; + } + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } + } + + if (!entry->addr.id && needs_id) { +find_next: + entry->addr.id = find_next_zero_bit(pernet->id_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, + pernet->next_id); + if (!entry->addr.id && pernet->next_id != 1) { + pernet->next_id = 1; + goto find_next; + } + } + + if (!entry->addr.id && needs_id) + goto out; + + __set_bit(entry->addr.id, pernet->id_bitmap); + if (entry->addr.id > pernet->next_id) + pernet->next_id = entry->addr.id; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + } + + pernet->addrs++; + if (!entry->addr.port) + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + else + list_add_rcu(&entry->list, &pernet->local_addr_list); + ret = entry->addr.id; + +out: + spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } + return ret; +} + +static struct lock_class_key mptcp_slock_keys[2]; +static struct lock_class_key mptcp_keys[2]; + +static int mptcp_pm_nl_create_listen_socket(struct sock *sk, + struct mptcp_pm_addr_entry *entry) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + int addrlen = sizeof(struct sockaddr_in); + struct sockaddr_storage addr; + struct sock *newsk, *ssk; + int backlog = 1024; + int err; + + err = sock_create_kern(sock_net(sk), entry->addr.family, + SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); + if (err) + return err; + + newsk = entry->lsk->sk; + if (!newsk) + return -EINVAL; + + /* The subflow socket lock is acquired in a nested to the msk one + * in several places, even by the TCP 
stack, and this msk is a kernel + * socket: lockdep complains. Instead of propagating the _nested + * modifiers in several places, re-init the lock class for the msk + * socket to an mptcp specific one. + */ + sock_lock_init_class_and_name(newsk, + is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", + &mptcp_slock_keys[is_ipv6], + is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", + &mptcp_keys[is_ipv6]); + + lock_sock(newsk); + ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); + release_sock(newsk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); + + mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + if (ssk->sk_family == AF_INET) + err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ssk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#endif + if (err) + return err; + + /* We don't use mptcp_set_state() here because it needs to be called + * under the msk socket lock. For the moment, that will not bring + * anything more than only calling inet_sk_state_store(), because the + * old status is known (TCP_CLOSE). + */ + inet_sk_state_store(newsk, TCP_LISTEN); + lock_sock(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); + err = __inet_listen_sk(ssk, backlog); + if (!err) + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); + release_sock(ssk); + return err; +} + +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc) +{ + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int ret; + + pernet = pm_nl_get_pernet_from_msk(msk); + + rcu_read_lock(); + entry = __lookup_addr(pernet, &skc->addr); + ret = entry ? 
entry->addr.id : -1; + rcu_read_unlock(); + if (ret >= 0) + return ret; + + /* address not found, add to local list */ + entry = kmemdup(skc, sizeof(*skc), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + entry->addr.port = 0; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); + if (ret < 0) + kfree(entry); + + return ret; +} + +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + struct mptcp_pm_addr_entry *entry; + bool backup; + + rcu_read_lock(); + entry = __lookup_addr(pernet, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + rcu_read_unlock(); + + return backup; +} + +static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_sock *msk; + long s_slot = 0, s_num = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info mpc_addr; + + if (!READ_ONCE(msk->fully_established) || + mptcp_pm_is_userspace(msk)) + goto next; + + /* if the endp linked to the init sf is re-added with a != ID */ + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) + msk->mpc_endpoint_id = addr->id; + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, + struct genl_info *info) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_address_nl_policy, info->extack) && + tb[MPTCP_PM_ADDR_ATTR_ID]) + return true; + return false; +} + +/* Add an MPTCP endpoint */ +int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct 
pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct nlattr *attr; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, true, &addr); + if (ret < 0) + return ret; + + if (addr.addr.port && !address_use_port(&addr)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal and not subflow when using port"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags mustn't have both signal and fullmesh"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "can't create IMPLICIT endpoint"); + return -EINVAL; + } + + entry = kmemdup(&addr, sizeof(addr), GFP_KERNEL_ACCOUNT); + if (!entry) { + GENL_SET_ERR_MSG(info, "can't allocate addr"); + return -ENOMEM; + } + + if (entry->addr.port) { + ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); + if (ret) { + GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); + goto out_free; + } + } + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, + !mptcp_pm_has_addr_attr_id(attr, info), + true); + if (ret < 0) { + GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); + goto out_free; + } + + mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); + return 0; + +out_free: + __mptcp_pm_release_addr_entry(entry); + return ret; +} + +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + return msk->mpc_endpoint_id == addr->id ? 
0 : addr->id; +} + +static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr, + bool force) +{ + struct mptcp_rm_list list = { .nr = 0 }; + bool ret; + + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); + + ret = mptcp_remove_anno_list_by_saddr(msk, addr); + if (ret || force) { + spin_lock_bh(&msk->pm.lock); + if (ret) { + __set_bit(addr->id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled--; + } + mptcp_pm_remove_addr(msk, &list); + spin_unlock_bh(&msk->pm.lock); + } + return ret; +} + +static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) +{ + /* If it was marked as used, and not ID 0, decrement local_addr_used */ + if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && + id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) + msk->pm.local_addr_used--; +} + +static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, + const struct mptcp_pm_addr_entry *entry) +{ + const struct mptcp_addr_info *addr = &entry->addr; + struct mptcp_rm_list list = { .nr = 1 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + pr_debug("remove_id=%d\n", addr->id); + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + bool remove_subflow; + + if (mptcp_pm_is_userspace(msk)) + goto next; + + lock_sock(sk); + remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); + + list.ids[0] = mptcp_endp_get_local_id(msk, addr); + if (remove_subflow) { + spin_lock_bh(&msk->pm.lock); + mptcp_pm_rm_subflow(msk, &list); + spin_unlock_bh(&msk->pm.lock); + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + spin_lock_bh(&msk->pm.lock); + __mark_subflow_endp_available(msk, list.ids[0]); + spin_unlock_bh(&msk->pm.lock); + } + + if (msk->mpc_endpoint_id == entry->addr.id) + msk->mpc_endpoint_id = 0; + 
release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +static int mptcp_nl_remove_id_zero_address(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + list.ids[list.nr++] = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info msk_local; + + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) + goto next; + + mptcp_local_address((struct sock_common *)msk, &msk_local); + if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &list); + mptcp_pm_rm_subflow(msk, &list); + __mark_subflow_endp_available(msk, 0); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +/* Remove an MPTCP endpoint */ +int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + unsigned int addr_max; + struct nlattr *attr; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + return ret; + + /* the zero id address is special: the first address used by the msk + * always gets such an id, so different subflows can have different zero + * id addresses. Additionally zero id is not accounted for in id_bitmap. + * Let's use an 'mptcp_rm_list' instead of the common remove code. 
+ */ + if (addr.addr.id == 0) + return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + spin_unlock_bh(&pernet->lock); + return -EINVAL; + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + } + + pernet->addrs--; + list_del_rcu(&entry->list); + __clear_bit(entry->addr.id, pernet->id_bitmap); + spin_unlock_bh(&pernet->lock); + + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); + synchronize_rcu(); + __mptcp_pm_release_addr_entry(entry); + + return ret; +} + +static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, rm_list, list) { + if (slist.nr < MPTCP_RM_IDS_MAX && + mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); + + if (alist.nr < MPTCP_RM_IDS_MAX && + mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); + } + + spin_lock_bh(&msk->pm.lock); + if (alist.nr) { + msk->pm.add_addr_signaled -= alist.nr; + mptcp_pm_remove_addr(msk, &alist); + } + if (slist.nr) + mptcp_pm_rm_subflow(msk, &slist); + /* Reset counters: maybe some subflows have been removed before */ + bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + msk->pm.local_addr_used = 0; + spin_unlock_bh(&msk->pm.lock); +} + +static void mptcp_nl_flush_addrs_list(struct net *net, + struct list_head *rm_list) +{ + long s_slot = 0, 
s_num = 0; + struct mptcp_sock *msk; + + if (list_empty(rm_list)) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (!mptcp_pm_is_userspace(msk)) { + lock_sock(sk); + mptcp_pm_flush_addrs_and_subflows(msk, rm_list); + release_sock(sk); + } + + sock_put(sk); + cond_resched(); + } +} + +/* caller must ensure the RCU grace period is already elapsed */ +static void __flush_addrs(struct list_head *list) +{ + while (!list_empty(list)) { + struct mptcp_pm_addr_entry *cur; + + cur = list_entry(list->next, + struct mptcp_pm_addr_entry, list); + list_del_rcu(&cur->list); + __mptcp_pm_release_addr_entry(cur); + } +} + +/* Reset the per-netns counters that are derived from the endpoint list. + * add_addr_accept_max is a limit set via mptcp_pm_nl_set_limits_doit(), not + * derived from the endpoints, so it is left untouched, like subflows_max. + */ +static void __reset_counters(struct pm_nl_pernet *pernet) +{ + WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->local_addr_max, 0); + pernet->addrs = 0; +} + +int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); + + spin_lock_bh(&pernet->lock); + list_splice_init(&pernet->local_addr_list, &free_list); + __reset_counters(pernet); + pernet->next_id = 1; + bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + spin_unlock_bh(&pernet->lock); + mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); + synchronize_rcu(); + __flush_addrs(&free_list); + return 0; +} + +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry *entry; + int ret = -EINVAL; + + rcu_read_lock(); + entry = __lookup_addr_by_id(pernet, id); + if (entry) { + *addr = *entry; + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + +int mptcp_pm_nl_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int
id = cb->args[0]; + int i; + + pernet = pm_nl_get_pernet(net); + + rcu_read_lock(); + for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { + if (test_bit(i, pernet->id_bitmap)) { + entry = __lookup_addr_by_id(pernet, i); + if (!entry) + break; + + if (entry->addr.id <= id) + continue; + + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) + break; + + id = entry->addr.id; + } + } + rcu_read_unlock(); + + cb->args[0] = id; + return msg->len; +} + +static int parse_limit(struct genl_info *info, int id, unsigned int *limit) +{ + struct nlattr *attr = info->attrs[id]; + + if (!attr) + return 0; + + *limit = nla_get_u32(attr); + if (*limit > MPTCP_PM_ADDR_MAX) { + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "limit greater than maximum (%u)", + MPTCP_PM_ADDR_MAX); + return -EINVAL; + } + return 0; +} + +int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + unsigned int rcv_addrs, subflows; + int ret; + + spin_lock_bh(&pernet->lock); + rcv_addrs = pernet->add_addr_accept_max; + ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); + if (ret) + goto unlock; + + subflows = pernet->subflows_max; + ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); + if (ret) + goto unlock; + + WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->subflows_max, subflows); + +unlock: + spin_unlock_bh(&pernet->lock); + return ret; +} + +int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct sk_buff *msg; + void *reply; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + MPTCP_PM_CMD_GET_LIMITS); + if (!reply) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + READ_ONCE(pernet->add_addr_accept_max))) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, + 
READ_ONCE(pernet->subflows_max))) + goto fail; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +fail: + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + nlmsg_free(msg); + return -EMSGSIZE; +} + +static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); + + spin_lock_bh(&msk->pm.lock); + mptcp_pm_rm_subflow(msk, &list); + __mark_subflow_endp_available(msk, list.ids[0]); + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); +} + +static void mptcp_pm_nl_set_flags_all(struct net *net, + struct mptcp_pm_addr_entry *local, + u8 changed) +{ + u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); + u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) + goto next; + + lock_sock(sk); + if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) + mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup); + /* Subflows will only be recreated if the SUBFLOW flag is set */ + if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) + mptcp_pm_nl_fullmesh(msk, &local->addr); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } +} + +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | + MPTCP_PM_ADDR_FLAG_FULLMESH; + struct net *net = genl_info_net(info); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + u8 lookup_by_id = 0; + + pernet = pm_nl_get_pernet(net); + + if (local->addr.family == AF_UNSPEC) { + 
lookup_by_id = 1; + if (!local->addr.id) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address ID"); + return -EOPNOTSUPP; + } + } + + spin_lock_bh(&pernet->lock); + entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) : + __lookup_addr(pernet, &local->addr); + if (!entry) { + spin_unlock_bh(&pernet->lock); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + return -EINVAL; + } + if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | + MPTCP_PM_ADDR_FLAG_IMPLICIT))) { + spin_unlock_bh(&pernet->lock); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags"); + return -EINVAL; + } + + changed = (local->flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (local->flags & mask); + *local = *entry; + spin_unlock_bh(&pernet->lock); + + mptcp_pm_nl_set_flags_all(net, local, changed); + return 0; +} + +bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { + WRITE_ONCE(msk->pm.work_pending, false); + return false; + } + return true; +} + +/* Called under PM lock */ +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } +} + +static int __net_init pm_nl_init_net(struct net *net) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + + 
INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + + /* Cit. 2 subflows ought to be enough for anybody. */ + pernet->subflows_max = 2; + pernet->next_id = 1; + pernet->stale_loss_cnt = 4; + spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. + */ + + return 0; +} + +static void __net_exit pm_nl_exit_net(struct list_head *net_list) +{ + struct net *net; + + list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + + /* net is removed from namespace list, can't race with + * other modifiers, also netns core already waited for a + * RCU grace period. + */ + __flush_addrs(&pernet->local_addr_list); + } +} + +static struct pernet_operations mptcp_pm_pernet_ops = { + .init = pm_nl_init_net, + .exit_batch = pm_nl_exit_net, + .id = &pm_nl_pernet_id, + .size = sizeof(struct pm_nl_pernet), +}; + +struct mptcp_pm_ops mptcp_pm_kernel = { + .name = "kernel", + .owner = THIS_MODULE, +}; + +void __init mptcp_pm_kernel_register(void) +{ + if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) + panic("Failed to register MPTCP PM pernet subsystem.\n"); + + mptcp_pm_register(&mptcp_pm_kernel); +} diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7868207c4e9d..50aaf259959a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -6,1197 +6,9 @@ #define pr_fmt(fmt) "MPTCP: " fmt -#include <linux/inet.h> -#include <linux/kernel.h> -#include <net/inet_common.h> -#include <net/netns/generic.h> -#include <net/mptcp.h> - #include "protocol.h" -#include "mib.h" #include "mptcp_pm_gen.h" -static int pm_nl_pernet_id; - -struct mptcp_pm_add_entry { - struct list_head list; - struct mptcp_addr_info addr; - u8 retrans_times; - struct timer_list add_timer; - struct mptcp_sock *sock; -}; - -struct pm_nl_pernet { - /* protects pernet updates */ - spinlock_t lock; - struct list_head local_addr_list; - unsigned int addrs; - unsigned int stale_loss_cnt; - 
unsigned int add_addr_signal_max; - unsigned int add_addr_accept_max; - unsigned int local_addr_max; - unsigned int subflows_max; - unsigned int next_id; - DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); -}; - -#define MPTCP_PM_ADDR_MAX 8 -#define ADD_ADDR_RETRANS_MAX 3 - -static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) -{ - return net_generic(net, pm_nl_pernet_id); -} - -static struct pm_nl_pernet * -pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) -{ - return pm_nl_get_pernet(sock_net((struct sock *)msk)); -} - -bool mptcp_addresses_equal(const struct mptcp_addr_info *a, - const struct mptcp_addr_info *b, bool use_port) -{ - bool addr_equals = false; - - if (a->family == b->family) { - if (a->family == AF_INET) - addr_equals = a->addr.s_addr == b->addr.s_addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else - addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); - } else if (a->family == AF_INET) { - if (ipv6_addr_v4mapped(&b->addr6)) - addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; - } else if (b->family == AF_INET) { - if (ipv6_addr_v4mapped(&a->addr6)) - addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; -#endif - } - - if (!addr_equals) - return false; - if (!use_port) - return true; - - return a->port == b->port; -} - -void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) -{ - addr->family = skc->skc_family; - addr->port = htons(skc->skc_num); - if (addr->family == AF_INET) - addr->addr.s_addr = skc->skc_rcv_saddr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (addr->family == AF_INET6) - addr->addr6 = skc->skc_v6_rcv_saddr; -#endif -} - -static void remote_address(const struct sock_common *skc, - struct mptcp_addr_info *addr) -{ - addr->family = skc->skc_family; - addr->port = skc->skc_dport; - if (addr->family == AF_INET) - addr->addr.s_addr = skc->skc_daddr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (addr->family == AF_INET6) - addr->addr6 = skc->skc_v6_daddr; -#endif -} - -bool 
mptcp_lookup_subflow_by_saddr(const struct list_head *list, - const struct mptcp_addr_info *saddr) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info cur; - struct sock_common *skc; - - list_for_each_entry(subflow, list, node) { - skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); - - mptcp_local_address(skc, &cur); - if (mptcp_addresses_equal(&cur, saddr, saddr->port)) - return true; - } - - return false; -} - -static bool lookup_subflow_by_daddr(const struct list_head *list, - const struct mptcp_addr_info *daddr) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info cur; - - list_for_each_entry(subflow, list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - if (!((1 << inet_sk_state_load(ssk)) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) - continue; - - remote_address((struct sock_common *)ssk, &cur); - if (mptcp_addresses_equal(&cur, daddr, daddr->port)) - return true; - } - - return false; -} - -static bool -select_local_address(const struct pm_nl_pernet *pernet, - const struct mptcp_sock *msk, - struct mptcp_pm_local *new_local) -{ - struct mptcp_pm_addr_entry *entry; - bool found = false; - - msk_owned_by_me(msk); - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) - continue; - - if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - continue; - - new_local->addr = entry->addr; - new_local->flags = entry->flags; - new_local->ifindex = entry->ifindex; - found = true; - break; - } - rcu_read_unlock(); - - return found; -} - -static bool -select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, - struct mptcp_pm_local *new_local) -{ - struct mptcp_pm_addr_entry *entry; - bool found = false; - - rcu_read_lock(); - /* do not keep any additional per socket state, just signal - * the address list in order. 
- * Note: removal from the local address list during the msk life-cycle - * can lead to additional addresses not being announced. - */ - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - continue; - - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) - continue; - - new_local->addr = entry->addr; - new_local->flags = entry->flags; - new_local->ifindex = entry->ifindex; - found = true; - break; - } - rcu_read_unlock(); - - return found; -} - -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) -{ - const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->add_addr_signal_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); - -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->add_addr_accept_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); - -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->subflows_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); - -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - -bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || - (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, - MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { - WRITE_ONCE(msk->pm.work_pending, false); - return false; - } - return true; -} - -struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, - 
const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *entry; - - lockdep_assert_held(&msk->pm.lock); - - list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (mptcp_addresses_equal(&entry->addr, addr, true)) - return entry; - } - - return NULL; -} - -bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) -{ - struct mptcp_pm_add_entry *entry; - struct mptcp_addr_info saddr; - bool ret = false; - - mptcp_local_address((struct sock_common *)sk, &saddr); - - spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { - ret = true; - goto out; - } - } - -out: - spin_unlock_bh(&msk->pm.lock); - return ret; -} - -static void mptcp_pm_add_timer(struct timer_list *timer) -{ - struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); - struct mptcp_sock *msk = entry->sock; - struct sock *sk = (struct sock *)msk; - - pr_debug("msk=%p\n", msk); - - if (!msk) - return; - - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - if (!entry->addr.id) - return; - - if (mptcp_pm_should_add_signal_addr(msk)) { - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); - goto out; - } - - spin_lock_bh(&msk->pm.lock); - - if (!mptcp_pm_should_add_signal_addr(msk)) { - pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false); - mptcp_pm_add_addr_send_ack(msk); - entry->retrans_times++; - } - - if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); - - spin_unlock_bh(&msk->pm.lock); - - if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) - mptcp_pm_subflow_established(msk); - -out: - __sock_put(sk); -} - -struct mptcp_pm_add_entry * -mptcp_pm_del_add_timer(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr, bool check_id) -{ - struct mptcp_pm_add_entry *entry; - struct sock *sk = (struct sock *)msk; - struct 
timer_list *add_timer = NULL; - - spin_lock_bh(&msk->pm.lock); - entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - if (entry && (!check_id || entry->addr.id == addr->id)) { - entry->retrans_times = ADD_ADDR_RETRANS_MAX; - add_timer = &entry->add_timer; - } - if (!check_id && entry) - list_del(&entry->list); - spin_unlock_bh(&msk->pm.lock); - - /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ - if (add_timer) - sk_stop_timer_sync(sk, add_timer); - - return entry; -} - -bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *add_entry = NULL; - struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); - - lockdep_assert_held(&msk->pm.lock); - - add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - - if (add_entry) { - if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) - return false; - - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; - } - - add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); - if (!add_entry) - return false; - - list_add(&add_entry->list, &msk->pm.anno_list); - - add_entry->addr = *addr; - add_entry->sock = msk; - add_entry->retrans_times = 0; - - timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - - return true; -} - -void mptcp_pm_free_anno_list(struct mptcp_sock *msk) -{ - struct mptcp_pm_add_entry *entry, *tmp; - struct sock *sk = (struct sock *)msk; - LIST_HEAD(free_list); - - pr_debug("msk=%p\n", msk); - - spin_lock_bh(&msk->pm.lock); - list_splice_init(&msk->pm.anno_list, &free_list); - spin_unlock_bh(&msk->pm.lock); - - list_for_each_entry_safe(entry, tmp, &free_list, list) { - sk_stop_timer_sync(sk, &entry->add_timer); - kfree(entry); - } -} - -/* Fill all the remote addresses into the array addrs[], - * and return the array size. 
- */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *local, - bool fullmesh, - struct mptcp_addr_info *addrs) -{ - bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); - struct sock *sk = (struct sock *)msk, *ssk; - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info remote = { 0 }; - unsigned int subflows_max; - int i = 0; - - subflows_max = mptcp_pm_get_subflows_max(msk); - remote_address((struct sock_common *)sk, &remote); - - /* Non-fullmesh endpoint, fill in the single entry - * corresponding to the primary MPC subflow remote address - */ - if (!fullmesh) { - if (deny_id0) - return 0; - - if (!mptcp_pm_addr_families_match(sk, local, &remote)) - return 0; - - msk->pm.subflows++; - addrs[i++] = remote; - } else { - DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - - /* Forbid creation of new subflows matching existing - * ones, possibly already created by incoming ADD_ADDR - */ - bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - mptcp_for_each_subflow(msk, subflow) - if (READ_ONCE(subflow->local_id) == local->id) - __set_bit(subflow->remote_id, unavail_id); - - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = READ_ONCE(subflow->remote_id); - if (deny_id0 && !addrs[i].id) - continue; - - if (test_bit(addrs[i].id, unavail_id)) - continue; - - if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) - continue; - - if (msk->pm.subflows < subflows_max) { - /* forbid creating multiple address towards - * this id - */ - __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; - i++; - } - } - } - - return i; -} - -static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - bool prio, bool backup) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; - - pr_debug("send ack for %s\n", - prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? 
"add_addr" : "rm_addr")); - - slow = lock_sock_fast(ssk); - if (prio) { - subflow->send_mp_prio = 1; - subflow->request_bkup = backup; - } - - __mptcp_subflow_send_ack(ssk); - unlock_sock_fast(ssk, slow); -} - -static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - bool prio, bool backup) -{ - spin_unlock_bh(&msk->pm.lock); - __mptcp_pm_send_ack(msk, subflow, prio, backup); - spin_lock_bh(&msk->pm.lock); -} - -static struct mptcp_pm_addr_entry * -__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, - lockdep_is_held(&pernet->lock)) { - if (entry->addr.id == id) - return entry; - } - return NULL; -} - -static struct mptcp_pm_addr_entry * -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, - lockdep_is_held(&pernet->lock)) { - if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) - return entry; - } - return NULL; -} - -static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_signal_max; - bool signal_and_subflow = false; - unsigned int local_addr_max; - struct pm_nl_pernet *pernet; - struct mptcp_pm_local local; - unsigned int subflows_max; - - pernet = pm_nl_get_pernet(sock_net(sk)); - - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - /* do lazy endpoint usage accounting for the MPC subflows */ - if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - bool backup = false; - - 
mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); - rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr); - if (entry) { - __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); - msk->mpc_endpoint_id = entry->addr.id; - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - } - rcu_read_unlock(); - - if (backup) - mptcp_pm_send_ack(msk, subflow, true, backup); - - msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); - } - - pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); - - /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { - /* due to racing events on both ends we can reach here while - * previous add address is still running: if we invoke now - * mptcp_pm_announce_addr(), that will fail and the - * corresponding id will be marked as used. - * Instead let the PM machinery reschedule us when the - * current address announce will be completed. - */ - if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) - return; - - if (!select_signal_address(pernet, msk, &local)) - goto subflow; - - /* If the alloc fails, we are on memory pressure, not worth - * continuing, and trying to create subflows. 
- */ - if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) - return; - - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled++; - - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - - mptcp_pm_announce_addr(msk, &local.addr, false); - mptcp_pm_nl_addr_send_ack(msk); - - if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - signal_and_subflow = true; - } - -subflow: - /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { - struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; - bool fullmesh; - int i, nr; - - if (signal_and_subflow) - signal_and_subflow = false; - else if (!select_local_address(pernet, msk, &local)) - break; - - fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); - - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - else /* local_addr_used is not decr for ID 0 */ - msk->pm.local_addr_used++; - - nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); - if (nr == 0) - continue; - - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &local, &addrs[i]); - spin_lock_bh(&msk->pm.lock); - } - mptcp_pm_nl_check_work_pending(msk); -} - -static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) -{ - mptcp_pm_create_subflow_or_signal_addr(msk); -} - -static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) -{ - mptcp_pm_create_subflow_or_signal_addr(msk); -} - -/* Fill all the local addresses into the array addrs[], - * and return the array size. 
- */ -static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *remote, - struct mptcp_pm_local *locals) -{ - struct sock *sk = (struct sock *)msk; - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - struct pm_nl_pernet *pernet; - unsigned int subflows_max; - int i = 0; - - pernet = pm_nl_get_pernet_from_msk(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) - continue; - - if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) - continue; - - if (msk->pm.subflows < subflows_max) { - locals[i].addr = entry->addr; - locals[i].flags = entry->flags; - locals[i].ifindex = entry->ifindex; - - /* Special case for ID0: set the correct ID */ - if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) - locals[i].addr.id = 0; - - msk->pm.subflows++; - i++; - } - } - rcu_read_unlock(); - - /* If the array is empty, fill in the single - * 'IPADDRANY' local address - */ - if (!i) { - memset(&locals[i], 0, sizeof(locals[i])); - locals[i].addr.family = -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - remote->family == AF_INET6 && - ipv6_addr_v4mapped(&remote->addr6) ? 
AF_INET : -#endif - remote->family; - - if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) - return 0; - - msk->pm.subflows++; - i++; - } - - return i; -} - -static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) -{ - struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_accept_max; - struct mptcp_addr_info remote; - unsigned int subflows_max; - bool sf_created = false; - int i, nr; - - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, - msk->pm.remote.family); - - remote = msk->pm.remote; - mptcp_pm_announce_addr(msk, &remote, true); - mptcp_pm_nl_addr_send_ack(msk); - - if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) - return; - - /* pick id 0 port, if none is provided the remote address */ - if (!remote.port) - remote.port = sk->sk_dport; - - /* connect to the specified remote address, using whatever - * local address the routing configuration will pick. 
- */ - nr = fill_local_addresses_vec(msk, &remote, locals); - if (nr == 0) - return; - - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) - sf_created = true; - spin_lock_bh(&msk->pm.lock); - - if (sf_created) { - /* add_addr_accepted is not decr for ID 0 */ - if (remote.id) - msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - } -} - -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote) -{ - struct mptcp_addr_info mpc_remote; - - remote_address((struct sock_common *)msk, &mpc_remote); - return mptcp_addresses_equal(&mpc_remote, remote, remote->port); -} - -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *alt = NULL; - - msk_owned_by_me(msk); - lockdep_assert_held(&msk->pm.lock); - - if (!mptcp_pm_should_add_signal(msk) && - !mptcp_pm_should_rm_signal(msk)) - return; - - mptcp_for_each_subflow(msk, subflow) { - if (__mptcp_subflow_active(subflow)) { - if (!subflow->stale) { - mptcp_pm_send_ack(msk, subflow, false, false); - return; - } - - if (!alt) - alt = subflow; - } - } - - if (alt) - mptcp_pm_send_ack(msk, alt, false, false); -} - -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup) -{ - struct mptcp_subflow_context *subflow; - - pr_debug("bkup=%d\n", bkup); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct mptcp_addr_info local, remote; - - mptcp_local_address((struct sock_common *)ssk, &local); - if (!mptcp_addresses_equal(&local, addr, addr->port)) - continue; - - if (rem && rem->family != AF_UNSPEC) { - remote_address((struct sock_common *)ssk, &remote); - if (!mptcp_addresses_equal(&remote, rem, rem->port)) - continue; - } - - 
__mptcp_pm_send_ack(msk, subflow, true, bkup); - return 0; - } - - return -EINVAL; -} - -static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list, - enum linux_mptcp_mib_field rm_type) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - u8 i; - - pr_debug("%s rm_list_nr %d\n", - rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); - - msk_owned_by_me(msk); - - if (sk->sk_state == TCP_LISTEN) - return; - - if (!rm_list->nr) - return; - - if (list_empty(&msk->conn_list)) - return; - - for (i = 0; i < rm_list->nr; i++) { - u8 rm_id = rm_list->ids[i]; - bool removed = false; - - mptcp_for_each_subflow_safe(msk, subflow, tmp) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - u8 remote_id = READ_ONCE(subflow->remote_id); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - u8 id = subflow_get_local_id(subflow); - - if ((1 << inet_sk_state_load(ssk)) & - (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) - continue; - if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) - continue; - if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) - continue; - - pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", - rm_type == MPTCP_MIB_RMADDR ? 
"address" : "subflow", - i, rm_id, id, remote_id, msk->mpc_endpoint_id); - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - removed |= subflow->request_join; - - /* the following takes care of updating the subflows counter */ - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - if (rm_type == MPTCP_MIB_RMSUBFLOW) - __MPTCP_INC_STATS(sock_net(sk), rm_type); - } - - if (rm_type == MPTCP_MIB_RMADDR) - __MPTCP_INC_STATS(sock_net(sk), rm_type); - - if (!removed) - continue; - - if (!mptcp_pm_is_kernel(msk)) - continue; - - if (rm_type == MPTCP_MIB_RMADDR && rm_id && - !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { - /* Note: if the subflow has been closed before, this - * add_addr_accepted counter will not be decremented. - */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) - WRITE_ONCE(msk->pm.accept_addr, true); - } - } -} - -static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) -{ - mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); -} - -static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list) -{ - mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); -} - -void mptcp_pm_nl_work(struct mptcp_sock *msk) -{ - struct mptcp_pm_data *pm = &msk->pm; - - msk_owned_by_me(msk); - - if (!(pm->status & MPTCP_PM_WORK_MASK)) - return; - - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x\n", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_addr_send_ack(msk); - } - if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= 
~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); -} - -static bool address_use_port(struct mptcp_pm_addr_entry *entry) -{ - return (entry->flags & - (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == - MPTCP_PM_ADDR_FLAG_SIGNAL; -} - -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - -static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, - struct mptcp_pm_addr_entry *entry, - bool needs_id, bool replace) -{ - struct mptcp_pm_addr_entry *cur, *del_entry = NULL; - unsigned int addr_max; - int ret = -EINVAL; - - spin_lock_bh(&pernet->lock); - /* to keep the code simple, don't do IDR-like allocation for address ID, - * just bail when we exceed limits - */ - if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) - pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { - ret = -ERANGE; - goto out; - } - if (test_bit(entry->addr.id, pernet->id_bitmap)) { - ret = -EBUSY; - goto out; - } - - /* do not insert duplicate address, differentiate on port only - * singled addresses - */ - if (!address_use_port(entry)) - entry->addr.port = 0; - list_for_each_entry(cur, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&cur->addr, &entry->addr, - cur->addr.port || entry->addr.port)) { - /* allow replacing the exiting endpoint only if such - * endpoint is an implicit one and the user-space - * did not provide an endpoint id - */ - if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { - ret = -EEXIST; - goto out; - } - if (entry->addr.id) - goto out; - - /* allow callers that only need to look up the local - * addr's id to skip replacement. 
This allows them to - * avoid calling synchronize_rcu in the packet recv - * path. - */ - if (!replace) { - kfree(entry); - ret = cur->addr.id; - goto out; - } - - pernet->addrs--; - entry->addr.id = cur->addr.id; - list_del_rcu(&cur->list); - del_entry = cur; - break; - } - } - - if (!entry->addr.id && needs_id) { -find_next: - entry->addr.id = find_next_zero_bit(pernet->id_bitmap, - MPTCP_PM_MAX_ADDR_ID + 1, - pernet->next_id); - if (!entry->addr.id && pernet->next_id != 1) { - pernet->next_id = 1; - goto find_next; - } - } - - if (!entry->addr.id && needs_id) - goto out; - - __set_bit(entry->addr.id, pernet->id_bitmap); - if (entry->addr.id > pernet->next_id) - pernet->next_id = entry->addr.id; - - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); - } - - pernet->addrs++; - if (!entry->addr.port) - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); - else - list_add_rcu(&entry->list, &pernet->local_addr_list); - ret = entry->addr.id; - -out: - spin_unlock_bh(&pernet->lock); - - /* just replaced an existing entry, free it */ - if (del_entry) { - synchronize_rcu(); - __mptcp_pm_release_addr_entry(del_entry); - } - return ret; -} - -static struct lock_class_key mptcp_slock_keys[2]; -static struct lock_class_key mptcp_keys[2]; - -static int mptcp_pm_nl_create_listen_socket(struct sock *sk, - struct mptcp_pm_addr_entry *entry) -{ - bool is_ipv6 = sk->sk_family == AF_INET6; - int addrlen = sizeof(struct sockaddr_in); - struct sockaddr_storage addr; - struct sock *newsk, *ssk; - int backlog = 1024; - int err; - - err = sock_create_kern(sock_net(sk), entry->addr.family, - SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); - if (err) - return err; - - newsk = entry->lsk->sk; - if (!newsk) - return -EINVAL; - - /* The subflow socket 
lock is acquired in a nested to the msk one - * in several places, even by the TCP stack, and this msk is a kernel - * socket: lockdep complains. Instead of propagating the _nested - * modifiers in several places, re-init the lock class for the msk - * socket to an mptcp specific one. - */ - sock_lock_init_class_and_name(newsk, - is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", - &mptcp_slock_keys[is_ipv6], - is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", - &mptcp_keys[is_ipv6]); - - lock_sock(newsk); - ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); - release_sock(newsk); - if (IS_ERR(ssk)) - return PTR_ERR(ssk); - - mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (entry->addr.family == AF_INET6) - addrlen = sizeof(struct sockaddr_in6); -#endif - if (ssk->sk_family == AF_INET) - err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (ssk->sk_family == AF_INET6) - err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); -#endif - if (err) - return err; - - /* We don't use mptcp_set_state() here because it needs to be called - * under the msk socket lock. For the moment, that will not bring - * anything more than only calling inet_sk_state_store(), because the - * old status is known (TCP_CLOSE). - */ - inet_sk_state_store(newsk, TCP_LISTEN); - lock_sock(ssk); - WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); - err = __inet_listen_sk(ssk, backlog); - if (!err) - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); - release_sock(ssk); - return err; -} - -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) -{ - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - int ret; - - pernet = pm_nl_get_pernet_from_msk(msk); - - rcu_read_lock(); - entry = __lookup_addr(pernet, skc); - ret = entry ? 
entry->addr.id : -1; - rcu_read_unlock(); - if (ret >= 0) - return ret; - - /* address not found, add to local list */ - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) - return -ENOMEM; - - entry->addr = *skc; - entry->addr.id = 0; - entry->addr.port = 0; - entry->ifindex = 0; - entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - entry->lsk = NULL; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); - if (ret < 0) - kfree(entry); - - return ret; -} - -bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - struct mptcp_pm_addr_entry *entry; - bool backup; - - rcu_read_lock(); - entry = __lookup_addr(pernet, skc); - backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - rcu_read_unlock(); - - return backup; -} - #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 @@ -1207,43 +19,6 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = { }, }; -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) -{ - struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); - struct sock *sk = (struct sock *)msk; - unsigned int active_max_loss_cnt; - struct net *net = sock_net(sk); - unsigned int stale_loss_cnt; - bool slow; - - stale_loss_cnt = mptcp_stale_loss_cnt(net); - if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) - return; - - /* look for another available subflow not in loss state */ - active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); - mptcp_for_each_subflow(msk, iter) { - if (iter != subflow && mptcp_subflow_active(iter) && - iter->stale_count < active_max_loss_cnt) { - /* we have some alternatives, try to mark this subflow as idle ...*/ - slow = lock_sock_fast(ssk); - if (!tcp_rtx_and_write_queues_empty(ssk)) { - subflow->stale = 1; - __mptcp_retransmit_pending_data(sk); - MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); - } - 
unlock_sock_fast(ssk, slow); - - /* always try to push the pending data regardless of re-injections: - * we can possibly use backup subflows now, and subflow selection - * is cheap under the msk socket lock - */ - __mptcp_push_pending(sk, 0); - return; - } - } -} - static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1352,386 +127,8 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return 0; } -static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) -{ - return pm_nl_get_pernet(genl_info_net(info)); -} - -static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, - struct mptcp_addr_info *addr) -{ - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info mpc_addr; - - if (!READ_ONCE(msk->fully_established) || - mptcp_pm_is_userspace(msk)) - goto next; - - /* if the endp linked to the init sf is re-added with a != ID */ - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) - msk->mpc_endpoint_id = addr->id; - mptcp_pm_create_subflow_or_signal_addr(msk); - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, - struct genl_info *info) -{ - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; - - if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, - mptcp_pm_address_nl_policy, info->extack) && - tb[MPTCP_PM_ADDR_ATTR_ID]) - return true; - return false; -} - -int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - int ret; - - ret = 
mptcp_pm_parse_entry(attr, info, true, &addr); - if (ret < 0) - return ret; - - if (addr.addr.port && !address_use_port(&addr)) { - GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); - return -EINVAL; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && - addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { - GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); - return -EINVAL; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { - GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); - return -EINVAL; - } - - entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); - if (!entry) { - GENL_SET_ERR_MSG(info, "can't allocate addr"); - return -ENOMEM; - } - - *entry = addr; - if (entry->addr.port) { - ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); - if (ret) { - GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); - goto out_free; - } - } - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, - !mptcp_pm_has_addr_attr_id(attr, info), - true); - if (ret < 0) { - GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); - goto out_free; - } - - mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); - return 0; - -out_free: - __mptcp_pm_release_addr_entry(entry); - return ret; -} - -bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *entry; - - entry = mptcp_pm_del_add_timer(msk, addr, false); - if (entry) { - kfree(entry); - return true; - } - - return false; -} - -static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - return msk->mpc_endpoint_id == addr->id ? 
0 : addr->id; -} - -static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr, - bool force) -{ - struct mptcp_rm_list list = { .nr = 0 }; - bool ret; - - list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - - ret = mptcp_remove_anno_list_by_saddr(msk, addr); - if (ret || force) { - spin_lock_bh(&msk->pm.lock); - if (ret) { - __set_bit(addr->id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled--; - } - mptcp_pm_remove_addr(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - return ret; -} - -static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) -{ - /* If it was marked as used, and not ID 0, decrement local_addr_used */ - if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && - id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) - msk->pm.local_addr_used--; -} - -static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - const struct mptcp_pm_addr_entry *entry) -{ - const struct mptcp_addr_info *addr = &entry->addr; - struct mptcp_rm_list list = { .nr = 1 }; - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - pr_debug("remove_id=%d\n", addr->id); - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - bool remove_subflow; - - if (mptcp_pm_is_userspace(msk)) - goto next; - - lock_sock(sk); - remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && - !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); - - list.ids[0] = mptcp_endp_get_local_id(msk, addr); - if (remove_subflow) { - spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - spin_lock_bh(&msk->pm.lock); - __mark_subflow_endp_available(msk, list.ids[0]); - spin_unlock_bh(&msk->pm.lock); - } - - if (msk->mpc_endpoint_id == entry->addr.id) - msk->mpc_endpoint_id 
= 0; - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -static int mptcp_nl_remove_id_zero_address(struct net *net, - struct mptcp_addr_info *addr) -{ - struct mptcp_rm_list list = { .nr = 0 }; - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - list.ids[list.nr++] = 0; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info msk_local; - - if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) - goto next; - - mptcp_local_address((struct sock_common *)msk, &msk_local); - if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) - goto next; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - mptcp_pm_remove_addr(msk, &list); - mptcp_pm_nl_rm_subflow_received(msk, &list); - __mark_subflow_endp_available(msk, 0); - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - unsigned int addr_max; - int ret; - - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - /* the zero id address is special: the first address used by the msk - * always gets such an id, so different subflows can have different zero - * id addresses. Additionally zero id is not accounted for in id_bitmap. - * Let's use an 'mptcp_rm_list' instead of the common remove code. 
- */ - if (addr.addr.id == 0) - return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); - - spin_lock_bh(&pernet->lock); - entry = __lookup_addr_by_id(pernet, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - spin_unlock_bh(&pernet->lock); - return -EINVAL; - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); - } - - pernet->addrs--; - list_del_rcu(&entry->list); - __clear_bit(entry->addr.id, pernet->id_bitmap); - spin_unlock_bh(&pernet->lock); - - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); - synchronize_rcu(); - __mptcp_pm_release_addr_entry(entry); - - return ret; -} - -static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) -{ - struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry(entry, rm_list, list) { - if (slist.nr < MPTCP_RM_IDS_MAX && - mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) - slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); - - if (alist.nr < MPTCP_RM_IDS_MAX && - mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) - alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); - } - - spin_lock_bh(&msk->pm.lock); - if (alist.nr) { - msk->pm.add_addr_signaled -= alist.nr; - mptcp_pm_remove_addr(msk, &alist); - } - if (slist.nr) - mptcp_pm_nl_rm_subflow_received(msk, &slist); - /* Reset counters: maybe some subflows have been removed before */ - bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - msk->pm.local_addr_used = 0; - spin_unlock_bh(&msk->pm.lock); -} - -static void mptcp_nl_flush_addrs_list(struct net *net, - struct list_head *rm_list) -{ - long s_slot = 0, s_num = 
0; - struct mptcp_sock *msk; - - if (list_empty(rm_list)) - return; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - - if (!mptcp_pm_is_userspace(msk)) { - lock_sock(sk); - mptcp_pm_flush_addrs_and_subflows(msk, rm_list); - release_sock(sk); - } - - sock_put(sk); - cond_resched(); - } -} - -/* caller must ensure the RCU grace period is already elapsed */ -static void __flush_addrs(struct list_head *list) -{ - while (!list_empty(list)) { - struct mptcp_pm_addr_entry *cur; - - cur = list_entry(list->next, - struct mptcp_pm_addr_entry, list); - list_del_rcu(&cur->list); - __mptcp_pm_release_addr_entry(cur); - } -} - -static void __reset_counters(struct pm_nl_pernet *pernet) -{ - WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); - pernet->addrs = 0; -} - -int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - LIST_HEAD(free_list); - - spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->local_addr_list, &free_list); - __reset_counters(pernet); - pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - spin_unlock_bh(&pernet->lock); - mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); - synchronize_rcu(); - __flush_addrs(&free_list); - return 0; -} - -int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry) +static int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; @@ -1769,15 +166,26 @@ nla_put_failure: return -EMSGSIZE; } -int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) +static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct pm_nl_pernet *pernet = 
genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_get_addr(id, addr, info); + return mptcp_pm_nl_get_addr(id, addr, info); +} + +int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct mptcp_pm_addr_entry addr; + struct nlattr *attr; struct sk_buff *msg; void *reply; int ret; + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1794,258 +202,83 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) goto fail; } - rcu_read_lock(); - entry = __lookup_addr_by_id(pernet, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - ret = -EINVAL; - goto unlock_fail; + ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + goto fail; } - ret = mptcp_nl_fill_addr(msg, entry); + ret = mptcp_nl_fill_addr(msg, &addr); if (ret) - goto unlock_fail; + goto fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); - rcu_read_unlock(); return ret; -unlock_fail: - rcu_read_unlock(); - fail: nlmsg_free(msg); return ret; } -int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry) { - return mptcp_pm_get_addr(skb, info); -} - -int mptcp_pm_nl_dump_addr(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct net *net = sock_net(msg->sk); - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - int id = cb->args[0]; void *hdr; - int i; - - pernet = pm_nl_get_pernet(net); - - rcu_read_lock(); - for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { - if (test_bit(i, pernet->id_bitmap)) { - entry = __lookup_addr_by_id(pernet, i); - if (!entry) - break; - - if 
(entry->addr.id <= id) - continue; - - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) - break; - - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; - } - - id = entry->addr.id; - genlmsg_end(msg, hdr); - } - } - rcu_read_unlock(); - - cb->args[0] = id; - return msg->len; -} - -int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - return mptcp_pm_dump_addr(msg, cb); -} -static int parse_limit(struct genl_info *info, int id, unsigned int *limit) -{ - struct nlattr *attr = info->attrs[id]; - - if (!attr) - return 0; + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + return -EINVAL; - *limit = nla_get_u32(attr); - if (*limit > MPTCP_PM_ADDR_MAX) { - GENL_SET_ERR_MSG(info, "limit greater than maximum"); + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); return -EINVAL; } - return 0; -} - -int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - unsigned int rcv_addrs, subflows; - int ret; - - spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; - ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); - if (ret) - goto unlock; - - subflows = pernet->subflows_max; - ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); - if (ret) - goto unlock; - - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); -unlock: - spin_unlock_bh(&pernet->lock); - return ret; + genlmsg_end(msg, hdr); + return 0; } -int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) +static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct sk_buff *msg; - void 
*reply; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; + const struct genl_info *info = genl_info_dump(cb); - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - MPTCP_PM_CMD_GET_LIMITS); - if (!reply) - goto fail; - - if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) - goto fail; - - if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) - goto fail; - - genlmsg_end(msg, reply); - return genlmsg_reply(msg, info); - -fail: - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - nlmsg_free(msg); - return -EMSGSIZE; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_dump_addr(msg, cb); + return mptcp_pm_nl_dump_addr(msg, cb); } -static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) +int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) { - struct mptcp_rm_list list = { .nr = 0 }; - - list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - - spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); - __mark_subflow_endp_available(msk, list.ids[0]); - mptcp_pm_create_subflow_or_signal_addr(msk); - spin_unlock_bh(&msk->pm.lock); + return mptcp_pm_dump_addr(msg, cb); } -static int mptcp_nl_set_flags(struct net *net, - struct mptcp_addr_info *addr, - u8 bkup, u8 changed) +static int mptcp_pm_set_flags(struct genl_info *info) { - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; + struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_loc; int ret = -EINVAL; - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - - if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) - goto next; - - lock_sock(sk); - if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); - if (changed & 
MPTCP_PM_ADDR_FLAG_FULLMESH) - mptcp_pm_nl_fullmesh(msk, addr); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return ret; -} - -int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) -{ - struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | - MPTCP_PM_ADDR_FLAG_FULLMESH; - struct net *net = sock_net(skb->sk); - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - u8 lookup_by_id = 0; - u8 bkup = 0; - int ret; - - pernet = pm_nl_get_pernet(net); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) + return ret; - ret = mptcp_pm_parse_entry(attr, info, false, &addr); + attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; + ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); if (ret < 0) return ret; - if (addr.addr.family == AF_UNSPEC) { - lookup_by_id = 1; - if (!addr.addr.id) { - GENL_SET_ERR_MSG(info, "missing required inputs"); - return -EOPNOTSUPP; - } - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) - bkup = 1; - - spin_lock_bh(&pernet->lock); - entry = lookup_by_id ? 
__lookup_addr_by_id(pernet, addr.addr.id) : - __lookup_addr(pernet, &addr.addr); - if (!entry) { - spin_unlock_bh(&pernet->lock); - GENL_SET_ERR_MSG(info, "address not found"); - return -EINVAL; - } - if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | - MPTCP_PM_ADDR_FLAG_IMPLICIT))) { - spin_unlock_bh(&pernet->lock); - GENL_SET_ERR_MSG(info, "invalid addr flags"); - return -EINVAL; - } - - changed = (addr.flags ^ entry->flags) & mask; - entry->flags = (entry->flags & ~mask) | (addr.flags & mask); - addr = *entry; - spin_unlock_bh(&pernet->lock); - - mptcp_nl_set_flags(net, &addr.addr, bkup, changed); - return 0; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_set_flags(&loc, info); + return mptcp_pm_nl_set_flags(&loc, info); } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { - return mptcp_pm_set_flags(skb, info); + return mptcp_pm_set_flags(info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) @@ -2078,9 +311,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; @@ -2307,9 +538,7 @@ void mptcp_event_pm_listener(const struct sock *ssk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) goto nla_put_failure; break; } @@ -2397,52 +626,8 @@ struct genl_family mptcp_genl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; -static int __net_init pm_nl_init_net(struct net 
*net) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - - INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - - /* Cit. 2 subflows ought to be enough for anybody. */ - pernet->subflows_max = 2; - pernet->next_id = 1; - pernet->stale_loss_cnt = 4; - spin_lock_init(&pernet->lock); - - /* No need to initialize other pernet fields, the struct is zeroed at - * allocation time. - */ - - return 0; -} - -static void __net_exit pm_nl_exit_net(struct list_head *net_list) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) { - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - - /* net is removed from namespace list, can't race with - * other modifiers, also netns core already waited for a - * RCU grace period. - */ - __flush_addrs(&pernet->local_addr_list); - } -} - -static struct pernet_operations mptcp_pm_pernet_ops = { - .init = pm_nl_init_net, - .exit_batch = pm_nl_exit_net, - .id = &pm_nl_pernet_id, - .size = sizeof(struct pm_nl_pernet), -}; - void __init mptcp_pm_nl_init(void) { - if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) - panic("Failed to register MPTCP PM pernet subsystem.\n"); - if (genl_register_family(&mptcp_genl_family)) panic("Failed to register MPTCP PM netlink family\n"); } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a3d477059b11..2cb62f026b1f 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -12,15 +12,12 @@ list_for_each_entry(__entry, \ &((__msk)->pm.userspace_pm_local_addr_list), list) -void mptcp_free_local_addr_list(struct mptcp_sock *msk) +void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); - if (!mptcp_pm_is_userspace(msk)) - return; - spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); spin_unlock_bh(&msk->pm.lock); @@ -48,7 +45,6 @@ static int mptcp_userspace_pm_append_new_local_addr(struct 
mptcp_sock *msk, bool needs_id) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; @@ -63,26 +59,21 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); - if (addr_match && id_match) { - match = e; - break; - } else if (addr_match || id_match) { + if (addr_match || id_match) break; - } __set_bit(e->addr.id, id_bitmap); } - if (!match && !addr_match && !id_match) { + if (!addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. */ - e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + e = sock_kmemdup(sk, entry, sizeof(*entry), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } - *e = *entry; if (!e->addr.id && needs_id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, @@ -90,7 +81,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; - } else if (match) { + } else if (addr_match && id_match) { ret = entry->addr.id; } @@ -136,27 +127,22 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, - struct mptcp_addr_info *skc) + struct mptcp_pm_addr_entry *skc) { - struct mptcp_pm_addr_entry *entry = NULL, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; + struct mptcp_pm_addr_entry *entry; spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr(msk, skc); + entry = mptcp_userspace_pm_lookup_addr(msk, &skc->addr); spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; - memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); - 
new_entry.addr = *skc; - new_entry.addr.id = 0; - new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - - if (new_entry.addr.port == msk_sport) - new_entry.addr.port = 0; + if (skc->addr.port == msk_sport) + skc->addr.port = 0; - return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); + return mptcp_userspace_pm_append_new_local_addr(msk, skc, true); } bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, @@ -175,14 +161,13 @@ bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info) { - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct mptcp_sock *msk; + struct nlattr *token; - if (!token) { - GENL_SET_ERR_MSG(info, "missing required token"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_TOKEN)) return NULL; - } + token = info->attrs[MPTCP_PM_ATTR_TOKEN]; msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token)); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); @@ -190,7 +175,8 @@ static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *in } if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + NL_SET_ERR_MSG_ATTR(info->extack, token, + "userspace PM not selected"); sock_put((struct sock *)msk); return NULL; } @@ -200,16 +186,14 @@ static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *in int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; + struct nlattr *addr; int err = -EINVAL; struct sock *sk; - if (!addr) { - GENL_SET_ERR_MSG(info, "missing required address"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) return err; - } msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -217,21 +201,27 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) sk = 
(struct sock *)msk; + addr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(addr, info, true, &addr_val); - if (err < 0) { - GENL_SET_ERR_MSG(info, "error parsing local address"); + if (err < 0) + goto announce_err; + + if (addr_val.addr.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr id"); + err = -EINVAL; goto announce_err; } - if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + if (!(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr flags"); err = -EINVAL; goto announce_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val, false); if (err < 0) { - GENL_SET_ERR_MSG(info, "did not match address and id"); + NL_SET_ERR_MSG_ATTR(info->extack, addr, + "did not match address and id"); goto announce_err; } @@ -241,7 +231,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); @@ -253,8 +243,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) return err; } -static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, - struct genl_info *info) +static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk) { struct mptcp_rm_list list = { .nr = 0 }; struct mptcp_subflow_context *subflow; @@ -269,10 +258,8 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, break; } } - if (!has_id_0) { - GENL_SET_ERR_MSG(info, "address with id 0 not found"); + if (!has_id_0) goto remove_err; - } list.ids[list.nr++] = 0; @@ -309,18 +296,17 @@ void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { - 
struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; struct mptcp_sock *msk; + struct nlattr *id; int err = -EINVAL; struct sock *sk; u8 id_val; - if (!id) { - GENL_SET_ERR_MSG(info, "missing required ID"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_LOC_ID)) return err; - } + id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; id_val = nla_get_u8(id); msk = mptcp_userspace_pm_get_sock(info); @@ -330,7 +316,7 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) sk = (struct sock *)msk; if (id_val == 0) { - err = mptcp_userspace_pm_remove_id_zero_address(msk, info); + err = mptcp_userspace_pm_remove_id_zero_address(msk); goto out; } @@ -339,7 +325,6 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { - GENL_SET_ERR_MSG(info, "address with specified id not found"); spin_unlock_bh(&msk->pm.lock); release_sock(sk); goto out; @@ -356,25 +341,28 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) err = 0; out: + if (err) + NL_SET_ERR_MSG_ATTR_FMT(info->extack, id, + "address with id %u not found", + id_val); + sock_put(sk); return err; } int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry entry = { 0 }; struct mptcp_addr_info addr_r; + struct nlattr *raddr, *laddr; struct mptcp_pm_local local; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - if (!laddr || !raddr) { - GENL_SET_ERR_MSG(info, "missing required address(es)"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) || + GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) return err; - } msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -382,24 +370,22 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct 
genl_info *info) sk = (struct sock *)msk; + laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(laddr, info, true, &entry); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + if (err < 0) goto create_err; - } if (entry.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - GENL_SET_ERR_MSG(info, "invalid addr flags"); + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "invalid addr flags"); err = -EINVAL; goto create_err; } entry.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; err = mptcp_pm_parse_addr(raddr, info, &addr_r); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + if (err < 0) goto create_err; - } if (!mptcp_pm_addr_families_match(sk, &entry.addr, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); @@ -409,7 +395,8 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) err = mptcp_userspace_pm_append_new_local_addr(msk, &entry, false); if (err < 0) { - GENL_SET_ERR_MSG(info, "did not match address and id"); + NL_SET_ERR_MSG_ATTR(info->extack, laddr, + "did not match address and id"); goto create_err; } @@ -421,6 +408,9 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) err = __mptcp_subflow_connect(sk, &local, &addr_r); release_sock(sk); + if (err) + GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err); + spin_lock_bh(&msk->pm.lock); if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); @@ -461,9 +451,7 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *pinfo = inet6_sk(ssk); - - if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + if (!ipv6_addr_equal(&local->addr6, &issk->pinet6->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; @@ -483,18 +471,16 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, int 
mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_l; struct mptcp_addr_info addr_r; + struct nlattr *raddr, *laddr; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; - if (!laddr || !raddr) { - GENL_SET_ERR_MSG(info, "missing required address(es)"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) || + GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) return err; - } msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -502,17 +488,15 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info sk = (struct sock *)msk; + laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(laddr, info, true, &addr_l); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + if (err < 0) goto destroy_err; - } + raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; err = mptcp_pm_parse_addr(raddr, info, &addr_r); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + if (err < 0) goto destroy_err; - } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { @@ -530,8 +514,14 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info goto destroy_err; } - if (!addr_l.addr.port || !addr_r.port) { - GENL_SET_ERR_MSG(info, "missing local or remote port"); + if (!addr_l.addr.port) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local port"); + err = -EINVAL; + goto destroy_err; + } + + if (!addr_r.port) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "missing remote port"); err = -EINVAL; goto destroy_err; } @@ -539,6 +529,7 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r); if (!ssk) { + 
GENL_SET_ERR_MSG(info, "subflow not found"); err = -ESRCH; goto release_sock; } @@ -557,46 +548,51 @@ destroy_err: return err; } -int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) +int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) { - struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; - struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info rem = { .family = AF_UNSPEC, }; struct mptcp_pm_addr_entry *entry; + struct nlattr *attr, *attr_rem; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; u8 bkup = 0; + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) + return ret; + msk = mptcp_userspace_pm_get_sock(info); if (!msk) return ret; sk = (struct sock *)msk; - ret = mptcp_pm_parse_entry(attr, info, false, &loc); - if (ret < 0) + attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + if (local->addr.family == AF_UNSPEC) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "invalid local address family"); + ret = -EINVAL; goto set_flags_err; - - if (attr_rem) { - ret = mptcp_pm_parse_entry(attr_rem, info, false, &rem); - if (ret < 0) - goto set_flags_err; } - if (loc.addr.family == AF_UNSPEC || - rem.addr.family == AF_UNSPEC) { - GENL_SET_ERR_MSG(info, "invalid address families"); + attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + ret = mptcp_pm_parse_addr(attr_rem, info, &rem); + if (ret < 0) + goto set_flags_err; + + if (rem.family == AF_UNSPEC) { + NL_SET_ERR_MSG_ATTR(info->extack, attr_rem, + "invalid remote address family"); ret = -EINVAL; goto set_flags_err; } - if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + if (local->flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr(msk, &loc.addr); + entry = mptcp_userspace_pm_lookup_addr(msk, &local->addr); if 
(entry) { if (bkup) entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; @@ -606,9 +602,13 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&msk->pm.lock); lock_sock(sk); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); + ret = mptcp_pm_mp_prio_send_ack(msk, &local->addr, &rem, bkup); release_sock(sk); + /* mptcp_pm_mp_prio_send_ack() only fails in one case */ + if (ret < 0) + GENL_SET_ERR_MSG(info, "subflow not found"); + set_flags_err: sock_put(sk); return ret; @@ -625,7 +625,8 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; - void *hdr; + + BUILD_BUG_ON(sizeof(struct id_bitmap) > sizeof(cb->ctx)); bitmap = (struct id_bitmap *)cb->ctx; @@ -641,19 +642,10 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, if (test_bit(entry->addr.id, bitmap->map)) continue; - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) break; - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; - } - __set_bit(entry->addr.id, bitmap->map); - genlmsg_end(msg, hdr); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); @@ -663,16 +655,13 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, return ret; } -int mptcp_userspace_pm_get_addr(struct sk_buff *skb, +int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct mptcp_pm_addr_entry addr, *entry; + struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - struct sk_buff *msg; int ret = -EINVAL; struct sock *sk; - void *reply; msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -680,50 +669,26 @@ int mptcp_userspace_pm_get_addr(struct sk_buff *skb, sk = (struct sock *)msk; - ret = mptcp_pm_parse_entry(attr, info, false, &addr); 
- if (ret < 0) - goto out; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - ret = -ENOMEM; - goto out; - } - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - info->genlhdr->cmd); - if (!reply) { - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - ret = -EMSGSIZE; - goto fail; - } - lock_sock(sk); spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr_by_id(msk, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - ret = -EINVAL; - goto unlock_fail; + entry = mptcp_userspace_pm_lookup_addr_by_id(msk, id); + if (entry) { + *addr = *entry; + ret = 0; } - - ret = mptcp_nl_fill_addr(msg, entry); - if (ret) - goto unlock_fail; - - genlmsg_end(msg, reply); - ret = genlmsg_reply(msg, info); spin_unlock_bh(&msk->pm.lock); release_sock(sk); - sock_put(sk); - return ret; -unlock_fail: - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); -fail: - nlmsg_free(msg); -out: sock_put(sk); return ret; } + +static struct mptcp_pm_ops mptcp_pm_userspace = { + .name = "userspace", + .owner = THIS_MODULE, +}; + +void __init mptcp_pm_userspace_register(void) +{ + mptcp_pm_register(&mptcp_pm_userspace); +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6bd819047470..44f7ab463d75 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -118,24 +118,14 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) -{ - WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, - mptcp_sk(sk)->rmem_fwd_alloc + size); -} - -static void mptcp_rmem_charge(struct sock *sk, int size) -{ - mptcp_rmem_fwd_alloc_add(sk, -size); -} - static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { bool fragstolen; int delta; - if (MPTCP_SKB_CB(from)->offset || + if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || + MPTCP_SKB_CB(from)->offset || ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || 
!skb_try_coalesce(to, from, &fragstolen, &delta)) return false; @@ -150,7 +140,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, * negative one */ atomic_add(delta, &sk->sk_rmem_alloc); - mptcp_rmem_charge(sk, delta); + sk_mem_charge(sk, delta); kfree_skb_partial(from, fragstolen); return true; @@ -165,44 +155,6 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } -static void __mptcp_rmem_reclaim(struct sock *sk, int amount) -{ - amount >>= PAGE_SHIFT; - mptcp_rmem_charge(sk, amount << PAGE_SHIFT); - __sk_mem_reduce_allocated(sk, amount); -} - -static void mptcp_rmem_uncharge(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - int reclaimable; - - mptcp_rmem_fwd_alloc_add(sk, size); - reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); - - /* see sk_mem_uncharge() for the rationale behind the following schema */ - if (unlikely(reclaimable >= PAGE_SIZE)) - __mptcp_rmem_reclaim(sk, reclaimable); -} - -static void mptcp_rfree(struct sk_buff *skb) -{ - unsigned int len = skb->truesize; - struct sock *sk = skb->sk; - - atomic_sub(len, &sk->sk_rmem_alloc); - mptcp_rmem_uncharge(sk, len); -} - -void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = mptcp_rfree; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - mptcp_rmem_charge(sk, skb->truesize); -} - /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -315,25 +267,7 @@ merge_right: end: skb_condense(skb); - mptcp_set_owner_r(skb, sk); -} - -static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - int amt, amount; - - if (size <= msk->rmem_fwd_alloc) - return true; - - size -= msk->rmem_fwd_alloc; - amt = sk_mem_pages(size); - amount = amt << PAGE_SHIFT; - if (!__sk_mem_raise_allocated(sk, size, 
amt, SK_MEM_RECV)) - return false; - - mptcp_rmem_fwd_alloc_add(sk, amount); - return true; + skb_set_owner_r(skb, sk); } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -351,7 +285,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } @@ -366,6 +300,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; + MPTCP_SKB_CB(skb)->cant_coalesce = 0; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -375,7 +310,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; - mptcp_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { @@ -487,7 +422,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
- inet_csk(ssk)->icsk_timeout - jiffies : 0; + icsk_timeout(inet_csk(ssk)) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) @@ -561,7 +496,7 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)) && copied; - rx_empty = !__mptcp_rmem(sk) && copied; + rx_empty = !sk_rmem_alloc_get(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -634,27 +569,13 @@ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, - struct sock *ssk, - unsigned int *bytes) + struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - bool done = false; - int sk_rbuf; - - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); - - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); - - if (unlikely(ssk_rbuf > sk_rbuf)) { - WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); - sk_rbuf = ssk_rbuf; - } - } + bool ret = false; pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); @@ -664,20 +585,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sk_buff *skb; bool fin; + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) + break; + /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); - if (!skb) { - /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), - * a different CPU can have already processed the pending - * data, stop here or we can enter an infinite loop - */ - if (!moved) - done = true; + if (unlikely(!skb)) break; - } if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could @@ -690,19 +607,13 @@ static bool 
__mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - if (fin) { - done = true; + if (fin) seq++; - } if (offset < skb->len) { size_t len = skb->len - offset; - if (tp->urg_data) - done = true; - - if (__mptcp_move_skb(msk, ssk, skb, offset, len)) - moved += len; + ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; seq += len; if (unlikely(map_remaining < len)) { @@ -716,22 +627,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } sk_eat_skb(ssk, skb); - done = true; } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { - done = true; - break; - } } while (more_data_avail); - if (moved > 0) + if (ret) msk->last_data_recv = tcp_jiffies32; - *bytes += moved; - return done; + return ret; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) @@ -825,9 +730,9 @@ void __mptcp_error_report(struct sock *sk) static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; + bool moved; - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + moved = __mptcp_move_skbs_from_subflow(msk, ssk); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) @@ -843,14 +748,29 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); - return moved > 0; + return moved; +} + +static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) +{ + if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) + WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); +} + +static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_rcvbuf_update(sk, ssk); + + /* Wake-up the reader only for in-sequence data */ + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) + 
sk->sk_data_ready(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct mptcp_sock *msk = mptcp_sk(sk); - int sk_rbuf, ssk_rbuf; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing @@ -859,19 +779,11 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (unlikely(subflow->disposable)) return; - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); - if (unlikely(ssk_rbuf > sk_rbuf)) - sk_rbuf = ssk_rbuf; - - /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ - if (__mptcp_rmem(sk) > sk_rbuf) - return; - - /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) - sk->sk_data_ready(sk); + if (!sock_owned_by_user(sk)) + __mptcp_data_ready(sk, ssk); + else + __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } @@ -950,20 +862,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - msk_owned_by_me(msk); - - mptcp_for_each_subflow(msk, subflow) { - if (READ_ONCE(subflow->data_avail)) - return mptcp_subflow_tcp_sock(subflow); - } - - return NULL; -} - static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) @@ -1944,16 +1842,17 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); -static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, +static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { + struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; int copied = 0; - skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { + 
skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1985,10 +1884,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } if (!(flags & MSG_PEEK)) { - /* we will bulk release the skb memory later */ + /* avoid the indirect call, we know the destructor is sock_wfree */ skb->destructor = NULL; - WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); - __skb_unlink(skb, &msk->receive_queue); + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } @@ -2101,66 +2001,65 @@ new_measure: msk->rcvq_space.time = mstamp; } -static void __mptcp_update_rmem(struct sock *sk) +static struct mptcp_subflow_context * +__mptcp_first_ready_from(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!msk->rmem_released) - return; + struct mptcp_subflow_context *start_subflow = subflow; - atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); - mptcp_rmem_uncharge(sk, msk->rmem_released); - WRITE_ONCE(msk->rmem_released, 0); + while (!READ_ONCE(subflow->data_avail)) { + subflow = mptcp_next_subflow(msk, subflow); + if (subflow == start_subflow) + return NULL; + } + return subflow; } -static void __mptcp_splice_receive_queue(struct sock *sk) +static bool __mptcp_move_skbs(struct sock *sk) { + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); + bool ret = false; - skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); -} + if (list_empty(&msk->conn_list)) + return false; -static bool __mptcp_move_skbs(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; - bool ret, done; + /* verify we can move any data from the subflow, eventually updating */ + if (!(sk->sk_userlocks & 
SOCK_RCVBUF_LOCK)) + mptcp_for_each_subflow(msk, subflow) + __mptcp_rcvbuf_update(sk, subflow->tcp_sock); - do { - struct sock *ssk = mptcp_subflow_recv_lookup(msk); + subflow = list_first_entry(&msk->conn_list, + struct mptcp_subflow_context, node); + for (;;) { + struct sock *ssk; bool slowpath; - /* we can have data pending in the subflows only if the msk - * receive buffer was full at subflow_data_ready() time, - * that is an unlikely slow path. + /* + * As an optimization avoid traversing the subflows list + * and ev. acquiring the subflow socket lock before baling out */ - if (likely(!ssk)) + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break; - slowpath = lock_sock_fast(ssk); - mptcp_data_lock(sk); - __mptcp_update_rmem(sk); - done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_data_unlock(sk); + subflow = __mptcp_first_ready_from(msk, subflow); + if (!subflow) + break; + ssk = mptcp_subflow_tcp_sock(subflow); + slowpath = lock_sock_fast(ssk); + ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); - } while (!done); - /* acquire the data lock only if some input data is pending */ - ret = moved > 0; - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || - !skb_queue_empty_lockless(&sk->sk_receive_queue)) { - mptcp_data_lock(sk); - __mptcp_update_rmem(sk); - ret |= __mptcp_ofo_queue(msk); - __mptcp_splice_receive_queue(sk); - mptcp_data_unlock(sk); + subflow = mptcp_next_subflow(msk, subflow); } + + __mptcp_ofo_queue(msk); if (ret) mptcp_check_data_fin((struct sock *)msk); - return !skb_queue_empty(&msk->receive_queue); + return ret; } static unsigned int mptcp_inq_hint(const struct sock *sk) @@ -2168,7 +2067,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; - skb = skb_peek(&msk->receive_queue); + skb = skb_peek(&sk->sk_receive_queue); if (skb) { u64 hint_val = 
READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; @@ -2214,7 +2113,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int err, bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2223,7 +2122,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) + if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. The exit @@ -2249,7 +2148,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* race breaker: the shutdown could be after the * previous receive queue check */ - if (__mptcp_move_skbs(msk)) + if (__mptcp_move_skbs(sk)) continue; break; } @@ -2293,9 +2192,8 @@ out_err: } } - pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", - msk, skb_queue_empty_lockless(&sk->sk_receive_queue), - skb_queue_empty(&msk->receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d copied=%d\n", + msk, skb_queue_empty(&sk->sk_receive_queue), copied); release_sock(sk); return copied; @@ -2783,7 +2681,7 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_fastclose(msk); - mptcp_pm_nl_work(msk); + mptcp_pm_worker(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); @@ -2822,11 +2720,8 @@ static void __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); - __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - WRITE_ONCE(msk->rmem_fwd_alloc, 0); - WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; 
@@ -3052,8 +2947,6 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); - WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -3285,12 +3178,9 @@ static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); @@ -3405,21 +3295,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); - /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ - mptcp_data_lock(sk); - skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); - mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ - sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); - WRITE_ONCE(msk->rmem_fwd_alloc, 0); mptcp_token_destroy(msk); - mptcp_pm_free_anno_list(msk); - mptcp_free_local_addr_list(msk); + mptcp_pm_destroy(msk); } static void mptcp_destroy(struct sock *sk) @@ -3453,7 +3336,8 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ - BIT(MPTCP_FLUSH_JOIN_LIST)) + BIT(MPTCP_FLUSH_JOIN_LIST) | \ + BIT(MPTCP_DEQUEUE)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) @@ -3487,6 +3371,11 @@ 
static void mptcp_release_cb(struct sock *sk) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); + if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { + /* notify ack seq update */ + mptcp_cleanup_rbuf(msk, 0); + sk->sk_data_ready(sk); + } cond_resched(); spin_lock_bh(&sk->sk_lock.slock); @@ -3506,8 +3395,6 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } - - __mptcp_update_rmem(sk); } /* MP_JOIN client subflow must wait for 4th ack before sending any data: @@ -3533,7 +3420,6 @@ static void schedule_3rdack_retransmission(struct sock *ssk) WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } @@ -3678,12 +3564,6 @@ static void mptcp_shutdown(struct sock *sk, int how) __mptcp_wr_shutdown(sk); } -static int mptcp_forward_alloc_get(const struct sock *sk) -{ - return READ_ONCE(sk->sk_forward_alloc) + - READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); -} - static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; @@ -3724,7 +3604,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return -EINVAL; lock_sock(sk); - __mptcp_move_skbs(msk); + if (__mptcp_move_skbs(sk)) + mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); break; @@ -3841,7 +3722,6 @@ static struct proto mptcp_prot = { .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, - .forward_alloc_get = mptcp_forward_alloc_get, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ad21925af061..d409586b5977 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -124,12 +124,14 @@ #define 
MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 +#define MPTCP_DEQUEUE 8 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; - u8 has_rxtstamp:1; + u8 has_rxtstamp; + u8 cant_coalesce; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) @@ -221,6 +223,8 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ + struct_group(reset, + u8 addr_signal; bool server_side; bool work_pending; @@ -233,6 +237,9 @@ struct mptcp_pm_data { u8 pm_type; u8 subflows; u8 status; + + ); + DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; @@ -279,7 +286,6 @@ struct mptcp_sock { u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; - int rmem_fwd_alloc; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; @@ -294,7 +300,6 @@ struct mptcp_sock { u32 last_ack_recv; unsigned long timer_ival; u32 token; - int rmem_released; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ @@ -324,7 +329,6 @@ struct mptcp_sock { struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; - struct sk_buff_head receive_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -355,6 +359,8 @@ struct mptcp_sock { list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) +#define mptcp_next_subflow(__msk, __subflow) \ + list_next_entry_circular(__subflow, &((__msk)->conn_list), node) extern struct genl_family mptcp_genl_family; @@ -381,14 +387,6 @@ static inline void msk_owned_by_me(const struct mptcp_sock *msk) #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif -/* the msk socket don't use the backlog, also 
account for the bulk - * free memory - */ -static inline int __mptcp_rmem(const struct sock *sk) -{ - return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); -} - static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); @@ -401,7 +399,8 @@ static inline int mptcp_space_from_win(const struct sock *sk, int win) static inline int __mptcp_space(const struct sock *sk) { - return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - + sk_rmem_alloc_get(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@ -700,6 +699,7 @@ int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); +const char *mptcp_get_path_manager(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void mptcp_active_disable(struct sock *sk); @@ -726,12 +726,14 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); -void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); -void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); +void mptcp_local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr); +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, @@ -990,6 +992,7 @@ __sum16 __mptcp_make_csum(u64 
data_seq, u32 subflow_seq, u16 data_len, __wsum su void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +void mptcp_pm_destroy(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, @@ -999,7 +1002,6 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -1013,34 +1015,35 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote); -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup); +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 
bkup); +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); -struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, - const struct mptcp_addr_info *addr); bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr); bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); -int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); -int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info); +int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); @@ -1048,7 +1051,16 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry); -void mptcp_free_local_addr_list(struct mptcp_sock *msk); +/* the default path manager, used in mptcp_pm_unregister */ +extern struct mptcp_pm_ops mptcp_pm_kernel; + +struct mptcp_pm_ops *mptcp_pm_find(const char *name); +int mptcp_pm_register(struct mptcp_pm_ops *pm_ops); +void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops); +int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops); +void mptcp_pm_get_available(char *buf, size_t maxlen); + +void 
mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); @@ -1058,12 +1070,11 @@ void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); -int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry); +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { @@ -1126,19 +1137,20 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct 
netlink_callback *cb); -int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info); -int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info); -int mptcp_userspace_pm_get_addr(struct sk_buff *skb, +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info); +int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) @@ -1150,8 +1162,11 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo return local_id; } +void __init mptcp_pm_kernel_register(void); +void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); -void mptcp_pm_nl_work(struct mptcp_sock *msk); +void mptcp_pm_worker(struct mptcp_sock *msk); +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index df7dbcfa3b71..c16c6fbd4ba2 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,13 +16,25 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, +static int mptcp_sched_default_get_send(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct sock *ssk; - ssk = data->reinject ? 
mptcp_subflow_get_retrans(msk) : - mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; @@ -31,7 +43,8 @@ static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, } static struct mptcp_sched_ops mptcp_sched_default = { - .get_subflow = mptcp_sched_default_get_subflow, + .get_send = mptcp_sched_default_get_send, + .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; @@ -73,7 +86,7 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_subflow) + if (!sched->get_send) return -EINVAL; spin_lock(&mptcp_sched_list_lock); @@ -144,7 +157,7 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; + struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -164,16 +177,15 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) return 0; } - data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_send(msk, data); + return msk->sched->get_send(msk, data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; + struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -186,8 +198,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) return 0; } - data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_retrans(msk, data); + if (msk->sched->get_retrans) + return msk->sched->get_retrans(msk, data); + return msk->sched->get_send(msk, data); } diff --git a/net/mptcp/sockopt.c 
b/net/mptcp/sockopt.c index 505445a9598f..3caa0a9d3b38 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1419,6 +1419,12 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); + case IP_FREEBIND: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(FREEBIND, sk)); + case IP_TRANSPARENT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(TRANSPARENT, sk)); case IP_BIND_ADDRESS_NO_PORT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); @@ -1430,6 +1436,26 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = (void *)msk; + + switch (optname) { + case IPV6_V6ONLY: + return mptcp_put_int_option(msk, optval, optlen, + sk->sk_ipv6only); + case IPV6_TRANSPARENT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(TRANSPARENT, sk)); + case IPV6_FREEBIND: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(FREEBIND, sk)); + } + + return -EOPNOTSUPP; +} + static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -1469,6 +1495,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_IP) return mptcp_getsockopt_v4(msk, optname, optval, option); + if (level == SOL_IPV6) + return mptcp_getsockopt_v6(msk, optname, optval, option); if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); if (level == SOL_MPTCP) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9f18217dddc8..efe8d86496db 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -802,9 +802,6 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, 
subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); - - if (subflow->is_mptfo) - __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -1270,7 +1267,12 @@ out: subflow->map_valid = 0; } -/* sched mptcp worker to remove the subflow if no more data is pending */ +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +/* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; @@ -1280,8 +1282,18 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; - if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + if (!skb_queue_empty(&ssk->sk_receive_queue)) + return; + + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work(sk); + + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && + msk->first == ssk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) mptcp_schedule_work(sk); } @@ -1826,11 +1838,6 @@ static void __subflow_state_change(struct sock *sk) rcu_read_unlock(); } -static bool subflow_is_done(const struct sock *sk) -{ - return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; -} - static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1857,13 +1864,6 @@ static void subflow_state_change(struct sock *sk) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); - - /* when the fallback subflow 
closes the rx side, trigger a 'dummy' - * ingress data fin, so that the msk state will follow along - */ - if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) - mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) |