diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 13 | ||||
-rw-r--r-- | include/linux/rhashtable.h | 8 | ||||
-rw-r--r-- | include/linux/skbuff.h | 1 | ||||
-rw-r--r-- | include/net/inet_frag.h | 126 | ||||
-rw-r--r-- | include/net/ip.h | 1 | ||||
-rw-r--r-- | include/net/ipv6.h | 27 | ||||
-rw-r--r-- | lib/rhashtable.c | 2 | ||||
-rw-r--r-- | net/ieee802154/6lowpan/6lowpan_i.h | 26 | ||||
-rw-r--r-- | net/ieee802154/6lowpan/reassembly.c | 148 | ||||
-rw-r--r-- | net/ipv4/inet_fragment.c | 358 | ||||
-rw-r--r-- | net/ipv4/ip_fragment.c | 253 | ||||
-rw-r--r-- | net/ipv4/proc.c | 6 | ||||
-rw-r--r-- | net/ipv6/netfilter/nf_conntrack_reasm.c | 123 | ||||
-rw-r--r-- | net/ipv6/proc.c | 5 | ||||
-rw-r--r-- | net/ipv6/reassembly.c | 229 |
15 files changed, 499 insertions, 827 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 33f35f049ad5..5dc1a040a2f1 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -133,14 +133,11 @@ min_adv_mss - INTEGER IP Fragmentation: -ipfrag_high_thresh - INTEGER - Maximum memory used to reassemble IP fragments. When - ipfrag_high_thresh bytes of memory is allocated for this purpose, - the fragment handler will toss packets until ipfrag_low_thresh - is reached. This also serves as a maximum limit to namespaces - different from the initial one. - -ipfrag_low_thresh - INTEGER +ipfrag_high_thresh - LONG INTEGER + Maximum memory used to reassemble IP fragments. + +ipfrag_low_thresh - LONG INTEGER + (Obsolete since linux-4.17) Maximum memory used to reassemble IP fragments before the kernel begins to remove incomplete fragment queues to free up resources. The kernel still accepts new fragments for defragmentation. diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 668a21f04b09..1f8ad121eb43 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -152,25 +152,25 @@ struct rhashtable_params { /** * struct rhashtable - Hash table handle * @tbl: Bucket table - * @nelems: Number of elements in table * @key_len: Key length for hashfn - * @p: Configuration parameters * @max_elems: Maximum number of elements in table + * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table */ struct rhashtable { struct bucket_table __rcu *tbl; - atomic_t nelems; unsigned int key_len; - struct rhashtable_params p; unsigned int max_elems; + struct rhashtable_params p; bool rhlist; struct work_struct run_work; struct mutex mutex; spinlock_t lock; + atomic_t nelems; }; /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47082f54ec1f..9065477ed255 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -672,6 +672,7 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; + int ip_defrag_offset; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 351f0c3cdcd9..ed07e3786d98 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -2,14 +2,20 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ +#include <linux/rhashtable.h> + struct netns_frags { - /* Keep atomic mem on separate cachelines in structs that include it */ - atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ + long high_thresh; + long low_thresh; int timeout; - int high_thresh; - int low_thresh; int max_dist; + struct inet_frags *f; + + struct rhashtable rhashtable ____cacheline_aligned_in_smp; + + /* Keep atomic mem on separate cachelines in structs that include it */ + atomic_long_t mem ____cacheline_aligned_in_smp; }; /** @@ -25,12 +31,30 @@ enum { INET_FRAG_COMPLETE = BIT(2), }; +struct frag_v4_compare_key { + __be32 saddr; + __be32 daddr; + u32 user; + u32 vif; + __be16 id; + u16 protocol; +}; + +struct frag_v6_compare_key { + struct in6_addr saddr; + struct in6_addr daddr; + u32 user; + __be32 id; + u32 iif; +}; + /** * struct inet_frag_queue - fragment queue * - * @lock: spinlock protecting the queue + * @node: rhash node + * @key: keys identifying this frag. * @timer: queue expiration timer - * @list: hash bucket list + * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @fragments: received fragments head * @fragments_tail: received fragments tail @@ -40,12 +64,16 @@ enum { * @flags: fragment queue flags * @max_size: maximum received fragment size * @net: namespace that this frag belongs to - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) + * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { - spinlock_t lock; + struct rhash_head node; + union { + struct frag_v4_compare_key v4; + struct frag_v6_compare_key v6; + } key; struct timer_list timer; - struct hlist_node list; + spinlock_t lock; refcount_t refcnt; struct sk_buff *fragments; struct sk_buff *fragments_tail; @@ -54,101 +82,57 @@ struct inet_frag_queue { int meat; __u8 flags; u16 max_size; - struct netns_frags *net; - struct hlist_node list_evictor; -}; - -#define INETFRAGS_HASHSZ 1024 - -/* averaged: - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or - * struct frag_queue)) - */ -#define INETFRAGS_MAXDEPTH 128 - -struct inet_frag_bucket { - struct hlist_head chain; - spinlock_t chain_lock; + struct netns_frags *net; + struct rcu_head rcu; }; struct inet_frags { - struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; - - struct work_struct frags_work; - unsigned int next_bucket; - unsigned long last_rebuild_jiffies; - bool rebuild; - - /* The first call to hashfn is responsible to initialize - * rnd. This is best done with net_get_random_once. - * - * rnd_seqlock is used to let hash insertion detect - * when it needs to re-lookup the hash chain to use. - */ - u32 rnd; - seqlock_t rnd_seqlock; unsigned int qsize; - unsigned int (*hashfn)(const struct inet_frag_queue *); - bool (*match)(const struct inet_frag_queue *q, - const void *arg); void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; + struct rhashtable_params rhash_params; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); -static inline void inet_frags_init_net(struct netns_frags *nf) +static inline int inet_frags_init_net(struct netns_frags *nf) { - atomic_set(&nf->mem, 0); + atomic_long_set(&nf->mem, 0); + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); } -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); - -void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash); +void inet_frags_exit_net(struct netns_frags *nf); -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix); +void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_destroy(struct inet_frag_queue *q); +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); -static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) +static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f); -} - -static inline bool inet_frag_evicting(struct inet_frag_queue *q) -{ - return !hlist_unhashed(&q->list_evictor); + inet_frag_destroy(q); } /* Memory Tracking Functions. */ -static inline int frag_mem_limit(struct netns_frags *nf) -{ - return atomic_read(&nf->mem); -} - -static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) +static inline long frag_mem_limit(const struct netns_frags *nf) { - atomic_sub(i, &nf->mem); + return atomic_long_read(&nf->mem); } -static inline void add_frag_mem_limit(struct netns_frags *nf, int i) +static inline void sub_frag_mem_limit(struct netns_frags *nf, long val) { - atomic_add(i, &nf->mem); + atomic_long_sub(val, &nf->mem); } -static inline int sum_frag_mem_limit(struct netns_frags *nf) +static inline void add_frag_mem_limit(struct netns_frags *nf, long val) { - return atomic_read(&nf->mem); + atomic_long_add(val, &nf->mem); } /* RFC 3168 support : diff --git a/include/net/ip.h b/include/net/ip.h index 36f8f7811093..ecffd843e7b8 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s return skb; } #endif -int ip_frag_mem(struct net *net); /* * Functions provided by ip_forward.c diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 50a6f0ddb878..37455e840347 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev) idev->cnf.accept_ra; } -#if IS_ENABLED(CONFIG_IPV6) -static inline int ip6_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv6.frags); -} -#endif - #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ @@ -579,17 +572,8 @@ enum ip6_defrag_users { __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; -struct ip6_create_arg { - __be32 id; - u32 user; - const struct in6_addr *src; - const struct in6_addr *dst; - int iif; - u8 ecn; -}; - void ip6_frag_init(struct inet_frag_queue *q, const void *a); -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); +extern const struct rhashtable_params ip6_rhash_params; /* * Equivalent of ipv4 struct ip @@ -597,19 +581,12 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); struct frag_queue { struct inet_frag_queue q; - __be32 id; /* fragment id */ - u32 user; - struct in6_addr saddr; - struct in6_addr daddr; - int iif; - unsigned int csum; __u16 nhoffset; u8 ecn; }; -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags); +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); static inline bool ipv6_addr_any(const struct in6_addr *a) { diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 47de025b6245..2b2b79974b61 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht) err = rhashtable_rehash_chain(ht, old_hash); if (err) return err; + cond_resched(); } /* Publish the new table pointer. */ @@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, for (i = 0; i < tbl->size; i++) { struct rhash_head *pos, *next; + cond_resched(); for (pos = rht_dereference(*rht_bucket(tbl, i), ht), next = !rht_is_a_nulls(pos) ? rht_dereference(pos->next, ht) : NULL; diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index d8de3bcfb103..b8d95cb71c25 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result; #define LOWPAN_DISPATCH_FRAG1 0xc0 #define LOWPAN_DISPATCH_FRAGN 0xe0 -struct lowpan_create_arg { +struct frag_lowpan_compare_key { u16 tag; u16 d_size; - const struct ieee802154_addr *src; - const struct ieee802154_addr *dst; + const struct ieee802154_addr src; + const struct ieee802154_addr dst; }; -/* Equivalent of ipv4 struct ip +/* Equivalent of ipv4 struct ipq */ struct lowpan_frag_queue { struct inet_frag_queue q; - - u16 tag; - u16 d_size; - struct ieee802154_addr saddr; - struct ieee802154_addr daddr; }; -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) -{ - switch (a->mode) { - case IEEE802154_ADDR_LONG: - return (((__force u64)a->extended_addr) >> 32) ^ - (((__force u64)a->extended_addr) & 0xffffffff); - case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr + (a->pan_id << 16)); - default: - return 0; - } -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 85bf86ad6b18..44f148a6bb57 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct net_device *ldev); -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, - const struct ieee802154_addr *saddr, - const struct ieee802154_addr *daddr) -{ - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); - return jhash_3words(ieee802154_addr_hash(saddr), - ieee802154_addr_hash(daddr), - (__force u32)(tag + (d_size << 16)), - lowpan_frags.rnd); -} - -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) -{ - const struct lowpan_frag_queue *fq; - - fq = container_of(q, struct lowpan_frag_queue, q); - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); -} - -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct lowpan_frag_queue *fq; - const struct lowpan_create_arg *arg = a; - - fq = container_of(q, struct lowpan_frag_queue, q); - return fq->tag == arg->tag && fq->d_size == arg->d_size && - ieee802154_addr_equal(&fq->saddr, arg->src) && - ieee802154_addr_equal(&fq->daddr, arg->dst); -} - static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { - const struct lowpan_create_arg *arg = a; + const struct frag_lowpan_compare_key *key = a; struct lowpan_frag_queue *fq; fq = container_of(q, struct lowpan_frag_queue, q); - fq->tag = arg->tag; - fq->d_size = arg->d_size; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); + memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) @@ -94,10 +62,10 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * @@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { - struct inet_frag_queue *q; - struct lowpan_create_arg arg; - unsigned int hash; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + struct frag_lowpan_compare_key key = { + .tag = cb->d_tag, + .d_size = cb->d_size, + .src = *src, + .dst = *dst, + }; + struct inet_frag_queue *q; - arg.tag = cb->d_tag; - arg.d_size = cb->d_size; - arg.src = src; - arg.dst = dst; - - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); - - q = inet_frag_find(&ieee802154_lowpan->frags, - &lowpan_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&ieee802154_lowpan->frags, &key); + if (!q) return NULL; - } + return container_of(q, struct lowpan_frag_queue, q); } @@ -230,7 +193,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev, struct sk_buff *fp, *head = fq->q.fragments; int sum_truesize; - inet_frag_kill(&fq->q, &lowpan_frags); + inet_frag_kill(&fq->q); /* Make the one we just received the head. */ if (prev) { @@ -438,7 +401,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &lowpan_frags); + inet_frag_put(&fq->q); return ret; } @@ -448,23 +411,23 @@ err: } #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .data = &init_net.ieee802154_lowpan.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh }, { .procname = "6lowpanfrag_low_thresh", .data = &init_net.ieee802154_lowpan.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh }, @@ -581,14 +544,20 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); + int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; + ieee802154_lowpan->frags.f = &lowpan_frags; - inet_frags_init_net(&ieee802154_lowpan->frags); - - return lowpan_frags_ns_sysctl_register(net); + res = inet_frags_init_net(&ieee802154_lowpan->frags); + if (res < 0) + return res; + res = lowpan_frags_ns_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&ieee802154_lowpan->frags); + return res; } static void __net_exit lowpan_frags_exit_net(struct net *net) @@ -597,7 +566,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net) net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags); + inet_frags_exit_net(&ieee802154_lowpan->frags); } static struct pernet_operations lowpan_frags_ops = { @@ -605,32 +574,63 @@ static struct pernet_operations lowpan_frags_ops = { .exit = lowpan_frags_exit_net, }; -int __init lowpan_net_frag_init(void) +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) { - int ret; + return jhash2(data, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} - ret = lowpan_frags_sysctl_register(); - if (ret) - return ret; +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; - ret = register_pernet_subsys(&lowpan_frags_ops); - if (ret) - goto err_pernet; + return jhash2((const u32 *)&fq->key, + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); +} + +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_lowpan_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params lowpan_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = lowpan_key_hashfn, + .obj_hashfn = lowpan_obj_hashfn, + .obj_cmpfn = lowpan_obj_cmpfn, + .automatic_shrinking = true, +}; + +int __init lowpan_net_frag_init(void) +{ + int ret; - lowpan_frags.hashfn = lowpan_hashfn; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); - lowpan_frags.match = lowpan_frag_match; lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; + lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) - goto err_pernet; + goto out; + ret = lowpan_frags_sysctl_register(); + if (ret) + goto err_sysctl; + + ret = register_pernet_subsys(&lowpan_frags_ops); + if (ret) + goto err_pernet; +out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); +err_sysctl: + inet_frags_fini(&lowpan_frags); return ret; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e8ec28999f5c..c9e35b81d093 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -25,12 +25,6 @@ #include <net/inet_frag.h> #include <net/inet_ecn.h> -#define INETFRAGS_EVICT_BUCKETS 128 -#define INETFRAGS_EVICT_MAX 512 - -/* don't rebuild inetfrag table with new secret more often than this */ -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) - /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { }; EXPORT_SYMBOL(ip_frag_ecn_table); -static unsigned int -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) -{ - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); -} - -static bool inet_frag_may_rebuild(struct inet_frags *f) -{ - return time_after(jiffies, - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); -} - -static void inet_frag_secret_rebuild(struct inet_frags *f) -{ - int i; - - write_seqlock_bh(&f->rnd_seqlock); - - if (!inet_frag_may_rebuild(f)) - goto out; - - get_random_bytes(&f->rnd, sizeof(u32)); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - struct hlist_node *n; - - hb = &f->hash[i]; - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(q, n, &hb->chain, list) { - unsigned int hval = inet_frag_hashfn(f, q); - - if (hval != i) { - struct inet_frag_bucket *hb_dest; - - hlist_del(&q->list); - - /* Relink to new hash chain. */ - hb_dest = &f->hash[hval]; - - /* This is the only place where we take - * another chain_lock while already holding - * one. As this will not run concurrently, - * we cannot deadlock on hb_dest lock below, if its - * already locked it will be released soon since - * other caller cannot be waiting for hb lock - * that we've taken above. - */ - spin_lock_nested(&hb_dest->chain_lock, - SINGLE_DEPTH_NESTING); - hlist_add_head(&q->list, &hb_dest->chain); - spin_unlock(&hb_dest->chain_lock); - } - } - spin_unlock(&hb->chain_lock); - } - - f->rebuild = false; - f->last_rebuild_jiffies = jiffies; -out: - write_sequnlock_bh(&f->rnd_seqlock); -} - -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) -{ - if (!hlist_unhashed(&q->list_evictor)) - return false; - - return q->net->low_thresh == 0 || - frag_mem_limit(q->net) >= q->net->low_thresh; -} - -static unsigned int -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) -{ - struct inet_frag_queue *fq; - struct hlist_node *n; - unsigned int evicted = 0; - HLIST_HEAD(expired); - - spin_lock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { - if (!inet_fragq_should_evict(fq)) - continue; - - if (!del_timer(&fq->timer)) - continue; - - hlist_add_head(&fq->list_evictor, &expired); - ++evicted; - } - - spin_unlock(&hb->chain_lock); - - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) - f->frag_expire(&fq->timer); - - return evicted; -} - -static void inet_frag_worker(struct work_struct *work) -{ - unsigned int budget = INETFRAGS_EVICT_BUCKETS; - unsigned int i, evicted = 0; - struct inet_frags *f; - - f = container_of(work, struct inet_frags, frags_work); - - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); - - local_bh_disable(); - - for (i = READ_ONCE(f->next_bucket); budget; --budget) { - evicted += inet_evict_bucket(f, &f->hash[i]); - i = (i + 1) & (INETFRAGS_HASHSZ - 1); - if (evicted > INETFRAGS_EVICT_MAX) - break; - } - - f->next_bucket = i; - - local_bh_enable(); - - if (f->rebuild && inet_frag_may_rebuild(f)) - inet_frag_secret_rebuild(f); -} - -static void inet_frag_schedule_worker(struct inet_frags *f) -{ - if (unlikely(!work_pending(&f->frags_work))) - schedule_work(&f->frags_work); -} - int inet_frags_init(struct inet_frags *f) { - int i; - - INIT_WORK(&f->frags_work, inet_frag_worker); - - for (i = 0; i < INETFRAGS_HASHSZ; i++) { - struct inet_frag_bucket *hb = &f->hash[i]; - - spin_lock_init(&hb->chain_lock); - INIT_HLIST_HEAD(&hb->chain); - } - - seqlock_init(&f->rnd_seqlock); - f->last_rebuild_jiffies = 0; f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) @@ -214,83 +59,75 @@ EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { - cancel_work_sync(&f->frags_work); + /* We must wait that all inet_frag_destroy_rcu() have completed. */ + rcu_barrier(); + kmem_cache_destroy(f->frags_cachep); + f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); -void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) +static void inet_frags_free_cb(void *ptr, void *arg) { - unsigned int seq; - int i; - - nf->low_thresh = 0; + struct inet_frag_queue *fq = ptr; -evict_again: - local_bh_disable(); - seq = read_seqbegin(&f->rnd_seqlock); - - for (i = 0; i < INETFRAGS_HASHSZ ; i++) - inet_evict_bucket(f, &f->hash[i]); - - local_bh_enable(); - cond_resched(); - - if (read_seqretry(&f->rnd_seqlock, seq) || - sum_frag_mem_limit(nf)) - goto evict_again; -} -EXPORT_SYMBOL(inet_frags_exit_net); - -static struct inet_frag_bucket * -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) -__acquires(hb->chain_lock) -{ - struct inet_frag_bucket *hb; - unsigned int seq, hash; - - restart: - seq = read_seqbegin(&f->rnd_seqlock); - - hash = inet_frag_hashfn(f, fq); - hb = &f->hash[hash]; + /* If we can not cancel the timer, it means this frag_queue + * is already disappearing, we have nothing to do. + * Otherwise, we own a refcount until the end of this function. + */ + if (!del_timer(&fq->timer)) + return; - spin_lock(&hb->chain_lock); - if (read_seqretry(&f->rnd_seqlock, seq)) { - spin_unlock(&hb->chain_lock); - goto restart; + spin_lock_bh(&fq->lock); + if (!(fq->flags & INET_FRAG_COMPLETE)) { + fq->flags |= INET_FRAG_COMPLETE; + refcount_dec(&fq->refcnt); } + spin_unlock_bh(&fq->lock); - return hb; + inet_frag_put(fq); } -static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frags_exit_net(struct netns_frags *nf) { - struct inet_frag_bucket *hb; + nf->low_thresh = 0; /* prevent creation of new frags */ - hb = get_frag_bucket_locked(fq, f); - hlist_del(&fq->list); - fq->flags |= INET_FRAG_COMPLETE; - spin_unlock(&hb->chain_lock); + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } +EXPORT_SYMBOL(inet_frags_exit_net); -void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f) +void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { - fq_unlink(fq, f); + struct netns_frags *nf = fq->net; + + fq->flags |= INET_FRAG_COMPLETE; + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); refcount_dec(&fq->refcnt); } } EXPORT_SYMBOL(inet_frag_kill); -void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) +static void inet_frag_destroy_rcu(struct rcu_head *head) +{ + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, + rcu); + struct inet_frags *f = q->net->f; + + if (f->destructor) + f->destructor(q); + kmem_cache_free(f->frags_cachep, q); +} + +void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; struct netns_frags *nf; unsigned int sum, sum_truesize = 0; + struct inet_frags *f; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); WARN_ON(del_timer(&q->timer) != 0); @@ -298,6 +135,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) /* Release all fragment data. */ fp = q->fragments; nf = q->net; + f = nf->f; while (fp) { struct sk_buff *xp = fp->next; @@ -307,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f) } sum = sum_truesize + f->qsize; - if (f->destructor) - f->destructor(q); - kmem_cache_free(f->frags_cachep, q); + call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(nf, sum); } EXPORT_SYMBOL(inet_frag_destroy); -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, - struct inet_frag_queue *qp_in, - struct inet_frags *f, - void *arg) -{ - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); - struct inet_frag_queue *qp; - -#ifdef CONFIG_SMP - /* With SMP race we have to recheck hash table, because - * such entry could have been created on other cpu before - * we acquired hash bucket lock. - */ - hlist_for_each_entry(qp, &hb->chain, list) { - if (qp->net == nf && f->match(qp, arg)) { - refcount_inc(&qp->refcnt); - spin_unlock(&hb->chain_lock); - qp_in->flags |= INET_FRAG_COMPLETE; - inet_frag_put(qp_in, f); - return qp; - } - } -#endif - qp = qp_in; - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) - refcount_inc(&qp->refcnt); - - refcount_inc(&qp->refcnt); - hlist_add_head(&qp->list, &hb->chain); - - spin_unlock(&hb->chain_lock); - - return qp; -} - static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { - inet_frag_schedule_worker(f); + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) return NULL; - } q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) @@ -371,70 +170,51 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 1); + refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, - struct inet_frags *f, void *arg) { + struct inet_frags *f = nf->f; struct inet_frag_queue *q; + int err; q = inet_frag_alloc(nf, f, arg); if (!q) return NULL; - return inet_frag_intern(nf, q, f, arg); + mod_timer(&q->timer, jiffies + nf->timeout); + + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, + f->rhash_params); + if (err < 0) { + q->flags |= INET_FRAG_COMPLETE; + inet_frag_kill(q); + inet_frag_destroy(q); + return NULL; + } + return q; } -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, - unsigned int hash) +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) { - struct inet_frag_bucket *hb; - struct inet_frag_queue *q; - int depth = 0; - - if (frag_mem_limit(nf) > nf->low_thresh) - inet_frag_schedule_worker(f); - - hash &= (INETFRAGS_HASHSZ - 1); - hb = &f->hash[hash]; - - spin_lock(&hb->chain_lock); - hlist_for_each_entry(q, &hb->chain, list) { - if (q->net == nf && f->match(q, key)) { - refcount_inc(&q->refcnt); - spin_unlock(&hb->chain_lock); - return q; - } - depth++; - } - spin_unlock(&hb->chain_lock); + struct inet_frag_queue *fq; - if (depth <= INETFRAGS_MAXDEPTH) - return inet_frag_create(nf, f, key); + rcu_read_lock(); - if (inet_frag_may_rebuild(f)) { - if (!f->rebuild) - f->rebuild = true; - inet_frag_schedule_worker(f); + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); + if (fq) { + if (!refcount_inc_not_zero(&fq->refcnt)) + fq = NULL; + rcu_read_unlock(); + return fq; } + rcu_read_unlock(); - return ERR_PTR(-ENOBUFS); + return inet_frag_create(nf, key); } EXPORT_SYMBOL(inet_frag_find); - -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, - const char *prefix) -{ - static const char msg[] = "inet_frag_find: Fragment hash bucket" - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) - ". Dropping fragment.\n"; - - if (PTR_ERR(q) == -ENOBUFS) - net_dbg_ratelimited("%s%s", prefix, msg); -} -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index bbf1b94942c0..994fa70a910f 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,27 +57,13 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -struct ipfrag_skb_cb -{ - struct inet_skb_parm h; - int offset; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; - int vif; /* L3 master device index */ unsigned int rid; struct inet_peer *peer; }; @@ -89,49 +75,9 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; -int ip_frag_mem(struct net *net) -{ - return sum_frag_mem_limit(&net->ipv4.frags); -} - static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev); -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} - -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { @@ -140,17 +86,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) frags); struct net *net = container_of(ipv4, struct net, ipv4); - const struct ip4_create_arg *arg = a; + const struct frag_v4_compare_key *key = a; - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; + q->key.v4 = *key; + qp->ecn = 0; qp->peer = q->net->max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : NULL; } @@ -168,7 +109,7 @@ static void ip4_frag_free(struct inet_frag_queue *q) static void ipq_put(struct ipq *ipq) { - inet_frag_put(&ipq->q, &ip4_frags); + inet_frag_put(&ipq->q); } /* Kill ipq entry. It is not destroyed immediately, @@ -176,7 +117,7 @@ static void ipq_put(struct ipq *ipq) */ static void ipq_kill(struct ipq *ipq) { - inet_frag_kill(&ipq->q, &ip4_frags); + inet_frag_kill(&ipq->q); } static bool frag_expire_skip_icmp(u32 user) @@ -194,8 +135,11 @@ static bool frag_expire_skip_icmp(u32 user) static void ip_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); - struct ipq *qp; + const struct iphdr *iph; + struct sk_buff *head; struct net *net; + struct ipq *qp; + int err; qp = container_of(frag, struct ipq, q); net = container_of(qp->q.net, struct net, ipv4.frags); @@ -209,46 +153,38 @@ static void ip_expire(struct timer_list *t) ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); - if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *clone, *head = qp->q.fragments; - const struct iphdr *iph; - int err; + head = qp->q.fragments; - __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); - if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) - goto out; + if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out; + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out; - /* skb has no dst, perform route lookup again */ - iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, + /* skb has no dst, perform route lookup again */ + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, iph->tos, head->dev); - if (err) - goto out; + if (err) + goto out; + + /* Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (frag_expire_skip_icmp(qp->q.key.v4.user) && + (skb_rtable(head)->rt_type != RTN_LOCAL)) + goto out; + + skb_get(head); + spin_unlock(&qp->q.lock); + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; - /* Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (frag_expire_skip_icmp(qp->user) && - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out; - - clone = skb_clone(head, GFP_ATOMIC); - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if (clone) { - spin_unlock(&qp->q.lock); - icmp_send(clone, ICMP_TIME_EXCEEDED, - ICMP_EXC_FRAGTIME, 0); - consume_skb(clone); - goto out_rcu_unlock; - } - } out: spin_unlock(&qp->q.lock); out_rcu_unlock: @@ -262,21 +198,20 @@ out_rcu_unlock: static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user, int vif) { + struct frag_v4_compare_key key = { + .saddr = iph->saddr, + .daddr = iph->daddr, + .user = user, + .vif = vif, + .id = iph->id, + .protocol = iph->protocol, + }; struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&net->ipv4.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct ipq, q); } @@ -410,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * this fragment, right? */ prev = qp->q.fragments_tail; - if (!prev || FRAG_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = qp->q.fragments; next != NULL; next = next->next) { - if (FRAG_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -427,7 +362,7 @@ found: * any overlaps are eliminated. */ if (prev) { - int i = (FRAG_CB(prev)->offset + prev->len) - offset; + int i = (prev->ip_defrag_offset + prev->len) - offset; if (i > 0) { offset += i; @@ -444,8 +379,8 @@ found: err = -ENOMEM; - while (next && FRAG_CB(next)->offset < end) { - int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + while (next && next->ip_defrag_offset < end) { + int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */ if (i < next->len) { /* Eat head of the next overlapped fragment @@ -453,7 +388,7 @@ found: */ if (!pskb_pull(next, i)) goto err; - FRAG_CB(next)->offset += i; + next->ip_defrag_offset += i; qp->q.meat -= i; if (next->ip_summed != CHECKSUM_UNNECESSARY) next->ip_summed = CHECKSUM_NONE; @@ -477,7 +412,13 @@ found: } } - FRAG_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + qp->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -488,11 +429,6 @@ found: else qp->q.fragments = skb; - dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; qp->ecn |= ecn; @@ -568,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, } WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Allocate a new buffer for the datagram. */ ihlen = ip_hdrlen(head); @@ -656,7 +592,7 @@ out_nomem: err = -ENOMEM; goto out_fail; out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); out_fail: __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; @@ -731,23 +667,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) EXPORT_SYMBOL(ip_check_defrag); #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv4.frags.low_thresh }, { .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.ipv4.frags.high_thresh }, @@ -846,6 +782,8 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { + int res; + /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -870,16 +808,21 @@ static int __net_init ipv4_frags_init_net(struct net *net) net->ipv4.frags.timeout = IP_FRAG_TIME; net->ipv4.frags.max_dist = 64; - - inet_frags_init_net(&net->ipv4.frags); - - return ip4_frags_ns_ctl_register(net); + net->ipv4.frags.f = &ip4_frags; + + res = inet_frags_init_net(&net->ipv4.frags); + if (res < 0) + return res; + res = ip4_frags_ns_ctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv4.frags); + return res; } static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); - inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); + inet_frags_exit_net(&net->ipv4.frags); } static struct pernet_operations ip4_frags_ops = { @@ -887,17 +830,49 @@ static struct pernet_operations ip4_frags_ops = { .exit = ipv4_frags_exit_net, }; + +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v4, + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); +} + +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v4_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static const struct rhashtable_params ip4_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .key_offset = offsetof(struct inet_frag_queue, key), + .key_len = sizeof(struct frag_v4_compare_key), + .hashfn = ip4_key_hashfn, + .obj_hashfn = ip4_obj_hashfn, + .obj_cmpfn = ip4_obj_cmpfn, + .automatic_shrinking = true, +}; + void __init ipfrag_init(void) { - ip4_frags_ctl_register(); - register_pernet_subsys(&ip4_frags_ops); - ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; ip4_frags.frags_cache_name = ip_frag_cache_name; + ip4_frags.rhash_params = ip4_rhash_params; if (inet_frags_init(&ip4_frags)) panic("IP: failed to allocate ip4_frags cache\n"); + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index adfb75340275..a058de677e94 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -54,7 +54,6 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem; int orphans, sockets; orphans = percpu_counter_sum_positive(&tcp_orphan_count); @@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplite_prot)); seq_printf(seq, "RAW: inuse %d\n", sock_prot_inuse_get(net, &raw_prot)); - frag_mem = ip_frag_mem(net); - seq_printf(seq, "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG: inuse %u memory %lu\n", + atomic_read(&net->ipv4.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv4.frags)); return 0; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b84ce3e6d728..3622aac343ae 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -52,18 +52,10 @@ static const char nf_frags_cache_name[] = "nf-frags"; -struct nf_ct_frag6_skb_cb -{ - struct inet6_skb_parm h; - int offset; -}; - -#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb)) - static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL -static int zero; +static long zero; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { @@ -76,18 +68,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_low_thresh", .data = &init_net.nf_frag.frags.low_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &zero, .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", .data = &init_net.nf_frag.frags.high_thresh, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.nf_frag.frags.low_thresh }, { } @@ -152,23 +144,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); } -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); -} - - -static unsigned int nf_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *nq; - - nq = container_of(q, struct frag_queue, q); - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); -} - static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -178,34 +153,26 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq, &nf_frags); + ip6_expire_frag_queue(net, fq); } /* Creation primitives. */ -static inline struct frag_queue *fq_find(struct net *net, __be32 id, - u32 user, struct in6_addr *src, - struct in6_addr *dst, int iif, u8 ecn) +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, + const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = user, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = user; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - - local_bh_disable(); - hash = nf_hash_frag(id, src, dst); - - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); - local_bh_enable(); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + + q = inet_frag_find(&net->nf_frag.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } @@ -264,7 +231,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); return -EPROTO; } if (end > fq->q.len) { @@ -295,13 +262,13 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (NFCT_FRAG6_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -317,14 +284,19 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) + (prev->ip_defrag_offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ - if (next && NFCT_FRAG6_CB(next)->offset < end) + if (next && next->ip_defrag_offset < end) goto discard_fq; - NFCT_FRAG6_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + if (skb->dev) + fq->iif = skb->dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -335,10 +307,6 @@ found: else fq->q.fragments = skb; - if (skb->dev) { - fq->iif = skb->dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -357,7 +325,7 @@ found: return 0; discard_fq: - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); err: return -EINVAL; } @@ -379,10 +347,10 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic int payload_len; u8 ecn; - inet_frag_kill(&fq->q, &nf_frags); + inet_frag_kill(&fq->q); WARN_ON(head == NULL); - WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -593,8 +561,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fhdr = (struct frag_hdr *)skb_transport_header(skb); skb_orphan(skb); - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + fq = fq_find(net, fhdr->identification, user, hdr, + skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; @@ -622,25 +590,33 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) out_unlock: spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q, &nf_frags); + inet_frag_put(&fq->q); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { + int res; + net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - inet_frags_init_net(&net->nf_frag.frags); - - return nf_ct_frag6_sysctl_register(net); + net->nf_frag.frags.f = &nf_frags; + + res = inet_frags_init_net(&net->nf_frag.frags); + if (res < 0) + return res; + res = nf_ct_frag6_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->nf_frag.frags); + return res; } static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags, &nf_frags); + inet_frags_exit_net(&net->nf_frag.frags); } static struct pernet_operations nf_ct_net_ops = { @@ -652,13 +628,12 @@ int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.hashfn = nf_hashfn; nf_frags.constructor = ip6_frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); - nf_frags.match = ip6_frag_match; nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; + nf_frags.rhash_params = ip6_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 6e57028d2e91..a85f7e0b14b1 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -38,7 +38,6 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; - unsigned int frag_mem = ip6_frag_mem(net); seq_printf(seq, "TCP6: inuse %d\n", sock_prot_inuse_get(net, &tcpv6_prot)); @@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) sock_prot_inuse_get(net, &udplitev6_prot)); seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); - seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem); + seq_printf(seq, "FRAG6: inuse %u memory %lu\n", + atomic_read(&net->ipv6.frags.rhashtable.nelems), + frag_mem_limit(&net->ipv6.frags)); return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 08a139f14d0f..7b52efc63f6a 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -62,13 +62,6 @@ static const char ip6_frag_cache_name[] = "ip6-frags"; -struct ip6frag_skb_cb { - struct inet6_skb_parm h; - int offset; -}; - -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) - static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); @@ -79,94 +72,58 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -/* - * callers should be careful not to use the hash value outside the ipfrag_lock - * as doing so could race with ipfrag_hash_rnd being recalculated. - */ -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, ip6_frags.rnd); -} - -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) -{ - const struct frag_queue *fq; - - fq = container_of(q, struct frag_queue, q); - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); -} - -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) -{ - const struct frag_queue *fq; - const struct ip6_create_arg *arg = a; - - fq = container_of(q, struct frag_queue, q); - return fq->id == arg->id && - fq->user == arg->user && - ipv6_addr_equal(&fq->saddr, arg->src) && - ipv6_addr_equal(&fq->daddr, arg->dst) && - (arg->iif == fq->iif || - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | - IPV6_ADDR_LINKLOCAL))); -} -EXPORT_SYMBOL(ip6_frag_match); - void ip6_frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct ip6_create_arg *arg = a; + const struct frag_v6_compare_key *key = a; - fq->id = arg->id; - fq->user = arg->user; - fq->saddr = *arg->src; - fq->daddr = *arg->dst; - fq->ecn = arg->ecn; + q->key.v6 = *key; + fq->ecn = 0; } EXPORT_SYMBOL(ip6_frag_init); -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, - struct inet_frags *frags) +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; + struct sk_buff *head; + rcu_read_lock(); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q, frags); + inet_frag_kill(&fq->q); - rcu_read_lock(); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) - goto out_rcu_unlock; + goto out; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - - if (inet_frag_evicting(&fq->q)) - goto out_rcu_unlock; - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) - goto out_rcu_unlock; + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; /* But use as source device on which LAST ARRIVED * segment was received. And do not use fq->dev * pointer directly, device might already disappeared. */ - fq->q.fragments->dev = dev; - icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); -out_rcu_unlock: - rcu_read_unlock(); + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, frags); +out_rcu_unlock: + rcu_read_unlock(); + inet_frag_put(&fq->q); } EXPORT_SYMBOL(ip6_expire_frag_queue); @@ -179,31 +136,29 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq, &ip6_frags); + ip6_expire_frag_queue(net, fq); } static struct frag_queue * -fq_find(struct net *net, __be32 id, const struct in6_addr *src, - const struct in6_addr *dst, int iif, u8 ecn) +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) { + struct frag_v6_compare_key key = { + .id = id, + .saddr = hdr->saddr, + .daddr = hdr->daddr, + .user = IP6_DEFRAG_LOCAL_DELIVER, + .iif = iif, + }; struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - - arg.id = id; - arg.user = IP6_DEFRAG_LOCAL_DELIVER; - arg.src = src; - arg.dst = dst; - arg.iif = iif; - arg.ecn = ecn; - hash = inet6_hash_frag(id, src, dst); + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | + IPV6_ADDR_LINKLOCAL))) + key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); + q = inet_frag_find(&net->ipv6.frags, &key); + if (!q) return NULL; - } + return container_of(q, struct frag_queue, q); } @@ -288,13 +243,13 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, * this fragment, right? */ prev = fq->q.fragments_tail; - if (!prev || FRAG6_CB(prev)->offset < offset) { + if (!prev || prev->ip_defrag_offset < offset) { next = NULL; goto found; } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { - if (FRAG6_CB(next)->offset >= offset) + if (next->ip_defrag_offset >= offset) break; /* bingo! */ prev = next; } @@ -309,14 +264,20 @@ found: /* Check for overlap with preceding fragment. */ if (prev && - (FRAG6_CB(prev)->offset + prev->len) > offset) + (prev->ip_defrag_offset + prev->len) > offset) goto discard_fq; /* Look for overlap with succeeding segment. */ - if (next && FRAG6_CB(next)->offset < end) + if (next && next->ip_defrag_offset < end) goto discard_fq; - FRAG6_CB(skb)->offset = offset; + /* Note : skb->ip_defrag_offset and skb->dev share the same location */ + dev = skb->dev; + if (dev) + fq->iif = dev->ifindex; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + skb->ip_defrag_offset = offset; /* Insert this fragment in the chain of fragments. */ skb->next = next; @@ -327,11 +288,6 @@ found: else fq->q.fragments = skb; - dev = skb->dev; - if (dev) { - fq->iif = dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -364,7 +320,7 @@ found: return -1; discard_fq: - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); err: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); @@ -391,7 +347,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, int sum_truesize; u8 ecn; - inet_frag_kill(&fq->q, &ip6_frags); + inet_frag_kill(&fq->q); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -418,7 +374,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, } WARN_ON(head == NULL); - WARN_ON(FRAG6_CB(head)->offset != 0); + WARN_ON(head->ip_defrag_offset != 0); /* Unfragmented part is taken from the first segment. */ payload_len = ((head->data - skb_network_header(head)) - @@ -531,6 +487,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -559,17 +516,18 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + iif = skb->dev ? skb->dev->ifindex : 0; + fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { int ret; spin_lock(&fq->q.lock); + fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q, &ip6_frags); + inet_frag_put(&fq->q); return ret; } @@ -596,15 +554,15 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", .data = &init_net.ipv6.frags.high_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_doulongvec_minmax, .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", .data = &init_net.ipv6.frags.low_thresh, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, @@ -711,19 +669,27 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { + int res; + net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; + net->ipv6.frags.f = &ip6_frags; - inet_frags_init_net(&net->ipv6.frags); + res = inet_frags_init_net(&net->ipv6.frags); + if (res < 0) + return res; - return ip6_frags_ns_sysctl_register(net); + res = ip6_frags_ns_sysctl_register(net); + if (res < 0) + inet_frags_exit_net(&net->ipv6.frags); + return res; } static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags, &ip6_frags); + inet_frags_exit_net(&net->ipv6.frags); } static struct pernet_operations ip6_frags_ops = { @@ -731,14 +697,55 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +const struct rhashtable_params ip6_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6_key_hashfn, + .obj_hashfn = ip6_obj_hashfn, + .obj_cmpfn = ip6_obj_cmpfn, + .automatic_shrinking = true, +}; +EXPORT_SYMBOL(ip6_rhash_params); + int __init ipv6_frag_init(void) { int ret; - ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + ip6_frags.constructor = ip6_frag_init; + ip6_frags.destructor = NULL; + ip6_frags.qsize = sizeof(struct frag_queue); + ip6_frags.frag_expire = ip6_frag_expire; + ip6_frags.frags_cache_name = ip6_frag_cache_name; + ip6_frags.rhash_params = ip6_rhash_params; + ret = inet_frags_init(&ip6_frags); if (ret) goto out; + ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT); + if (ret) + goto err_protocol; + ret = ip6_frags_sysctl_register(); if (ret) goto err_sysctl; @@ -747,16 +754,6 @@ int __init ipv6_frag_init(void) if (ret) goto err_pernet; - ip6_frags.hashfn = ip6_hashfn; - ip6_frags.constructor = ip6_frag_init; - ip6_frags.destructor = NULL; - ip6_frags.qsize = sizeof(struct frag_queue); - ip6_frags.match = ip6_frag_match; - ip6_frags.frag_expire = ip6_frag_expire; - ip6_frags.frags_cache_name = ip6_frag_cache_name; - ret = inet_frags_init(&ip6_frags); - if (ret) - goto err_pernet; out: return ret; @@ -764,6 +761,8 @@ err_pernet: ip6_frags_sysctl_unregister(); err_sysctl: inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); +err_protocol: + inet_frags_fini(&ip6_frags); goto out; } |