From 6fbabb20faed9c08f8b96de4182bd721cbd1cfcf Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 8 Aug 2011 11:16:56 -0500 Subject: slub: Fix full list corruption if debugging is on When a slab is freed by __slab_free() and the slab can only contain a single object ever then it was full (and therefore not on the partial lists but on the full list in the debug case) before we reached slab_empty. This caused the following full list corruption when SLUB debugging was enabled: [ 5913.233035] ------------[ cut here ]------------ [ 5913.233097] WARNING: at lib/list_debug.c:53 __list_del_entry+0x8d/0x98() [ 5913.233101] Hardware name: Adamo 13 [ 5913.233105] list_del corruption. prev->next should be ffffea000434fd20, but was ffffea0004199520 [ 5913.233108] Modules linked in: nfs fscache fuse ebtable_nat ebtables ppdev parport_pc lp parport ipt_MASQUERADE iptable_nat nf_nat nfsd lockd nfs_acl auth_rpcgss xt_CHECKSUM sunrpc iptable_mangle bridge stp llc cpufreq_ondemand acpi_cpufreq freq_table mperf ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables rfcomm bnep arc4 iwlagn snd_hda_codec_hdmi snd_hda_codec_idt snd_hda_intel btusb mac80211 snd_hda_codec bluetooth snd_hwdep snd_seq snd_seq_device snd_pcm usb_debug dell_wmi sparse_keymap cdc_ether usbnet cdc_acm uvcvideo cdc_wdm mii cfg80211 snd_timer dell_laptop videodev dcdbas snd microcode v4l2_compat_ioctl32 soundcore joydev tg3 pcspkr snd_page_alloc iTCO_wdt i2c_i801 rfkill iTCO_vendor_support wmi virtio_net kvm_intel kvm ipv6 xts gf128mul dm_crypt i915 drm_kms_helper drm i2c_algo_bit i2c_core video [last unloaded: scsi_wait_scan] [ 5913.233213] Pid: 0, comm: swapper Not tainted 3.0.0+ #127 [ 5913.233213] Call Trace: [ 5913.233213] [] warn_slowpath_common+0x83/0x9b [ 5913.233213] [] warn_slowpath_fmt+0x46/0x48 [ 5913.233213] [] __list_del_entry+0x8d/0x98 [ 5913.233213] [] list_del+0xe/0x2d [ 5913.233213] [] __slab_free+0x1db/0x235 [ 5913.233213] [] ? bvec_free_bs+0x35/0x37 [ 5913.233213] [] ? bvec_free_bs+0x35/0x37 [ 5913.233213] [] ? bvec_free_bs+0x35/0x37 [ 5913.233213] [] kmem_cache_free+0x88/0x102 [ 5913.233213] [] bvec_free_bs+0x35/0x37 [ 5913.233213] [] bio_free+0x34/0x64 [ 5913.233213] [] dm_bio_destructor+0x12/0x14 [ 5913.233213] [] bio_put+0x2b/0x2d [ 5913.233213] [] clone_endio+0x9e/0xb4 [ 5913.233213] [] bio_endio+0x2d/0x2f [ 5913.233213] [] crypt_dec_pending+0x5c/0x8b [dm_crypt] [ 5913.233213] [] crypt_endio+0x78/0x81 [dm_crypt] [ Full discussion here: https://lkml.org/lkml/2011/8/4/375 ] Make sure that we remove such a slab also from the full lists. Reported-and-tested-by: Dave Jones Reported-and-tested-by: Xiaotian Feng Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index eb5a8f93338a..5436fe2fbf9c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2387,11 +2387,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } + } else + /* Slab must be on the full list */ + remove_full(s, page); spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); -- cgit v1.2.3 From ef62fb32b7b21731e41aea3c1e08bcdb407c9eb9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 7 Aug 2011 18:30:38 +0900 Subject: slub: fix check_bytes() for slub debugging The check_bytes() function is used by slub debugging. It returns a pointer to the first unmatching byte for a character in the given memory area. If the character for matching byte is greater than 0x80, check_bytes() doesn't work. Becuase 64-bit pattern is generated as below. value64 = value | value << 8 | value << 16 | value << 24; value64 = value64 | value64 << 32; The integer promotions are performed and sign-extended as the type of value is u8. The upper 32 bits of value64 is 0xffffffff in the first line, and the second line has no effect. This fixes the 64-bit pattern generation. Signed-off-by: Akinobu Mita Cc: Christoph Lameter Cc: Matt Mackall Reviewed-by: Marcin Slusarz Acked-by: Eric Dumazet Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 5436fe2fbf9c..6da68597bde8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -701,7 +701,7 @@ static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) return check_bytes8(start, value, bytes); value64 = value | value << 8 | value << 16 | value << 24; - value64 = value64 | value64 << 32; + value64 = (value64 & 0xffffffff) | value64 << 32; prefix = 8 - ((unsigned long)start) % 8; if (prefix) { -- cgit v1.2.3 From 81107188f123e3c2217ac2f2feb2a1147904c62f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 9 Aug 2011 13:01:32 -0500 Subject: slub: Fix partial count comparison confusion deactivate_slab() has the comparison if more than the minimum number of partial pages are in the partial list wrong. An effect of this may be that empty pages are not freed from deactivate_slab(). The result could be an OOM due to growth of the partial slabs per node. Frees mostly occur from __slab_free which is okay so this would only affect use cases where a lot of switching around of per cpu slabs occur. Switching per cpu slabs occurs with high frequency if debugging options are enabled. Reported-and-tested-by: Xiaotian Feng Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 6da68597bde8..9f662d70eb47 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1854,7 +1854,7 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial < s->min_partial) + if (!new.inuse && n->nr_partial > s->min_partial) m = M_FREE; else if (new.freelist) { m = M_PARTIAL; -- cgit v1.2.3 From 130655ef097940b627e8e04fa7c6f3b51cf24f85 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 23 Aug 2011 08:36:59 +0800 Subject: slub: add slab with one free object to partial list tail The slab has just one free object, adding it to partial list head doesn't make sense. And it can cause lock contentation. For example, 1. CPU takes the slab from partial list 2. fetch an object 3. switch to another slab 4. free an object, then the slab is added to partial list again In this way n->list_lock will be heavily contended. In fact, Alex had a hackbench regression. 3.1-rc1 performance drops about 70% against 3.0. This patch fixes it. Acked-by: Christoph Lameter Reported-by: Alex Shi Signed-off-by: Shaohua Li Signed-off-by: Shaohua Li Signed-off-by: Pekka Enberg --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 9f662d70eb47..7c54fe83a90c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2377,7 +2377,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (unlikely(!prior)) { remove_full(s, page); - add_partial(n, page, 0); + add_partial(n, page, 1); stat(s, FREE_ADD_PARTIAL); } } -- cgit v1.2.3 From 136333d104bd3a62d783b0ac3d0f32ac0108c5d0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 24 Aug 2011 08:57:52 +0800 Subject: slub: explicitly document position of inserting slab to partial list Adding slab to partial list head/tail is sensitive to performance. So explicitly uses DEACTIVATE_TO_TAIL/DEACTIVATE_TO_HEAD to document it to avoid we get it wrong. Acked-by: Christoph Lameter Signed-off-by: Shaohua Li Signed-off-by: Shaohua Li Signed-off-by: Pekka Enberg --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/slub.c') diff --git a/mm/slub.c b/mm/slub.c index 7c54fe83a90c..91a120f185d1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1534,7 +1534,7 @@ static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); @@ -1781,13 +1781,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) enum slab_modes l = M_NONE, m = M_NONE; void *freelist; void *nextfree; - int tail = 0; + int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); - tail = 1; + tail = DEACTIVATE_TO_TAIL; } c->tid = next_tid(c->tid); @@ -1893,7 +1893,7 @@ redo: if (m == M_PARTIAL) { add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail); } else if (m == M_FULL) { @@ -2377,7 +2377,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (unlikely(!prior)) { remove_full(s, page); - add_partial(n, page, 1); + add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } @@ -2695,7 +2695,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) -- cgit v1.2.3