summaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/fault.c34
-rw-r--r--arch/x86/mm/gup.c67
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/init.c20
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c15
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c228
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c640
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c69
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c162
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h16
-rw-r--r--arch/x86/mm/pageattr.c106
-rw-r--r--arch/x86/mm/pgtable.c19
-rw-r--r--arch/x86/mm/srat_64.c6
-rw-r--r--arch/x86/mm/tlb.c21
23 files changed, 1481 insertions, 96 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fdd30d08ab52..eefdeee8a871 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
+obj-$(CONFIG_KMEMCHECK) += kmemcheck/
+
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c6acc6326374..bfae139182ff 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -14,6 +14,7 @@
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
+#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
/*
* Page fault error code bits:
@@ -425,10 +426,11 @@ static noinline int vmalloc_fault(unsigned long address)
}
static const char errata93_warning[] =
-KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
-KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n"
-KERN_ERR "******* Please consider a BIOS update.\n"
-KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n";
+KERN_ERR
+"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+"******* Working around it, but it may cause SEGVs or burn power.\n"
+"******* Please consider a BIOS update.\n"
+"******* Disabling USB legacy in the BIOS may also help.\n";
/*
* No vm86 mode in 64-bit mode:
@@ -695,7 +697,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
if (!printk_ratelimit())
return;
- printk(KERN_CONT "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+ printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
tsk->comm, task_pid_nr(tsk), address,
(void *)regs->ip, (void *)regs->sp, error_code);
@@ -951,11 +953,17 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
tsk = current;
mm = tsk->mm;
- prefetchw(&mm->mmap_sem);
-
/* Get the faulting address: */
address = read_cr2();
+ /*
+ * Detect and handle instructions that would cause a page fault for
+ * both a tracked kernel page and a userspace page.
+ */
+ if (kmemcheck_active(regs))
+ kmemcheck_hide(regs);
+ prefetchw(&mm->mmap_sem);
+
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -973,9 +981,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
* protection error (error_code & 9) == 0.
*/
if (unlikely(fault_in_kernel_space(address))) {
- if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
- vmalloc_fault(address) >= 0)
- return;
+ if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+ if (vmalloc_fault(address) >= 0)
+ return;
+
+ if (kmemcheck_fault(regs, address, error_code))
+ return;
+ }
/* Can handle a stale RO->RW TLB: */
if (spurious_fault(error_code, address))
@@ -1102,7 +1114,7 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault:
*/
- fault = handle_mm_fault(mm, vma, address, write);
+ fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
if (unlikely(fault & VM_FAULT_ERROR)) {
mm_fault_error(regs, error_code, address, fault);
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 6340cef6798a..71da1bca13cb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -14,7 +14,7 @@
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
- return *ptep;
+ return ACCESS_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
@@ -219,6 +219,62 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ unsigned long flags;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ (void __user *)start, len)))
+ return 0;
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables and pages from being freed on x86.
+ *
+ * So long as we atomically load page table pointers versus teardown
+ * (which we do on x86, with the above PAE exception), we can follow the
+ * address down to the the page and take a ref on it.
+ */
+ local_irq_save(flags);
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ break;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ break;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_restore(flags);
+
+ return nr;
+}
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -247,11 +303,16 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
+
end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
+ if (end < start)
goto slow_irqon;
+#ifdef CONFIG_X86_64
+ if (end >> __VIRTUAL_MASK_SHIFT)
+ goto slow_irqon;
+#endif
+
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 58f621e81919..2112ed55e7ea 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmap);
EXPORT_SYMBOL(kunmap);
EXPORT_SYMBOL(kmap_atomic);
EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_prot);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 34c1bfb64f1c..0607119cef94 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -12,6 +12,7 @@
#include <asm/system.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
+#include <asm/proto.h>
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -177,20 +178,6 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
return nr_range;
}
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
- if (direct_gbpages && cpu_has_gbpages)
- printk(KERN_INFO "Using GB pages for direct mapping\n");
- else
- direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
-#endif
-
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
@@ -210,10 +197,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
- if (!after_bootmem)
- init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9ff3c0816d15..3cd7711bb949 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -111,7 +111,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
pte_t *page_table = NULL;
if (after_bootmem) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
#endif
if (!page_table)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 52bb9519bb86..6176fe8f29e0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -104,7 +104,7 @@ static __ref void *spp_getpage(void)
void *ptr;
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -281,7 +281,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
void *adr;
if (after_bootmem) {
- adr = (void *)get_zeroed_page(GFP_ATOMIC);
+ adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
*phys = __pa(adr);
return adr;
@@ -527,7 +527,7 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
return phys_pud_init(pud, addr, end, page_size_mask);
}
-unsigned long __init
+unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
@@ -598,6 +598,15 @@ void __init paging_init(void)
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
+
+ /*
+ * clear the default setting with node 0
+ * note: don't use nodes_clear here, that is really clearing when
+ * numa support is not compiled in, and later node_set_state
+ * will not set it back.
+ */
+ node_clear_state(0, N_NORMAL_MEMORY);
+
free_area_init_nodes(max_zone_pfns);
}
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000000..520b3bce4095
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
+obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000000..4901d0dafda6
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,228 @@
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/kmemcheck.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include "error.h"
+#include "shadow.h"
+
+enum kmemcheck_error_type {
+ KMEMCHECK_ERROR_INVALID_ACCESS,
+ KMEMCHECK_ERROR_BUG,
+};
+
+#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
+
+struct kmemcheck_error {
+ enum kmemcheck_error_type type;
+
+ union {
+ /* KMEMCHECK_ERROR_INVALID_ACCESS */
+ struct {
+ /* Kind of access that caused the error */
+ enum kmemcheck_shadow state;
+ /* Address and size of the erroneous read */
+ unsigned long address;
+ unsigned int size;
+ };
+ };
+
+ struct pt_regs regs;
+ struct stack_trace trace;
+ unsigned long trace_entries[32];
+
+ /* We compress it to a char. */
+ unsigned char shadow_copy[SHADOW_COPY_SIZE];
+ unsigned char memory_copy[SHADOW_COPY_SIZE];
+};
+
+/*
+ * Create a ring queue of errors to output. We can't call printk() directly
+ * from the kmemcheck traps, since this may call the console drivers and
+ * result in a recursive fault.
+ */
+static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
+static unsigned int error_count;
+static unsigned int error_rd;
+static unsigned int error_wr;
+static unsigned int error_missed_count;
+
+static struct kmemcheck_error *error_next_wr(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == ARRAY_SIZE(error_fifo)) {
+ ++error_missed_count;
+ return NULL;
+ }
+
+ e = &error_fifo[error_wr];
+ if (++error_wr == ARRAY_SIZE(error_fifo))
+ error_wr = 0;
+ ++error_count;
+ return e;
+}
+
+static struct kmemcheck_error *error_next_rd(void)
+{
+ struct kmemcheck_error *e;
+
+ if (error_count == 0)
+ return NULL;
+
+ e = &error_fifo[error_rd];
+ if (++error_rd == ARRAY_SIZE(error_fifo))
+ error_rd = 0;
+ --error_count;
+ return e;
+}
+
+void kmemcheck_error_recall(void)
+{
+ static const char *desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
+ [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
+ [KMEMCHECK_SHADOW_FREED] = "freed",
+ };
+
+ static const char short_desc[] = {
+ [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
+ [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
+ [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
+ [KMEMCHECK_SHADOW_FREED] = 'f',
+ };
+
+ struct kmemcheck_error *e;
+ unsigned int i;
+
+ e = error_next_rd();
+ if (!e)
+ return;
+
+ switch (e->type) {
+ case KMEMCHECK_ERROR_INVALID_ACCESS:
+ printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read "
+ "from %s memory (%p)\n",
+ 8 * e->size, e->state < ARRAY_SIZE(desc) ?
+ desc[e->state] : "(invalid shadow state)",
+ (void *) e->address);
+
+ printk(KERN_INFO);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i)
+ printk("%02x", e->memory_copy[i]);
+ printk("\n");
+
+ printk(KERN_INFO);
+ for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
+ if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
+ printk(" %c", short_desc[e->shadow_copy[i]]);
+ else
+ printk(" ?");
+ }
+ printk("\n");
+ printk(KERN_INFO "%*c\n", 2 + 2
+ * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
+ break;
+ case KMEMCHECK_ERROR_BUG:
+ printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
+ break;
+ }
+
+ __show_regs(&e->regs, 1);
+ print_stack_trace(&e->trace, 0);
+}
+
+static void do_wakeup(unsigned long data)
+{
+ while (error_count > 0)
+ kmemcheck_error_recall();
+
+ if (error_missed_count > 0) {
+ printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
+ "the queue was too small\n", error_missed_count);
+ error_missed_count = 0;
+ }
+}
+
+static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
+
+/*
+ * Save the context of an error report.
+ */
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs)
+{
+ static unsigned long prev_ip;
+
+ struct kmemcheck_error *e;
+ void *shadow_copy;
+ void *memory_copy;
+
+ /* Don't report several adjacent errors from the same EIP. */
+ if (regs->ip == prev_ip)
+ return;
+ prev_ip = regs->ip;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
+
+ e->state = state;
+ e->address = address;
+ e->size = size;
+
+ /* Save regs */
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ /* Save stack trace */
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 0;
+ save_stack_trace_bp(&e->trace, regs->bp);
+
+ /* Round address down to nearest 16 bytes */
+ shadow_copy = kmemcheck_shadow_lookup(address
+ & ~(SHADOW_COPY_SIZE - 1));
+ BUG_ON(!shadow_copy);
+
+ memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
+
+ kmemcheck_show_addr(address);
+ memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
+ memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
+ kmemcheck_hide_addr(address);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
+
+/*
+ * Save the context of a kmemcheck bug.
+ */
+void kmemcheck_error_save_bug(struct pt_regs *regs)
+{
+ struct kmemcheck_error *e;
+
+ e = error_next_wr();
+ if (!e)
+ return;
+
+ e->type = KMEMCHECK_ERROR_BUG;
+
+ memcpy(&e->regs, regs, sizeof(*regs));
+
+ e->trace.nr_entries = 0;
+ e->trace.entries = e->trace_entries;
+ e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+ e->trace.skip = 1;
+ save_stack_trace(&e->trace);
+
+ tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000000..0efc2e8d0a20
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
+#define ARCH__X86__MM__KMEMCHECK__ERROR_H
+
+#include <linux/ptrace.h>
+
+#include "shadow.h"
+
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+ unsigned long address, unsigned int size, struct pt_regs *regs);
+
+void kmemcheck_error_save_bug(struct pt_regs *regs);
+
+void kmemcheck_error_recall(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000000..2c55ed098654
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,640 @@
+/**
+ * kmemcheck - a heavyweight memory checker for the linux kernel
+ * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
+ * (With a lot of help from Ingo Molnar and Pekka Enberg.)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2) as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kmemcheck.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+#include "error.h"
+#include "opcode.h"
+#include "pte.h"
+#include "selftest.h"
+#include "shadow.h"
+
+
+#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 0
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
+# define KMEMCHECK_ENABLED 1
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
+# define KMEMCHECK_ENABLED 2
+#endif
+
+int kmemcheck_enabled = KMEMCHECK_ENABLED;
+
+int __init kmemcheck_init(void)
+{
+#ifdef CONFIG_SMP
+ /*
+ * Limit SMP to use a single CPU. We rely on the fact that this code
+ * runs before SMP is set up.
+ */
+ if (setup_max_cpus > 1) {
+ printk(KERN_INFO
+ "kmemcheck: Limiting number of CPUs to 1.\n");
+ setup_max_cpus = 1;
+ }
+#endif
+
+ if (!kmemcheck_selftest()) {
+ printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
+ kmemcheck_enabled = 0;
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "kmemcheck: Initialized\n");
+ return 0;
+}
+
+early_initcall(kmemcheck_init);
+
+/*
+ * We need to parse the kmemcheck= option before any memory is allocated.
+ */
+static int __init param_kmemcheck(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ sscanf(str, "%d", &kmemcheck_enabled);
+ return 0;
+}
+
+early_param("kmemcheck", param_kmemcheck);
+
+int kmemcheck_show_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+int kmemcheck_hide_addr(unsigned long address)
+{
+ pte_t *pte;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return 0;
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ __flush_tlb_one(address);
+ return 1;
+}
+
+struct kmemcheck_context {
+ bool busy;
+ int balance;
+
+ /*
+ * There can be at most two memory operands to an instruction, but
+ * each address can cross a page boundary -- so we may need up to
+ * four addresses that must be hidden/revealed for each fault.
+ */
+ unsigned long addr[4];
+ unsigned long n_addrs;
+ unsigned long flags;
+
+ /* Data size of the instruction that caused a fault. */
+ unsigned int size;
+};
+
+static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
+
+bool kmemcheck_active(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ return data->balance > 0;
+}
+
+/* Save an address that needs to be shown/hidden */
+static void kmemcheck_save_addr(unsigned long addr)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
+ data->addr[data->n_addrs++] = addr;
+}
+
+static unsigned int kmemcheck_show_all(void)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_show_addr(data->addr[i]);
+
+ return n;
+}
+
+static unsigned int kmemcheck_hide_all(void)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ unsigned int i;
+ unsigned int n;
+
+ n = 0;
+ for (i = 0; i < data->n_addrs; ++i)
+ n += kmemcheck_hide_addr(data->addr[i]);
+
+ return n;
+}
+
+/*
+ * Called from the #PF handler.
+ */
+void kmemcheck_show(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ BUG_ON(!irqs_disabled());
+
+ if (unlikely(data->balance != 0)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->balance = 0;
+ return;
+ }
+
+ /*
+ * None of the addresses actually belonged to kmemcheck. Note that
+ * this is not an error.
+ */
+ if (kmemcheck_show_all() == 0)
+ return;
+
+ ++data->balance;
+
+ /*
+ * The IF needs to be cleared as well, so that the faulting
+ * instruction can run "uninterrupted". Otherwise, we might take
+ * an interrupt and start executing that before we've had a chance
+ * to hide the page again.
+ *
+ * NOTE: In the rare case of multiple faults, we must not override
+ * the original flags:
+ */
+ if (!(regs->flags & X86_EFLAGS_TF))
+ data->flags = regs->flags;
+
+ regs->flags |= X86_EFLAGS_TF;
+ regs->flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * Called from the #DB handler.
+ */
+void kmemcheck_hide(struct pt_regs *regs)
+{
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ int n;
+
+ BUG_ON(!irqs_disabled());
+
+ if (data->balance == 0)
+ return;
+
+ if (unlikely(data->balance != 1)) {
+ kmemcheck_show_all();
+ kmemcheck_error_save_bug(regs);
+ data->n_addrs = 0;
+ data->balance = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+ return;
+ }
+
+ if (kmemcheck_enabled)
+ n = kmemcheck_hide_all();
+ else
+ n = kmemcheck_show_all();
+
+ if (n == 0)
+ return;
+
+ --data->balance;
+
+ data->n_addrs = 0;
+
+ if (!(data->flags & X86_EFLAGS_TF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (data->flags & X86_EFLAGS_IF)
+ regs->flags |= X86_EFLAGS_IF;
+}
+
+void kmemcheck_show_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+bool kmemcheck_page_is_tracked(struct page *p)
+{
+ /* This will also check the "hidden" flag of the PTE. */
+ return kmemcheck_pte_lookup((unsigned long) page_address(p));
+}
+
+void kmemcheck_hide_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i) {
+ unsigned long address;
+ pte_t *pte;
+ unsigned int level;
+
+ address = (unsigned long) page_address(&p[i]);
+ pte = lookup_address(address, &level);
+ BUG_ON(!pte);
+ BUG_ON(level != PG_LEVEL_4K);
+
+ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+ set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
+ __flush_tlb_one(address);
+ }
+}
+
+/* Access may NOT cross page boundary */
+static void kmemcheck_read_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+ enum kmemcheck_shadow status;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+
+ /* Don't warn about it again. */
+ kmemcheck_shadow_set(shadow, size);
+}
+
+/* Access may cross page boundary */
+static void kmemcheck_read(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_read_strict(regs, addr, size);
+ return;
+ }
+
+ /*
+ * What we do is basically to split the access across the
+ * two pages and handle each part separately. Yes, this means
+ * that we may now see reads that are 3 + 5 bytes, for
+ * example (and if both are uninitialized, there will be two
+ * reports), but it makes the code a lot simpler.
+ */
+ kmemcheck_read_strict(regs, addr, next_page - addr);
+ kmemcheck_read_strict(regs, next_page, next_addr - next_page);
+}
+
+static void kmemcheck_write_strict(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ void *shadow;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (!shadow)
+ return;
+
+ kmemcheck_save_addr(addr);
+ kmemcheck_shadow_set(shadow, size);
+}
+
+static void kmemcheck_write(struct pt_regs *regs,
+ unsigned long addr, unsigned int size)
+{
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long next_addr = addr + size - 1;
+ unsigned long next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ kmemcheck_write_strict(regs, addr, size);
+ return;
+ }
+
+ /* See comment in kmemcheck_read(). */
+ kmemcheck_write_strict(regs, addr, next_page - addr);
+ kmemcheck_write_strict(regs, next_page, next_addr - next_page);
+}
+
+/*
+ * Copying is hard. We have two addresses, each of which may be split across
+ * a page (and each page will have different shadow addresses).
+ */
+static void kmemcheck_copy(struct pt_regs *regs,
+ unsigned long src_addr, unsigned long dst_addr, unsigned int size)
+{
+ uint8_t shadow[8];
+ enum kmemcheck_shadow status;
+
+ unsigned long page;
+ unsigned long next_addr;
+ unsigned long next_page;
+
+ uint8_t *x;
+ unsigned int i;
+ unsigned int n;
+
+ BUG_ON(size > sizeof(shadow));
+
+ page = src_addr & PAGE_MASK;
+ next_addr = src_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < size; ++i)
+ shadow[i] = x[i];
+ } else {
+ for (i = 0; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ } else {
+ n = next_page - src_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(src_addr);
+ if (x) {
+ kmemcheck_save_addr(src_addr);
+ for (i = 0; i < n; ++i)
+ shadow[i] = x[i];
+ } else {
+ /* Not tracked */
+ for (i = 0; i < n; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i)
+ shadow[i] = x[i - n];
+ } else {
+ /* Not tracked */
+ for (i = n; i < size; ++i)
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ page = dst_addr & PAGE_MASK;
+ next_addr = dst_addr + size - 1;
+ next_page = next_addr & PAGE_MASK;
+
+ if (likely(page == next_page)) {
+ /* Same page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < size; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ } else {
+ n = next_page - dst_addr;
+ BUG_ON(n > sizeof(shadow));
+
+ /* First page */
+ x = kmemcheck_shadow_lookup(dst_addr);
+ if (x) {
+ kmemcheck_save_addr(dst_addr);
+ for (i = 0; i < n; ++i) {
+ x[i] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+
+ /* Second page */
+ x = kmemcheck_shadow_lookup(next_page);
+ if (x) {
+ kmemcheck_save_addr(next_page);
+ for (i = n; i < size; ++i) {
+ x[i - n] = shadow[i];
+ shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+ }
+ }
+ }
+
+ status = kmemcheck_shadow_test(shadow, size);
+ if (status == KMEMCHECK_SHADOW_INITIALIZED)
+ return;
+
+ if (kmemcheck_enabled)
+ kmemcheck_error_save(status, src_addr, size, regs);
+
+ if (kmemcheck_enabled == 2)
+ kmemcheck_enabled = 0;
+}
+
+enum kmemcheck_method {
+ KMEMCHECK_READ,
+ KMEMCHECK_WRITE,
+};
+
+static void kmemcheck_access(struct pt_regs *regs,
+ unsigned long fallback_address, enum kmemcheck_method fallback_method)
+{
+ const uint8_t *insn;
+ const uint8_t *insn_primary;
+ unsigned int size;
+
+ struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+
+ /* Recursive fault -- ouch. */
+ if (data->busy) {
+ kmemcheck_show_addr(fallback_address);
+ kmemcheck_error_save_bug(regs);
+ return;
+ }
+
+ data->busy = true;
+
+ insn = (const uint8_t *) regs->ip;
+ insn_primary = kmemcheck_opcode_get_primary(insn);
+
+ kmemcheck_opcode_decode(insn, &size);
+
+ switch (insn_primary[0]) {
+#ifdef CONFIG_KMEMCHECK_BITOPS_OK
+ /* AND, OR, XOR */
+ /*
+ * Unfortunately, these instructions have to be excluded from
+ * our regular checking since they access only some (and not
+ * all) bits. This clears out "bogus" bitfield-access warnings.
+ */
+ case 0x80:
+ case 0x81:
+ case 0x82:
+ case 0x83:
+ switch ((insn_primary[1] >> 3) & 7) {
+ /* OR */
+ case 1:
+ /* AND */
+ case 4:
+ /* XOR */
+ case 6:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+
+ /* ADD */
+ case 0:
+ /* ADC */
+ case 2:
+ /* SBB */
+ case 3:
+ /* SUB */
+ case 5:
+ /* CMP */
+ case 7:
+ break;
+ }
+ break;
+#endif
+
+ /* MOVS, MOVSB, MOVSW, MOVSD */
+ case 0xa4:
+ case 0xa5:
+ /*
+ * These instructions are special because they take two
+ * addresses, but we only get one page fault.
+ */
+ kmemcheck_copy(regs, regs->si, regs->di, size);
+ goto out;
+
+ /* CMPS, CMPSB, CMPSW, CMPSD */
+ case 0xa6:
+ case 0xa7:
+ kmemcheck_read(regs, regs->si, size);
+ kmemcheck_read(regs, regs->di, size);
+ goto out;
+ }
+
+ /*
+ * If the opcode isn't special in any way, we use the data from the
+ * page fault handler to determine the address and type of memory
+ * access.
+ */
+ switch (fallback_method) {
+ case KMEMCHECK_READ:
+ kmemcheck_read(regs, fallback_address, size);
+ goto out;
+ case KMEMCHECK_WRITE:
+ kmemcheck_write(regs, fallback_address, size);
+ goto out;
+ }
+
+out:
+ data->busy = false;
+}
+
+bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
+ unsigned long error_code)
+{
+ pte_t *pte;
+
+ /*
+ * XXX: Is it safe to assume that memory accesses from virtual 86
+ * mode or non-kernel code segments will _never_ access kernel
+ * memory (e.g. tracked pages)? For now, we need this to avoid
+ * invoking kmemcheck for PnP BIOS calls.
+ */
+ if (regs->flags & X86_VM_MASK)
+ return false;
+ if (regs->cs != __KERNEL_CS)
+ return false;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return false;
+
+ if (error_code & 2)
+ kmemcheck_access(regs, address, KMEMCHECK_WRITE);
+ else
+ kmemcheck_access(regs, address, KMEMCHECK_READ);
+
+ kmemcheck_show(regs);
+ return true;
+}
+
+bool kmemcheck_trap(struct pt_regs *regs)
+{
+ if (!kmemcheck_active(regs))
+ return false;
+
+ /* We're done. */
+ kmemcheck_hide(regs);
+ return true;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000000..63c19e27aa6f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
+#include <linux/types.h>
+
+#include "opcode.h"
+
+static bool opcode_is_prefix(uint8_t b)
+{
+ return
+ /* Group 1 */
+ b == 0xf0 || b == 0xf2 || b == 0xf3
+ /* Group 2 */
+ || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
+ || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e
+ /* Group 3 */
+ || b == 0x66
+ /* Group 4 */
+ || b == 0x67;
+}
+
+#ifdef CONFIG_X86_64
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return (b & 0xf0) == 0x40;
+}
+#else
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+ return false;
+}
+#endif
+
+#define REX_W (1 << 3)
+
+/*
+ * This is a VERY crude opcode decoder. We only need to find the size of the
+ * load/store that caused our #PF and this should work for all the opcodes
+ * that we care about. Moreover, the ones who invented this instruction set
+ * should be shot.
+ */
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
+{
+ /* Default operand size */
+ int operand_size_override = 4;
+
+ /* prefixes */
+ for (; opcode_is_prefix(*op); ++op) {
+ if (*op == 0x66)
+ operand_size_override = 2;
+ }
+
+ /* REX prefix */
+ if (opcode_is_rex_prefix(*op)) {
+ uint8_t rex = *op;
+
+ ++op;
+ if (rex & REX_W) {
+ switch (*op) {
+ case 0x63:
+ *size = 4;
+ return;
+ case 0x0f:
+ ++op;
+
+ switch (*op) {
+ case 0xb6:
+ case 0xbe:
+ *size = 1;
+ return;
+ case 0xb7:
+ case 0xbf:
+ *size = 2;
+ return;
+ }
+
+ break;
+ }
+
+ *size = 8;
+ return;
+ }
+ }
+
+ /* escape opcode */
+ if (*op == 0x0f) {
+ ++op;
+
+ /*
+ * This is move with zero-extend and sign-extend, respectively;
+ * we don't have to think about 0xb6/0xbe, because this is
+ * already handled in the conditional below.
+ */
+ if (*op == 0xb7 || *op == 0xbf)
+ operand_size_override = 2;
+ }
+
+ *size = (*op & 1) ? operand_size_override : 1;
+}
+
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
+{
+ /* skip prefixes */
+ while (opcode_is_prefix(*op))
+ ++op;
+ if (opcode_is_rex_prefix(*op))
+ ++op;
+ return op;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000000..6956aad66b5b
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
+#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
+
+#include <linux/types.h>
+
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000000..4ead26eeaf96
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+#include "pte.h"
+
+pte_t *kmemcheck_pte_lookup(unsigned long address)
+{
+ pte_t *pte;
+ unsigned int level;
+
+ pte = lookup_address(address, &level);
+ if (!pte)
+ return NULL;
+ if (level != PG_LEVEL_4K)
+ return NULL;
+ if (!pte_hidden(*pte))
+ return NULL;
+
+ return pte;
+}
+
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000000..9f5966456492
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
+#define ARCH__X86__MM__KMEMCHECK__PTE_H
+
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+pte_t *kmemcheck_pte_lookup(unsigned long address);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000000..036efbea8b28
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,69 @@
+#include <linux/kernel.h>
+
+#include "opcode.h"
+#include "selftest.h"
+
+struct selftest_opcode {
+ unsigned int expected_size;
+ const uint8_t *insn;
+ const char *desc;
+};
+
+static const struct selftest_opcode selftest_opcodes[] = {
+ /* REP MOVS */
+ {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
+ {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
+
+ /* MOVZX / MOVZXD */
+ {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
+ {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
+ {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
+
+#ifdef CONFIG_X86_64
+ /* MOVZX / MOVZXD */
+ {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
+
+ /* MOVSX / MOVSXD */
+ {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
+ {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
+ {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
+#endif
+};
+
+static bool selftest_opcode_one(const struct selftest_opcode *op)
+{
+ unsigned size;
+
+ kmemcheck_opcode_decode(op->insn, &size);
+
+ if (size == op->expected_size)
+ return true;
+
+ printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
+ op->desc, op->expected_size, size);
+ return false;
+}
+
+static bool selftest_opcodes_all(void)
+{
+ bool pass = true;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
+ pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
+
+ return pass;
+}
+
+bool kmemcheck_selftest(void)
+{
+ bool pass = true;
+
+ pass = pass && selftest_opcodes_all();
+
+ return pass;
+}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000000..8fed4fe11f95
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
+#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+
+bool kmemcheck_selftest(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000000..e773b6bd0079
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,162 @@
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include "pte.h"
+#include "shadow.h"
+
+/*
+ * Return the shadow address for the given address. Returns NULL if the
+ * address is not tracked.
+ *
+ * We need to be extremely careful not to follow any invalid pointers,
+ * because this function can be called for *any* possible address.
+ */
+void *kmemcheck_shadow_lookup(unsigned long address)
+{
+ pte_t *pte;
+ struct page *page;
+
+ if (!virt_addr_valid(address))
+ return NULL;
+
+ pte = kmemcheck_pte_lookup(address);
+ if (!pte)
+ return NULL;
+
+ page = virt_to_page(address);
+ if (!page->shadow)
+ return NULL;
+ return page->shadow + (address & (PAGE_SIZE - 1));
+}
+
+static void mark_shadow(void *address, unsigned int n,
+ enum kmemcheck_shadow status)
+{
+ unsigned long addr = (unsigned long) address;
+ unsigned long last_addr = addr + n - 1;
+ unsigned long page = addr & PAGE_MASK;
+ unsigned long last_page = last_addr & PAGE_MASK;
+ unsigned int first_n;
+ void *shadow;
+
+ /* If the memory range crosses a page boundary, stop there. */
+ if (page == last_page)
+ first_n = n;
+ else
+ first_n = page + PAGE_SIZE - addr;
+
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, first_n);
+
+ addr += first_n;
+ n -= first_n;
+
+ /* Do full-page memset()s. */
+ while (n >= PAGE_SIZE) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, PAGE_SIZE);
+
+ addr += PAGE_SIZE;
+ n -= PAGE_SIZE;
+ }
+
+ /* Do the remaining page, if any. */
+ if (n > 0) {
+ shadow = kmemcheck_shadow_lookup(addr);
+ if (shadow)
+ memset(shadow, status, n);
+ }
+}
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
+}
+
+void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
+}
+
+/*
+ * Fill the shadow memory of the given address such that the memory at that
+ * address is marked as being initialized.
+ */
+void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
+}
+EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
+
+void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+ mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
+}
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
+{
+ unsigned int i;
+
+ for (i = 0; i < n; ++i)
+ kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
+ /*
+ * Make sure _some_ bytes are initialized. Gcc frequently generates
+ * code to access neighboring bytes.
+ */
+ for (i = 0; i < size; ++i) {
+ if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+#else
+ /* All bytes must be initialized. */
+ for (i = 0; i < size; ++i) {
+ if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
+ return x[i];
+ }
+#endif
+
+ return x[0];
+}
+
+void kmemcheck_shadow_set(void *shadow, unsigned int size)
+{
+ uint8_t *x;
+ unsigned int i;
+
+ x = shadow;
+ for (i = 0; i < size; ++i)
+ x[i] = KMEMCHECK_SHADOW_INITIALIZED;
+}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000000..af46d9ab9d86
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,16 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
+#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
+
+enum kmemcheck_shadow {
+ KMEMCHECK_SHADOW_UNALLOCATED,
+ KMEMCHECK_SHADOW_UNINITIALIZED,
+ KMEMCHECK_SHADOW_INITIALIZED,
+ KMEMCHECK_SHADOW_FREED,
+};
+
+void *kmemcheck_shadow_lookup(unsigned long address);
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+void kmemcheck_shadow_set(void *shadow, unsigned int size);
+
+#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6ce9518fe2ac..7e600c1962db 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -11,6 +11,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/pfn.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -470,7 +471,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
if (!debug_pagealloc)
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL, 0);
+ base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
if (!debug_pagealloc)
spin_lock(&cpa_lock);
if (!base)
@@ -590,9 +591,12 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_PAGES_ARRAY)
- address = (unsigned long)page_address(cpa->pages[cpa->curpage]);
- else if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ address = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
address = cpa->vaddr[cpa->curpage];
else
address = *cpa->vaddr;
@@ -681,8 +685,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
- int ret = 0;
- unsigned long temp_cpa_vaddr, vaddr;
+ unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+ unsigned long vaddr, remapped;
+ int ret;
if (cpa->pfn >= max_pfn_mapped)
return 0;
@@ -695,9 +700,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_PAGES_ARRAY)
- vaddr = (unsigned long)page_address(cpa->pages[cpa->curpage]);
- else if (cpa->flags & CPA_ARRAY)
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[cpa->curpage];
+ if (unlikely(PageHighMem(page)))
+ return 0;
+ vaddr = (unsigned long)page_address(page);
+ } else if (cpa->flags & CPA_ARRAY)
vaddr = cpa->vaddr[cpa->curpage];
else
vaddr = *cpa->vaddr;
@@ -706,42 +714,55 @@ static int cpa_process_alias(struct cpa_data *cpa)
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
alias_cpa = *cpa;
- temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
- alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.vaddr = &laddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
-
ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ if (ret)
+ return ret;
}
#ifdef CONFIG_X86_64
- if (ret)
- return ret;
- /*
- * No need to redo, when the primary call touched the high
- * mapping already:
- */
- if (within(vaddr, (unsigned long) _text, _brk_end))
- return 0;
-
/*
- * If the physical address is inside the kernel map, we need
+ * If the primary call didn't touch the high mapping already
+ * and the physical address is inside the kernel map, we need
* to touch the high mapped kernel as well:
*/
- if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn()))
- return 0;
+ if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+ within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+ unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+ __START_KERNEL_map - phys_base;
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &temp_cpa_vaddr;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
- alias_cpa = *cpa;
- temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
- alias_cpa.vaddr = &temp_cpa_vaddr;
- alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+ /*
+ * The high mapping range is imprecise, so ignore the
+ * return value.
+ */
+ __change_page_attr_set_clr(&alias_cpa, 0);
+ }
+#endif
/*
- * The high mapping range is imprecise, so ignore the return value.
+ * If the PMD page was partially used for per-cpu remapping,
+ * the recycled area needs to be split and modified. Because
+ * the area is always proper subset of a PMD page
+ * cpa->numpages is guaranteed to be 1 for these areas, so
+ * there's no need to loop over and check for further remaps.
*/
- __change_page_attr_set_clr(&alias_cpa, 0);
-#endif
- return ret;
+ remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
+ if (remapped) {
+ WARN_ON(cpa->numpages > 1);
+ alias_cpa = *cpa;
+ alias_cpa.vaddr = &remapped;
+ alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+ ret = __change_page_attr_set_clr(&alias_cpa, 0);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
}
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
@@ -982,12 +1003,15 @@ EXPORT_SYMBOL(set_memory_array_uc);
int _set_memory_wc(unsigned long addr, int numpages)
{
int ret;
+ unsigned long addr_copy = addr;
+
ret = change_page_attr_set(&addr, numpages,
__pgprot(_PAGE_CACHE_UC_MINUS), 0);
-
if (!ret) {
- ret = change_page_attr_set(&addr, numpages,
- __pgprot(_PAGE_CACHE_WC), 0);
+ ret = change_page_attr_set_clr(&addr_copy, numpages,
+ __pgprot(_PAGE_CACHE_WC),
+ __pgprot(_PAGE_CACHE_MASK),
+ 0, 0, NULL);
}
return ret;
}
@@ -1104,7 +1128,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
int free_idx;
for (i = 0; i < addrinarray; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
goto err_out;
@@ -1117,7 +1143,9 @@ int set_pages_array_uc(struct page **pages, int addrinarray)
err_out:
free_idx = i;
for (i = 0; i < free_idx; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
free_memtype(start, end);
}
@@ -1146,7 +1174,9 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
return retval;
for (i = 0; i < addrinarray; i++) {
- start = (unsigned long)page_address(pages[i]);
+ if (PageHighMem(pages[i]))
+ continue;
+ start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
free_memtype(start, end);
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7aa03a5389f5..ed34f5e35999 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,9 +4,11 @@
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ return (pte_t *)__get_free_page(PGALLOC_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -14,16 +16,16 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
struct page *pte;
#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+ pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ pte = alloc_pages(PGALLOC_GFP, 0);
#endif
if (pte)
pgtable_page_ctor(pte);
return pte;
}
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
@@ -31,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
}
#if PAGETABLE_LEVELS > 2
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
#if PAGETABLE_LEVELS > 3
-void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
@@ -161,7 +163,7 @@ static int preallocate_pmds(pmd_t *pmds[])
bool failed = false;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+ pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
if (pmd == NULL)
failed = true;
pmds[i] = pmd;
@@ -228,7 +230,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
pmd_t *pmds[PREALLOCATED_PMDS];
unsigned long flags;
- pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
if (pgd == NULL)
goto out;
@@ -327,7 +329,6 @@ void __init reserve_top_address(unsigned long reserve)
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
- __VMALLOC_RESERVE += reserve;
#endif
}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 2dfcbf9df2ae..dbb5381f7b3b 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -79,8 +79,10 @@ static __init void bad_srat(void)
acpi_numa = -1;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
- for (i = 0; i < MAX_NUMNODES; i++)
- nodes_add[i].start = nodes[i].end = 0;
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ nodes[i].start = nodes[i].end = 0;
+ nodes_add[i].start = nodes_add[i].end = 0;
+ }
remove_all_active_ranges();
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 821e97017e95..c814e144a3f0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -183,18 +183,17 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
f->flush_mm = mm;
f->flush_va = va;
- cpumask_andnot(to_cpumask(f->flush_cpumask),
- cpumask, cpumask_of(smp_processor_id()));
-
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
- INVALIDATE_TLB_VECTOR_START + sender);
+ if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
+ /*
+ * We have to send the IPI only to
+ * CPUs affected.
+ */
+ apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
+ INVALIDATE_TLB_VECTOR_START + sender);
- while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
- cpu_relax();
+ while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+ cpu_relax();
+ }
f->flush_mm = NULL;
f->flush_va = 0;