From b3bba872ddb0320a7ecb54decae53c13ceb2ed4c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 8 Dec 2011 16:53:54 -0600
Subject: writeback: show writeback reason with __print_symbolic

This makes the binary trace understandable by trace-cmd.

CC: Dave Chinner <david@fromorbit.com>
CC: Curt Wohlgemuth <curtw@google.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/trace/events/writeback.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index b99caa8b780c..99d1d0decf88 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -21,6 +21,16 @@
 		{I_REFERENCED,		"I_REFERENCED"}		\
 	)
 
+#define WB_WORK_REASON							\
+		{WB_REASON_BACKGROUND,		"background"},		\
+		{WB_REASON_TRY_TO_FREE_PAGES,	"try_to_free_pages"},	\
+		{WB_REASON_SYNC,		"sync"},		\
+		{WB_REASON_PERIODIC,		"periodic"},		\
+		{WB_REASON_LAPTOP_TIMER,	"laptop_timer"},	\
+		{WB_REASON_FREE_MORE_MEM,	"free_more_memory"},	\
+		{WB_REASON_FS_FREE_SPACE,	"fs_free_space"},	\
+		{WB_REASON_FORKER_THREAD,	"forker_thread"}
+
 struct wb_writeback_work;
 
 DECLARE_EVENT_CLASS(writeback_work_class,
@@ -55,7 +65,7 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		  __entry->for_kupdate,
 		  __entry->range_cyclic,
 		  __entry->for_background,
-		  wb_reason_name[__entry->reason]
+		  __print_symbolic(__entry->reason, WB_WORK_REASON)
 	)
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
@@ -184,7 +194,8 @@ TRACE_EVENT(writeback_queue_io,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
 		__entry->moved,
-		wb_reason_name[__entry->reason])
+		__print_symbolic(__entry->reason, WB_WORK_REASON)
+	)
 );
 
 TRACE_EVENT(global_dirty_state,
-- 
cgit v1.2.3


From e30e2fdfe56288576ee9e04dbb06b4bd5f282203 Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Thu, 22 Dec 2011 02:45:29 +0530
Subject: VFS: Fix race between CPU hotplug and lglocks

Currently, the *_global_[un]lock_online() routines are not at all synchronized
with CPU hotplug. Soft-lockups detected as a consequence of this race were
reported earlier at https://lkml.org/lkml/2011/8/24/185. (Thanks to Cong Meng
for finding out that the root-cause of this issue is the race condition
between br_write_[un]lock() and CPU hotplug, which results in the lock states
getting messed up).

Fixing this race by just adding {get,put}_online_cpus() at appropriate places
in *_global_[un]lock_online() is not a good option, because, then suddenly
br_write_[un]lock() would become blocking, whereas they have been kept as
non-blocking all this time, and we would want to keep them that way.

So, overall, we want to ensure 3 things:
1. br_write_lock() and br_write_unlock() must remain as non-blocking.
2. The corresponding lock and unlock of the per-cpu spinlocks must not happen
   for different sets of CPUs.
3. Either prevent any new CPU online operation in between this lock-unlock, or
   ensure that the newly onlined CPU does not proceed with its corresponding
   per-cpu spinlock unlocked.

To achieve all this:
(a) We introduce a new spinlock that is taken by the *_global_lock_online()
    routine and released by the *_global_unlock_online() routine.
(b) We register a callback for CPU hotplug notifications, and this callback
    takes the same spinlock as above.
(c) We maintain a bitmap which is close to the cpu_online_mask, and once it is
    initialized in the lock_init() code, all future updates to it are done in
    the callback, under the above spinlock.
(d) The above bitmap is used (instead of cpu_online_mask) while locking and
    unlocking the per-cpu locks.

The callback takes the spinlock upon the CPU_UP_PREPARE event. So, if the
br_write_lock-unlock sequence is in progress, the callback keeps spinning,
thus preventing the CPU online operation till the lock-unlock sequence is
complete. This takes care of requirement (3).

The bitmap that we maintain remains unmodified throughout the lock-unlock
sequence, since all updates to it are managed by the callback, which takes
the same spinlock as the one taken by the lock code and released only by the
unlock routine. Combining this with (d) above, satisfies requirement (2).

Overall, since we use a spinlock (mentioned in (a)) to prevent CPU hotplug
operations from racing with br_write_lock-unlock, requirement (1) is also
taken care of.

By the way, it is to be noted that a CPU offline operation can actually run
in parallel with our lock-unlock sequence, because our callback doesn't react
to notifications earlier than CPU_DEAD (in order to maintain our bitmap
properly). And this means, since we use our own bitmap (which is stale, on
purpose) during the lock-unlock sequence, we could end up unlocking the
per-cpu lock of an offline CPU (because we had locked it earlier, when the
CPU was online), in order to satisfy requirement (2). But this is harmless,
though it looks a bit awkward.

Debugged-by: Cong Meng <mc@linux.vnet.ibm.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
---
 include/linux/lglock.h | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index f549056fb20b..87f402ccec55 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -22,6 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
+#include <linux/cpu.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
 #define br_lock_init(name)	name##_lock_init()
@@ -72,9 +73,31 @@
 
 #define DEFINE_LGLOCK(name)						\
 									\
+ DEFINE_SPINLOCK(name##_cpu_lock);					\
+ cpumask_t name##_cpus __read_mostly;					\
  DEFINE_PER_CPU(arch_spinlock_t, name##_lock);				\
  DEFINE_LGLOCK_LOCKDEP(name);						\
 									\
+ static int								\
+ name##_lg_cpu_callback(struct notifier_block *nb,			\
+				unsigned long action, void *hcpu)	\
+ {									\
+	switch (action & ~CPU_TASKS_FROZEN) {				\
+	case CPU_UP_PREPARE:						\
+		spin_lock(&name##_cpu_lock);				\
+		cpu_set((unsigned long)hcpu, name##_cpus);		\
+		spin_unlock(&name##_cpu_lock);				\
+		break;							\
+	case CPU_UP_CANCELED: case CPU_DEAD:				\
+		spin_lock(&name##_cpu_lock);				\
+		cpu_clear((unsigned long)hcpu, name##_cpus);		\
+		spin_unlock(&name##_cpu_lock);				\
+	}								\
+	return NOTIFY_OK;						\
+ }									\
+ static struct notifier_block name##_lg_cpu_notifier = {		\
+	.notifier_call = name##_lg_cpu_callback,			\
+ };									\
  void name##_lock_init(void) {						\
 	int i;								\
 	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
@@ -83,6 +106,11 @@
 		lock = &per_cpu(name##_lock, i);			\
 		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;	\
 	}								\
+	register_hotcpu_notifier(&name##_lg_cpu_notifier);		\
+	get_online_cpus();						\
+	for_each_online_cpu(i)						\
+		cpu_set(i, name##_cpus);				\
+	put_online_cpus();						\
  }									\
  EXPORT_SYMBOL(name##_lock_init);					\
 									\
@@ -124,9 +152,9 @@
 									\
  void name##_global_lock_online(void) {					\
 	int i;								\
-	preempt_disable();						\
+	spin_lock(&name##_cpu_lock);					\
 	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);		\
-	for_each_online_cpu(i) {					\
+	for_each_cpu(i, &name##_cpus) {					\
 		arch_spinlock_t *lock;					\
 		lock = &per_cpu(name##_lock, i);			\
 		arch_spin_lock(lock);					\
@@ -137,12 +165,12 @@
  void name##_global_unlock_online(void) {				\
 	int i;								\
 	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);		\
-	for_each_online_cpu(i) {					\
+	for_each_cpu(i, &name##_cpus) {					\
 		arch_spinlock_t *lock;					\
 		lock = &per_cpu(name##_lock, i);			\
 		arch_spin_unlock(lock);					\
 	}								\
-	preempt_enable();						\
+	spin_unlock(&name##_cpu_lock);					\
  }									\
  EXPORT_SYMBOL(name##_global_unlock_online);				\
 									\
-- 
cgit v1.2.3


From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 21 Dec 2011 12:28:29 +0100
Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid

Unlike all of the other cpuid bits, the TSC deadline timer bit is set
unconditionally, regardless of what userspace wants.

This is broken in several ways:
 - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC
   deadline timer feature, a guest that uses the feature will break
 - live migration to older host kernels that don't support the TSC deadline
   timer will cause the feature to be pulled from under the guest's feet,
   breaking it
 - guests that are broken wrt the feature will fail.

Fix by not enabling the feature automatically; instead report it to userspace.
Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee
will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not
KVM_GET_SUPPORTED_CPUID.

Fixes the Illumos guest kernel, which uses the TSC deadline timer feature.

[avi: add the KVM_CAP + documentation]

Reported-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  9 +++++++++
 arch/x86/kvm/x86.c                | 19 +++++++++----------
 include/linux/kvm.h               |  1 +
 3 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 4df9af4f6132..e2a4b5287361 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows:
    eax, ebx, ecx, edx: the values returned by the cpuid instruction for
          this function/index combination
 
+The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned
+as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC
+support.  Instead it is reported via
+
+  ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER)
+
+if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the
+feature in userspace, then you can enable the feature for KVM_SET_CPUID2.
+
 4.47 KVM_PPC_GET_PVINFO
 
 Capability: KVM_CAP_PPC_GET_PVINFO
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c38efd7b792e..4c938da2ba00 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	u32 timer_mode_mask;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	if (!best)
@@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-		best->function == 0x1) {
-		best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
-		timer_mode_mask = 3 << 17;
-	} else
-		timer_mode_mask = 1 << 17;
-
-	if (apic)
-		apic->lapic_timer.timer_mode_mask = timer_mode_mask;
+	if (apic) {
+		if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+			apic->lapic_timer.timer_mode_mask = 3 << 17;
+		else
+			apic->lapic_timer.timer_mode_mask = 1 << 17;
+	}
 }
 
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_TSC_CONTROL:
 		r = kvm_has_tsc_control;
 		break;
+	case KVM_CAP_TSC_DEADLINE_TIMER:
+		r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
+		break;
 	default:
 		r = 0;
 		break;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index c3892fc1d538..68e67e50d028 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
+#define KVM_CAP_TSC_DEADLINE_TIMER 72
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 34845636a184f3be91a531098192592cbe6db587 Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@linux-m68k.org>
Date: Wed, 28 Dec 2011 15:57:15 -0800
Subject: procfs: do not confuse jiffies with cputime64_t

Commit 2a95ea6c0d129b4 ("procfs: do not overflow get_{idle,iowait}_time
for nohz") did not take into account that on some architectures jiffies
and cputime use different units.

This causes get_idle_time() to return numbers in the wrong units, making
the idle time fields in /proc/stat wrong.

Instead of converting the usec value returned by
get_cpu_{idle,iowait}_time_us to units of jiffies, use the new function
usecs_to_cputime64 to convert it to the correct unit of cputime64_t.

Signed-off-by: Andreas Schwab <schwab@linux-m68k.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Artem S. Tashkinov" <t.artem@mailcity.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/include/asm/cputime.h    | 1 +
 arch/powerpc/include/asm/cputime.h | 2 ++
 arch/s390/include/asm/cputime.h    | 2 ++
 fs/proc/stat.c                     | 4 ++--
 include/asm-generic/cputime.h      | 1 +
 5 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 6073b187528a..5a274af31b2b 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -60,6 +60,7 @@ typedef u64 cputime64_t;
  */
 #define cputime_to_usecs(__ct)		((__ct) / NSEC_PER_USEC)
 #define usecs_to_cputime(__usecs)	((__usecs) * NSEC_PER_USEC)
+#define usecs_to_cputime64(__usecs)	usecs_to_cputime(__usecs)
 
 /*
  * Convert cputime <-> seconds
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 1cf20bdfbeca..98b7c4b49c9d 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -150,6 +150,8 @@ static inline cputime_t usecs_to_cputime(const unsigned long us)
 	return ct;
 }
 
+#define usecs_to_cputime64(us)		usecs_to_cputime(us)
+
 /*
  * Convert cputime <-> seconds
  */
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 081434878296..b9acaaa175d8 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -87,6 +87,8 @@ usecs_to_cputime(const unsigned int m)
 	return (cputime_t) m * 4096;
 }
 
+#define usecs_to_cputime64(m)		usecs_to_cputime(m)
+
 /*
  * Convert cputime to milliseconds and back.
  */
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 2a30d67dd6b8..0855e6f20391 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu)
 		idle = kstat_cpu(cpu).cpustat.idle;
 		idle = cputime64_add(idle, arch_idle_time(cpu));
 	} else
-		idle = nsecs_to_jiffies64(1000 * idle_time);
+		idle = usecs_to_cputime64(idle_time);
 
 	return idle;
 }
@@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu)
 		/* !NO_HZ so we can rely on cpustat.iowait */
 		iowait = kstat_cpu(cpu).cpustat.iowait;
 	else
-		iowait = nsecs_to_jiffies64(1000 * iowait_time);
+		iowait = usecs_to_cputime64(iowait_time);
 
 	return iowait;
 }
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 62ce6823c0f2..12a1764f612b 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -40,6 +40,7 @@ typedef u64 cputime64_t;
  */
 #define cputime_to_usecs(__ct)		jiffies_to_usecs(__ct)
 #define usecs_to_cputime(__msecs)	usecs_to_jiffies(__msecs)
+#define usecs_to_cputime64(__msecs)	nsecs_to_jiffies64((__msecs) * 1000)
 
 /*
  * Convert cputime to seconds and back.
-- 
cgit v1.2.3