summaryrefslogtreecommitdiff
path: root/virt
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2019-05-10 11:43:46 -0700
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2019-05-10 11:43:46 -0700
commit2a267e7c41aa88215de2b542de797d03d16ecdfd (patch)
treeb949270835e304c8b60a40cde1b2c2e19c13b33a /virt
parent0981949da8f7498b96c6e0ae60680865ca283bf1 (diff)
parente93c9c99a629c61837d5a7fc2120cd2b6c70dbdd (diff)
downloadlwn-2a267e7c41aa88215de2b542de797d03d16ecdfd.tar.gz
lwn-2a267e7c41aa88215de2b542de797d03d16ecdfd.zip
Merge tag 'v5.1' into next
Sync up with mainline to bring in the latest APIs.
Diffstat (limited to 'virt')
-rw-r--r--virt/kvm/arm/arch_timer.c640
-rw-r--r--virt/kvm/arm/arm.c132
-rw-r--r--virt/kvm/arm/hyp/vgic-v3-sr.c12
-rw-r--r--virt/kvm/arm/mmio.c11
-rw-r--r--virt/kvm/arm/mmu.c490
-rw-r--r--virt/kvm/arm/psci.c36
-rw-r--r--virt/kvm/arm/trace.h125
-rw-r--r--virt/kvm/arm/vgic/vgic-debug.c4
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c30
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c53
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v2.c14
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v3.c15
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.c78
-rw-r--r--virt/kvm/arm/vgic/vgic-v2.c4
-rw-r--r--virt/kvm/arm/vgic/vgic-v3.c16
-rw-r--r--virt/kvm/arm/vgic/vgic.c174
-rw-r--r--virt/kvm/arm/vgic/vgic.h1
-rw-r--r--virt/kvm/async_pf.c2
-rw-r--r--virt/kvm/coalesced_mmio.c3
-rw-r--r--virt/kvm/eventfd.c13
-rw-r--r--virt/kvm/irqchip.c9
-rw-r--r--virt/kvm/kvm_main.c340
-rw-r--r--virt/kvm/vfio.c4
23 files changed, 1479 insertions, 727 deletions
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 17cecc96f735..7fc272ecae16 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -25,6 +25,7 @@
#include <clocksource/arm_arch_timer.h>
#include <asm/arch_timer.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <kvm/arm_vgic.h>
@@ -34,7 +35,9 @@
static struct timecounter *timecounter;
static unsigned int host_vtimer_irq;
+static unsigned int host_ptimer_irq;
static u32 host_vtimer_irq_flags;
+static u32 host_ptimer_irq_flags;
static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
@@ -52,12 +55,34 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
struct arch_timer_context *timer_ctx);
static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg,
+ u64 val);
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg);
u64 kvm_phys_timer_read(void)
{
return timecounter->cc->read(timecounter->cc);
}
+static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
+{
+ if (has_vhe()) {
+ map->direct_vtimer = vcpu_vtimer(vcpu);
+ map->direct_ptimer = vcpu_ptimer(vcpu);
+ map->emul_ptimer = NULL;
+ } else {
+ map->direct_vtimer = vcpu_vtimer(vcpu);
+ map->direct_ptimer = NULL;
+ map->emul_ptimer = vcpu_ptimer(vcpu);
+ }
+
+ trace_kvm_get_timer_map(vcpu->vcpu_id, map);
+}
+
static inline bool userspace_irqchip(struct kvm *kvm)
{
return static_branch_unlikely(&userspace_irqchip_in_use) &&
@@ -70,30 +95,35 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns)
HRTIMER_MODE_ABS);
}
-static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
+static void soft_timer_cancel(struct hrtimer *hrt)
{
hrtimer_cancel(hrt);
- if (work)
- cancel_work_sync(work);
}
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
- struct arch_timer_context *vtimer;
+ struct arch_timer_context *ctx;
+ struct timer_map map;
/*
* We may see a timer interrupt after vcpu_put() has been called which
* sets the CPU's vcpu pointer to NULL, because even though the timer
- * has been disabled in vtimer_save_state(), the hardware interrupt
+ * has been disabled in timer_save_state(), the hardware interrupt
* signal may not have been retired from the interrupt controller yet.
*/
if (!vcpu)
return IRQ_HANDLED;
- vtimer = vcpu_vtimer(vcpu);
- if (kvm_timer_should_fire(vtimer))
- kvm_timer_update_irq(vcpu, true, vtimer);
+ get_timer_map(vcpu, &map);
+
+ if (irq == host_vtimer_irq)
+ ctx = map.direct_vtimer;
+ else
+ ctx = map.direct_ptimer;
+
+ if (kvm_timer_should_fire(ctx))
+ kvm_timer_update_irq(vcpu, true, ctx);
if (userspace_irqchip(vcpu->kvm) &&
!static_branch_unlikely(&has_gic_active_state))
@@ -102,23 +132,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
return IRQ_HANDLED;
}
-/*
- * Work function for handling the backup timer that we schedule when a vcpu is
- * no longer running, but had a timer programmed to fire in the future.
- */
-static void kvm_timer_inject_irq_work(struct work_struct *work)
-{
- struct kvm_vcpu *vcpu;
-
- vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
-
- /*
- * If the vcpu is blocked we want to wake it up so that it will see
- * the timer has expired when entering the guest.
- */
- kvm_vcpu_wake_up(vcpu);
-}
-
static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
{
u64 cval, now;
@@ -141,7 +154,9 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
{
- return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+ WARN_ON(timer_ctx && timer_ctx->loaded);
+ return timer_ctx &&
+ !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE);
}
@@ -151,21 +166,22 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
*/
static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
{
- u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ u64 min_delta = ULLONG_MAX;
+ int i;
- if (kvm_timer_irq_can_fire(vtimer))
- min_virt = kvm_timer_compute_delta(vtimer);
+ for (i = 0; i < NR_KVM_TIMERS; i++) {
+ struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
- if (kvm_timer_irq_can_fire(ptimer))
- min_phys = kvm_timer_compute_delta(ptimer);
+ WARN(ctx->loaded, "timer %d loaded\n", i);
+ if (kvm_timer_irq_can_fire(ctx))
+ min_delta = min(min_delta, kvm_timer_compute_delta(ctx));
+ }
/* If none of timers can fire, then return 0 */
- if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX))
+ if (min_delta == ULLONG_MAX)
return 0;
- return min(min_virt, min_phys);
+ return min_delta;
}
static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
@@ -188,45 +204,62 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
- schedule_work(&timer->expired);
+ kvm_vcpu_wake_up(vcpu);
return HRTIMER_NORESTART;
}
-static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt)
+static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
{
- struct arch_timer_context *ptimer;
- struct arch_timer_cpu *timer;
+ struct arch_timer_context *ctx;
struct kvm_vcpu *vcpu;
u64 ns;
- timer = container_of(hrt, struct arch_timer_cpu, phys_timer);
- vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu);
- ptimer = vcpu_ptimer(vcpu);
+ ctx = container_of(hrt, struct arch_timer_context, hrtimer);
+ vcpu = ctx->vcpu;
+
+ trace_kvm_timer_hrtimer_expire(ctx);
/*
* Check that the timer has really expired from the guest's
* PoV (NTP on the host may have forced it to expire
* early). If not ready, schedule for a later time.
*/
- ns = kvm_timer_compute_delta(ptimer);
+ ns = kvm_timer_compute_delta(ctx);
if (unlikely(ns)) {
hrtimer_forward_now(hrt, ns_to_ktime(ns));
return HRTIMER_RESTART;
}
- kvm_timer_update_irq(vcpu, true, ptimer);
+ kvm_timer_update_irq(vcpu, true, ctx);
return HRTIMER_NORESTART;
}
static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
{
+ enum kvm_arch_timers index;
u64 cval, now;
+ if (!timer_ctx)
+ return false;
+
+ index = arch_timer_ctx_index(timer_ctx);
+
if (timer_ctx->loaded) {
- u32 cnt_ctl;
+ u32 cnt_ctl = 0;
+
+ switch (index) {
+ case TIMER_VTIMER:
+ cnt_ctl = read_sysreg_el0(cntv_ctl);
+ break;
+ case TIMER_PTIMER:
+ cnt_ctl = read_sysreg_el0(cntp_ctl);
+ break;
+ case NR_KVM_TIMERS:
+ /* GCC is braindead */
+ cnt_ctl = 0;
+ break;
+ }
- /* Only the virtual timer can be loaded so far */
- cnt_ctl = read_sysreg_el0(cntv_ctl);
return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
(cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
!(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
@@ -243,13 +276,13 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
bool kvm_timer_is_pending(struct kvm_vcpu *vcpu)
{
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct timer_map map;
- if (kvm_timer_should_fire(vtimer))
- return true;
+ get_timer_map(vcpu, &map);
- return kvm_timer_should_fire(ptimer);
+ return kvm_timer_should_fire(map.direct_vtimer) ||
+ kvm_timer_should_fire(map.direct_ptimer) ||
+ kvm_timer_should_fire(map.emul_ptimer);
}
/*
@@ -288,77 +321,70 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
}
}
-/* Schedule the background timer for the emulated timer. */
-static void phys_timer_emulate(struct kvm_vcpu *vcpu)
+static void timer_emulate(struct arch_timer_context *ctx)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ bool should_fire = kvm_timer_should_fire(ctx);
+
+ trace_kvm_timer_emulate(ctx, should_fire);
+
+ if (should_fire) {
+ kvm_timer_update_irq(ctx->vcpu, true, ctx);
+ return;
+ }
/*
* If the timer can fire now, we don't need to have a soft timer
* scheduled for the future. If the timer cannot fire at all,
* then we also don't need a soft timer.
*/
- if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
- soft_timer_cancel(&timer->phys_timer, NULL);
+ if (!kvm_timer_irq_can_fire(ctx)) {
+ soft_timer_cancel(&ctx->hrtimer);
return;
}
- soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer));
+ soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
}
-/*
- * Check if there was a change in the timer state, so that we should either
- * raise or lower the line level to the GIC or schedule a background timer to
- * emulate the physical timer.
- */
-static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
+static void timer_save_state(struct arch_timer_context *ctx)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
- bool level;
+ struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+ enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
+ unsigned long flags;
- if (unlikely(!timer->enabled))
+ if (!timer->enabled)
return;
- /*
- * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part
- * of its lifecycle is offloaded to the hardware, and we therefore may
- * not have lowered the irq.level value before having to signal a new
- * interrupt, but have to signal an interrupt every time the level is
- * asserted.
- */
- level = kvm_timer_should_fire(vtimer);
- kvm_timer_update_irq(vcpu, level, vtimer);
+ local_irq_save(flags);
- phys_timer_emulate(vcpu);
+ if (!ctx->loaded)
+ goto out;
- if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
- kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
-}
+ switch (index) {
+ case TIMER_VTIMER:
+ ctx->cnt_ctl = read_sysreg_el0(cntv_ctl);
+ ctx->cnt_cval = read_sysreg_el0(cntv_cval);
-static void vtimer_save_state(struct kvm_vcpu *vcpu)
-{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- unsigned long flags;
+ /* Disable the timer */
+ write_sysreg_el0(0, cntv_ctl);
+ isb();
- local_irq_save(flags);
+ break;
+ case TIMER_PTIMER:
+ ctx->cnt_ctl = read_sysreg_el0(cntp_ctl);
+ ctx->cnt_cval = read_sysreg_el0(cntp_cval);
- if (!vtimer->loaded)
- goto out;
+ /* Disable the timer */
+ write_sysreg_el0(0, cntp_ctl);
+ isb();
- if (timer->enabled) {
- vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl);
- vtimer->cnt_cval = read_sysreg_el0(cntv_cval);
+ break;
+ case NR_KVM_TIMERS:
+ BUG();
}
- /* Disable the virtual timer */
- write_sysreg_el0(0, cntv_ctl);
- isb();
+ trace_kvm_timer_save_state(ctx);
- vtimer->loaded = false;
+ ctx->loaded = false;
out:
local_irq_restore(flags);
}
@@ -368,67 +394,72 @@ out:
* thread is removed from its waitqueue and made runnable when there's a timer
* interrupt to handle.
*/
-void kvm_timer_schedule(struct kvm_vcpu *vcpu)
+static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
- vtimer_save_state(vcpu);
+ get_timer_map(vcpu, &map);
/*
- * No need to schedule a background timer if any guest timer has
- * already expired, because kvm_vcpu_block will return before putting
- * the thread to sleep.
- */
- if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer))
- return;
-
- /*
- * If both timers are not capable of raising interrupts (disabled or
+ * If no timers are capable of raising interrupts (disabled or
* masked), then there's no more work for us to do.
*/
- if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer))
+ if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
+ !kvm_timer_irq_can_fire(map.direct_ptimer) &&
+ !kvm_timer_irq_can_fire(map.emul_ptimer))
return;
/*
- * The guest timers have not yet expired, schedule a background timer.
+ * At least one guest time will expire. Schedule a background timer.
* Set the earliest expiration time among the guest timers.
*/
soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
}
-static void vtimer_restore_state(struct kvm_vcpu *vcpu)
+static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+
+ soft_timer_cancel(&timer->bg_timer);
+}
+
+static void timer_restore_state(struct arch_timer_context *ctx)
+{
+ struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
+ enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
unsigned long flags;
+ if (!timer->enabled)
+ return;
+
local_irq_save(flags);
- if (vtimer->loaded)
+ if (ctx->loaded)
goto out;
- if (timer->enabled) {
- write_sysreg_el0(vtimer->cnt_cval, cntv_cval);
+ switch (index) {
+ case TIMER_VTIMER:
+ write_sysreg_el0(ctx->cnt_cval, cntv_cval);
+ isb();
+ write_sysreg_el0(ctx->cnt_ctl, cntv_ctl);
+ break;
+ case TIMER_PTIMER:
+ write_sysreg_el0(ctx->cnt_cval, cntp_cval);
isb();
- write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl);
+ write_sysreg_el0(ctx->cnt_ctl, cntp_ctl);
+ break;
+ case NR_KVM_TIMERS:
+ BUG();
}
- vtimer->loaded = true;
+ trace_kvm_timer_restore_state(ctx);
+
+ ctx->loaded = true;
out:
local_irq_restore(flags);
}
-void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
-{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
- vtimer_restore_state(vcpu);
-
- soft_timer_cancel(&timer->bg_timer, &timer->expired);
-}
-
static void set_cntvoff(u64 cntvoff)
{
u32 low = lower_32_bits(cntvoff);
@@ -444,23 +475,32 @@ static void set_cntvoff(u64 cntvoff)
kvm_call_hyp(__kvm_timer_set_cntvoff, low, high);
}
-static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active)
+static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
{
int r;
- r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active);
+ r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active);
WARN_ON(r);
}
-static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu)
+static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
{
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- bool phys_active;
+ struct kvm_vcpu *vcpu = ctx->vcpu;
+ bool phys_active = false;
+
+ /*
+ * Update the timer output so that it is likely to match the
+ * state we're about to restore. If the timer expires between
+ * this point and the register restoration, we'll take the
+ * interrupt anyway.
+ */
+ kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);
if (irqchip_in_kernel(vcpu->kvm))
- phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq);
- else
- phys_active = vtimer->irq.level;
- set_vtimer_irq_phys_active(vcpu, phys_active);
+ phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq);
+
+ phys_active |= ctx->irq.level;
+
+ set_timer_irq_phys_active(ctx, phys_active);
}
static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
@@ -468,6 +508,14 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
/*
+ * Update the timer output so that it is likely to match the
+ * state we're about to restore. If the timer expires between
+ * this point and the register restoration, we'll take the
+ * interrupt anyway.
+ */
+ kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer);
+
+ /*
* When using a userspace irqchip with the architected timers and a
* host interrupt controller that doesn't support an active state, we
* must still prevent continuously exiting from the guest, and
@@ -485,28 +533,32 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
if (unlikely(!timer->enabled))
return;
- if (static_branch_likely(&has_gic_active_state))
- kvm_timer_vcpu_load_gic(vcpu);
- else
+ get_timer_map(vcpu, &map);
+
+ if (static_branch_likely(&has_gic_active_state)) {
+ kvm_timer_vcpu_load_gic(map.direct_vtimer);
+ if (map.direct_ptimer)
+ kvm_timer_vcpu_load_gic(map.direct_ptimer);
+ } else {
kvm_timer_vcpu_load_nogic(vcpu);
+ }
- set_cntvoff(vtimer->cntvoff);
+ set_cntvoff(map.direct_vtimer->cntvoff);
- vtimer_restore_state(vcpu);
+ kvm_timer_unblocking(vcpu);
- /* Set the background timer for the physical timer emulation. */
- phys_timer_emulate(vcpu);
+ timer_restore_state(map.direct_vtimer);
+ if (map.direct_ptimer)
+ timer_restore_state(map.direct_ptimer);
- /* If the timer fired while we weren't running, inject it now */
- if (kvm_timer_should_fire(ptimer) != ptimer->irq.level)
- kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer);
+ if (map.emul_ptimer)
+ timer_emulate(map.emul_ptimer);
}
bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
@@ -528,15 +580,20 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
if (unlikely(!timer->enabled))
return;
- vtimer_save_state(vcpu);
+ get_timer_map(vcpu, &map);
+
+ timer_save_state(map.direct_vtimer);
+ if (map.direct_ptimer)
+ timer_save_state(map.direct_ptimer);
/*
- * Cancel the physical timer emulation, because the only case where we
+ * Cancel soft timer emulation, because the only case where we
* need it after a vcpu_put is in the context of a sleeping VCPU, and
* in that case we already factor in the deadline for the physical
* timer when scheduling the bg_timer.
@@ -544,7 +601,11 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
* In any case, we re-schedule the hrtimer for the physical timer when
* coming back to the VCPU thread in kvm_timer_vcpu_load().
*/
- soft_timer_cancel(&timer->phys_timer, NULL);
+ if (map.emul_ptimer)
+ soft_timer_cancel(&map.emul_ptimer->hrtimer);
+
+ if (swait_active(kvm_arch_vcpu_wq(vcpu)))
+ kvm_timer_blocking(vcpu);
/*
* The kernel may decide to run userspace after calling vcpu_put, so
@@ -553,8 +614,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
* counter of non-VHE case. For VHE, the virtual counter uses a fixed
* virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
*/
- if (!has_vhe())
- set_cntvoff(0);
+ set_cntvoff(0);
}
/*
@@ -569,7 +629,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
if (!kvm_timer_should_fire(vtimer)) {
kvm_timer_update_irq(vcpu, false, vtimer);
if (static_branch_likely(&has_gic_active_state))
- set_vtimer_irq_phys_active(vcpu, false);
+ set_timer_irq_phys_active(vtimer, false);
else
enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
}
@@ -577,7 +637,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
if (unlikely(!timer->enabled))
return;
@@ -588,9 +648,10 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
+
+ get_timer_map(vcpu, &map);
/*
* The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -598,12 +659,22 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
* resets the timer to be disabled and unmasked and is compliant with
* the ARMv7 architecture.
*/
- vtimer->cnt_ctl = 0;
- ptimer->cnt_ctl = 0;
- kvm_timer_update_state(vcpu);
+ vcpu_vtimer(vcpu)->cnt_ctl = 0;
+ vcpu_ptimer(vcpu)->cnt_ctl = 0;
- if (timer->enabled && irqchip_in_kernel(vcpu->kvm))
- kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq);
+ if (timer->enabled) {
+ kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu));
+ kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu));
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq);
+ if (map.direct_ptimer)
+ kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq);
+ }
+ }
+
+ if (map.emul_ptimer)
+ soft_timer_cancel(&map.emul_ptimer->hrtimer);
return 0;
}
@@ -629,57 +700,71 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff)
void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
/* Synchronize cntvoff across all vtimers of a VM. */
update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
- vcpu_ptimer(vcpu)->cntvoff = 0;
+ ptimer->cntvoff = 0;
- INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
timer->bg_timer.function = kvm_bg_timer_expire;
- hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- timer->phys_timer.function = kvm_phys_timer_expire;
+ hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ vtimer->hrtimer.function = kvm_hrtimer_expire;
+ ptimer->hrtimer.function = kvm_hrtimer_expire;
vtimer->irq.irq = default_vtimer_irq.irq;
ptimer->irq.irq = default_ptimer_irq.irq;
+
+ vtimer->host_timer_irq = host_vtimer_irq;
+ ptimer->host_timer_irq = host_ptimer_irq;
+
+ vtimer->host_timer_irq_flags = host_vtimer_irq_flags;
+ ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
+
+ vtimer->vcpu = vcpu;
+ ptimer->vcpu = vcpu;
}
static void kvm_timer_init_interrupt(void *info)
{
enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
+ enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
}
int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
{
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ struct arch_timer_context *timer;
switch (regid) {
case KVM_REG_ARM_TIMER_CTL:
- vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
+ timer = vcpu_vtimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
break;
case KVM_REG_ARM_TIMER_CNT:
+ timer = vcpu_vtimer(vcpu);
update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value);
break;
case KVM_REG_ARM_TIMER_CVAL:
- vtimer->cnt_cval = value;
+ timer = vcpu_vtimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
break;
case KVM_REG_ARM_PTIMER_CTL:
- ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT;
+ timer = vcpu_ptimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value);
break;
case KVM_REG_ARM_PTIMER_CVAL:
- ptimer->cnt_cval = value;
+ timer = vcpu_ptimer(vcpu);
+ kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value);
break;
default:
return -1;
}
- kvm_timer_update_state(vcpu);
return 0;
}
@@ -699,26 +784,113 @@ static u64 read_timer_ctl(struct arch_timer_context *timer)
u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
{
- struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
-
switch (regid) {
case KVM_REG_ARM_TIMER_CTL:
- return read_timer_ctl(vtimer);
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CTL);
case KVM_REG_ARM_TIMER_CNT:
- return kvm_phys_timer_read() - vtimer->cntvoff;
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CNT);
case KVM_REG_ARM_TIMER_CVAL:
- return vtimer->cnt_cval;
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CVAL);
case KVM_REG_ARM_PTIMER_CTL:
- return read_timer_ctl(ptimer);
- case KVM_REG_ARM_PTIMER_CVAL:
- return ptimer->cnt_cval;
+ return kvm_arm_timer_read(vcpu,
+ vcpu_ptimer(vcpu), TIMER_REG_CTL);
case KVM_REG_ARM_PTIMER_CNT:
- return kvm_phys_timer_read();
+ return kvm_arm_timer_read(vcpu,
+ vcpu_vtimer(vcpu), TIMER_REG_CNT);
+ case KVM_REG_ARM_PTIMER_CVAL:
+ return kvm_arm_timer_read(vcpu,
+ vcpu_ptimer(vcpu), TIMER_REG_CVAL);
}
return (u64)-1;
}
+static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg)
+{
+ u64 val;
+
+ switch (treg) {
+ case TIMER_REG_TVAL:
+ val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
+ break;
+
+ case TIMER_REG_CTL:
+ val = read_timer_ctl(timer);
+ break;
+
+ case TIMER_REG_CVAL:
+ val = timer->cnt_cval;
+ break;
+
+ case TIMER_REG_CNT:
+ val = kvm_phys_timer_read() - timer->cntvoff;
+ break;
+
+ default:
+ BUG();
+ }
+
+ return val;
+}
+
+u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
+ enum kvm_arch_timers tmr,
+ enum kvm_arch_timer_regs treg)
+{
+ u64 val;
+
+ preempt_disable();
+ kvm_timer_vcpu_put(vcpu);
+
+ val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+
+ kvm_timer_vcpu_load(vcpu);
+ preempt_enable();
+
+ return val;
+}
+
+static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
+ struct arch_timer_context *timer,
+ enum kvm_arch_timer_regs treg,
+ u64 val)
+{
+ switch (treg) {
+ case TIMER_REG_TVAL:
+ timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val;
+ break;
+
+ case TIMER_REG_CTL:
+ timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT;
+ break;
+
+ case TIMER_REG_CVAL:
+ timer->cnt_cval = val;
+ break;
+
+ default:
+ BUG();
+ }
+}
+
+void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
+ enum kvm_arch_timers tmr,
+ enum kvm_arch_timer_regs treg,
+ u64 val)
+{
+ preempt_disable();
+ kvm_timer_vcpu_put(vcpu);
+
+ kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
+
+ kvm_timer_vcpu_load(vcpu);
+ preempt_enable();
+}
+
static int kvm_timer_starting_cpu(unsigned int cpu)
{
kvm_timer_init_interrupt(NULL);
@@ -744,6 +916,8 @@ int kvm_timer_hyp_init(bool has_gic)
return -ENODEV;
}
+ /* First, do the virtual EL1 timer irq */
+
if (info->virtual_irq <= 0) {
kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
info->virtual_irq);
@@ -754,15 +928,15 @@ int kvm_timer_hyp_init(bool has_gic)
host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq);
if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH &&
host_vtimer_irq_flags != IRQF_TRIGGER_LOW) {
- kvm_err("Invalid trigger for IRQ%d, assuming level low\n",
+ kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n",
host_vtimer_irq);
host_vtimer_irq_flags = IRQF_TRIGGER_LOW;
}
err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
- "kvm guest timer", kvm_get_running_vcpus());
+ "kvm guest vtimer", kvm_get_running_vcpus());
if (err) {
- kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n",
+ kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
host_vtimer_irq, err);
return err;
}
@@ -780,6 +954,43 @@ int kvm_timer_hyp_init(bool has_gic)
kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);
+ /* Now let's do the physical EL1 timer irq */
+
+ if (info->physical_irq > 0) {
+ host_ptimer_irq = info->physical_irq;
+ host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq);
+ if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH &&
+ host_ptimer_irq_flags != IRQF_TRIGGER_LOW) {
+ kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n",
+ host_ptimer_irq);
+ host_ptimer_irq_flags = IRQF_TRIGGER_LOW;
+ }
+
+ err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
+ "kvm guest ptimer", kvm_get_running_vcpus());
+ if (err) {
+ kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
+ host_ptimer_irq, err);
+ return err;
+ }
+
+ if (has_gic) {
+ err = irq_set_vcpu_affinity(host_ptimer_irq,
+ kvm_get_running_vcpus());
+ if (err) {
+ kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
+ goto out_free_irq;
+ }
+ }
+
+ kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
+ } else if (has_vhe()) {
+ kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
+ info->physical_irq);
+ err = -ENODEV;
+ goto out_free_irq;
+ }
+
cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
"kvm/arm/timer:starting", kvm_timer_starting_cpu,
kvm_timer_dying_cpu);
@@ -791,12 +1002,9 @@ out_free_irq:
void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
- soft_timer_cancel(&timer->bg_timer, &timer->expired);
- soft_timer_cancel(&timer->phys_timer, NULL);
- kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
+ soft_timer_cancel(&timer->bg_timer);
}
static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
@@ -830,16 +1038,18 @@ bool kvm_arch_timer_get_input_level(int vintid)
if (vintid == vcpu_vtimer(vcpu)->irq.irq)
timer = vcpu_vtimer(vcpu);
+ else if (vintid == vcpu_ptimer(vcpu)->irq.irq)
+ timer = vcpu_ptimer(vcpu);
else
- BUG(); /* We only map the vtimer so far */
+ BUG();
return kvm_timer_should_fire(timer);
}
int kvm_timer_enable(struct kvm_vcpu *vcpu)
{
- struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
+ struct arch_timer_cpu *timer = vcpu_timer(vcpu);
+ struct timer_map map;
int ret;
if (timer->enabled)
@@ -857,19 +1067,33 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu)
return -EINVAL;
}
- ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq,
+ get_timer_map(vcpu, &map);
+
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map.direct_vtimer->host_timer_irq,
+ map.direct_vtimer->irq.irq,
kvm_arch_timer_get_input_level);
if (ret)
return ret;
+ if (map.direct_ptimer) {
+ ret = kvm_vgic_map_phys_irq(vcpu,
+ map.direct_ptimer->host_timer_irq,
+ map.direct_ptimer->irq.irq,
+ kvm_arch_timer_get_input_level);
+ }
+
+ if (ret)
+ return ret;
+
no_vgic:
timer->enabled = 1;
return 0;
}
/*
- * On VHE system, we only need to configure trap on physical timer and counter
- * accesses in EL0 and EL1 once, not for every world switch.
+ * On VHE system, we only need to configure the EL2 timer trap register once,
+ * not for every world switch.
* The host kernel runs at EL2 with HCR_EL2.TGE == 1,
* and this makes those bits have no effect for the host kernel execution.
*/
@@ -880,11 +1104,11 @@ void kvm_timer_init_vhe(void)
u64 val;
/*
- * Disallow physical timer access for the guest.
- * Physical counter access is allowed.
+ * VHE systems allow the guest direct access to the EL1 physical
+ * timer/counter.
*/
val = read_sysreg(cnthctl_el2);
- val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift);
+ val |= (CNTHCTL_EL1PCEN << cnthctl_shift);
val |= (CNTHCTL_EL1PCTEN << cnthctl_shift);
write_sysreg(val, cnthctl_el2);
}
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 23774970c9df..f412ebc90610 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -65,8 +65,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
/* The VMID used in the VTTBR */
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
static u32 kvm_next_vmid;
-static unsigned int kvm_vmid_bits __read_mostly;
-static DEFINE_RWLOCK(kvm_vmid_lock);
+static DEFINE_SPINLOCK(kvm_vmid_lock);
static bool vgic_present;
@@ -142,7 +141,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_vgic_early_init(kvm);
/* Mark the initial VMID generation invalid */
- kvm->arch.vmid_gen = 0;
+ kvm->arch.vmid.vmid_gen = 0;
/* The maximum number of VCPUs is limited by the host's GIC model */
kvm->arch.max_vcpus = vgic_present ?
@@ -336,13 +335,11 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- kvm_timer_schedule(vcpu);
kvm_vgic_v4_enable_doorbell(vcpu);
}
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- kvm_timer_unschedule(vcpu);
kvm_vgic_v4_disable_doorbell(vcpu);
}
@@ -472,51 +469,42 @@ void force_vm_exit(const cpumask_t *mask)
/**
* need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to check
+ * @vmid: The VMID to check
*
* return true if there is a new generation of VMIDs being used
*
- * The hardware supports only 256 values with the value zero reserved for the
- * host, so we check if an assigned value belongs to a previous generation,
- * which which requires us to assign a new value. If we're the first to use a
- * VMID for the new generation, we must flush necessary caches and TLBs on all
- * CPUs.
+ * The hardware supports a limited set of values with the value zero reserved
+ * for the host, so we check if an assigned value belongs to a previous
+ * generation, which which requires us to assign a new value. If we're the
+ * first to use a VMID for the new generation, we must flush necessary caches
+ * and TLBs on all CPUs.
*/
-static bool need_new_vmid_gen(struct kvm *kvm)
+static bool need_new_vmid_gen(struct kvm_vmid *vmid)
{
- return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
+ u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
+ smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
+ return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
}
/**
- * update_vttbr - Update the VTTBR with a valid VMID before the guest runs
- * @kvm The guest that we are about to run
- *
- * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the
- * VM has a valid VMID, otherwise assigns a new one and flushes corresponding
- * caches and TLBs.
+ * update_vmid - Update the vmid with a valid VMID for the current generation
+ * @kvm: The guest that struct vmid belongs to
+ * @vmid: The stage-2 VMID information struct
*/
-static void update_vttbr(struct kvm *kvm)
+static void update_vmid(struct kvm_vmid *vmid)
{
- phys_addr_t pgd_phys;
- u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0;
- bool new_gen;
-
- read_lock(&kvm_vmid_lock);
- new_gen = need_new_vmid_gen(kvm);
- read_unlock(&kvm_vmid_lock);
-
- if (!new_gen)
+ if (!need_new_vmid_gen(vmid))
return;
- write_lock(&kvm_vmid_lock);
+ spin_lock(&kvm_vmid_lock);
/*
* We need to re-check the vmid_gen here to ensure that if another vcpu
* already allocated a valid vmid for this vm, then this vcpu should
* use the same vmid.
*/
- if (!need_new_vmid_gen(kvm)) {
- write_unlock(&kvm_vmid_lock);
+ if (!need_new_vmid_gen(vmid)) {
+ spin_unlock(&kvm_vmid_lock);
return;
}
@@ -539,18 +527,14 @@ static void update_vttbr(struct kvm *kvm)
kvm_call_hyp(__kvm_flush_vm_context);
}
- kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
- kvm->arch.vmid = kvm_next_vmid;
+ vmid->vmid = kvm_next_vmid;
kvm_next_vmid++;
- kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
+ kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
- /* update vttbr to be used with the new vmid */
- pgd_phys = virt_to_phys(kvm->arch.pgd);
- BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm));
- vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
- kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp;
+ smp_wmb();
+ WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
- write_unlock(&kvm_vmid_lock);
+ spin_unlock(&kvm_vmid_lock);
}
static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
@@ -627,6 +611,13 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
/* Awaken to handle a signal, request we sleep again later. */
kvm_make_request(KVM_REQ_SLEEP, vcpu);
}
+
+ /*
+ * Make sure we will observe a potential reset request if we've
+ * observed a change to the power state. Pairs with the smp_wmb() in
+ * kvm_psci_vcpu_on().
+ */
+ smp_rmb();
}
static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@@ -640,6 +631,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
vcpu_req_sleep(vcpu);
+ if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+ kvm_reset_vcpu(vcpu);
+
/*
* Clear IRQ_PENDING requests that were made to guarantee
* that a VCPU sees new virtual interrupts.
@@ -674,8 +668,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
ret = kvm_handle_mmio_return(vcpu, vcpu->run);
if (ret)
return ret;
- if (kvm_arm_handle_step_debug(vcpu, vcpu->run))
- return 0;
}
if (run->immediate_exit)
@@ -693,7 +685,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
*/
cond_resched();
- update_vttbr(vcpu->kvm);
+ update_vmid(&vcpu->kvm->arch.vmid);
check_vcpu_requests(vcpu);
@@ -742,7 +734,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
*/
smp_store_mb(vcpu->mode, IN_GUEST_MODE);
- if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
+ if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) ||
kvm_request_pending(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
isb(); /* Ensure work in x_flush_hwstate is committed */
@@ -768,7 +760,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
ret = kvm_vcpu_run_vhe(vcpu);
kvm_arm_vhe_guest_exit();
} else {
- ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu);
+ ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu);
}
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -942,7 +934,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
const struct kvm_vcpu_init *init)
{
- unsigned int i;
+ unsigned int i, ret;
int phys_target = kvm_target_cpu();
if (init->target != phys_target)
@@ -977,9 +969,14 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
vcpu->arch.target = phys_target;
/* Now we know what it is, we can reset it. */
- return kvm_reset_vcpu(vcpu);
-}
+ ret = kvm_reset_vcpu(vcpu);
+ if (ret) {
+ vcpu->arch.target = -1;
+ bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
+ }
+ return ret;
+}
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
struct kvm_vcpu_init *init)
@@ -1205,14 +1202,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
*/
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- bool is_dirty = false;
+ bool flush = false;
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+
+ r = kvm_get_dirty_log_protect(kvm, log, &flush);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
+{
+ bool flush = false;
int r;
mutex_lock(&kvm->slots_lock);
- r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
+ r = kvm_clear_dirty_log_protect(kvm, log, &flush);
- if (is_dirty)
+ if (flush)
kvm_flush_remote_tlbs(kvm);
mutex_unlock(&kvm->slots_lock);
@@ -1404,10 +1417,6 @@ static inline void hyp_cpu_pm_exit(void)
static int init_common_resources(void)
{
- /* set size of VMID supported by CPU */
- kvm_vmid_bits = kvm_get_vmid_bits();
- kvm_info("%d-bit VMID\n", kvm_vmid_bits);
-
kvm_set_ipa_limit();
return 0;
@@ -1548,6 +1557,7 @@ static int init_hyp_mode(void)
kvm_cpu_context_t *cpu_ctxt;
cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu);
+ kvm_init_host_cpu_context(cpu_ctxt, cpu);
err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
if (err) {
@@ -1558,7 +1568,7 @@ static int init_hyp_mode(void)
err = hyp_map_aux_data();
if (err)
- kvm_err("Cannot map host auxilary data: %d\n", err);
+ kvm_err("Cannot map host auxiliary data: %d\n", err);
return 0;
@@ -1640,8 +1650,10 @@ int kvm_arch_init(void *opaque)
return -ENODEV;
}
- if (!kvm_arch_check_sve_has_vhe()) {
- kvm_pr_unimpl("SVE system without VHE unsupported. Broken cpu?");
+ in_hyp_mode = is_kernel_in_hyp_mode();
+
+ if (!in_hyp_mode && kvm_arch_requires_vhe()) {
+ kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n");
return -ENODEV;
}
@@ -1657,8 +1669,6 @@ int kvm_arch_init(void *opaque)
if (err)
return err;
- in_hyp_mode = is_kernel_in_hyp_mode();
-
if (!in_hyp_mode) {
err = init_hyp_mode();
if (err)
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 616e5a433ab0..370bd6c5e6cb 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -222,11 +222,11 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
}
}
- if (used_lrs) {
+ if (used_lrs || cpu_if->its_vpe.its_vm) {
int i;
u32 elrsr;
- elrsr = read_gicreg(ICH_ELSR_EL2);
+ elrsr = read_gicreg(ICH_ELRSR_EL2);
write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2);
@@ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs;
int i;
- if (used_lrs) {
+ if (used_lrs || cpu_if->its_vpe.its_vm) {
write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
for (i = 0; i < used_lrs; i++)
@@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
esr = kvm_vcpu_get_hsr(vcpu);
if (vcpu_mode_is_32bit(vcpu)) {
- if (!kvm_condition_valid(vcpu))
+ if (!kvm_condition_valid(vcpu)) {
+ __kvm_skip_instr(vcpu);
return 1;
+ }
sysreg = esr_cp15_to_sysreg(esr);
} else {
@@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
rt = kvm_vcpu_sys_get_rt(vcpu);
fn(vcpu, vmcr, rt);
+ __kvm_skip_instr(vcpu);
+
return 1;
}
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index dac7ceb1a677..08443a15e6be 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
}
+ /*
+ * The MMIO instruction is emulated and should not be re-executed
+ * in the guest.
+ */
+ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
return 0;
}
@@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
vcpu->arch.mmio_decode.sign_extend = sign_extend;
vcpu->arch.mmio_decode.rt = rt;
- /*
- * The MMIO instruction is emulated and should not be re-executed
- * in the guest.
- */
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
return 0;
}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 5eca48bdb1a6..a39dcfdbcc65 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -27,10 +27,10 @@
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_mmio.h>
+#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
-#include <asm/system_misc.h>
#include "trace.h"
@@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
* @addr: IPA
* @pmd: pmd pointer for IPA
*
- * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all
- * pages in the range dirty.
+ * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs.
*/
static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
{
@@ -115,6 +114,24 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
put_page(virt_to_page(pmd));
}
+/**
+ * stage2_dissolve_pud() - clear and flush huge PUD entry
+ * @kvm: pointer to kvm structure.
+ * @addr: IPA
+ * @pud: pud pointer for IPA
+ *
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs.
+ */
+static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
+{
+ if (!stage2_pud_huge(kvm, *pudp))
+ return;
+
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ put_page(virt_to_page(pudp));
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
int min, int max)
{
@@ -607,7 +624,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
addr = start;
do {
pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, pfn_pte(pfn, prot));
+ kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
get_page(virt_to_page(pte));
pfn++;
} while (addr += PAGE_SIZE, addr != end);
@@ -628,7 +645,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
BUG_ON(pmd_sect(*pmd));
if (pmd_none(*pmd)) {
- pte = pte_alloc_one_kernel(NULL, addr);
+ pte = pte_alloc_one_kernel(NULL);
if (!pte) {
kvm_err("Cannot allocate Hyp pte\n");
return -ENOMEM;
@@ -880,15 +897,15 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
* kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
* @kvm: The KVM struct pointer for the VM.
*
- * Allocates only the stage-2 HW PGD level table(s) (can support either full
- * 40-bit input addresses or limited to 32-bit input addresses). Clears the
- * allocated pages.
+ * Allocates only the stage-2 HW PGD level table(s) of size defined by
+ * stage2_pgd_size(kvm).
*
* Note we don't need locking here as this is only called when the VM is
* created, which can only be done once.
*/
int kvm_alloc_stage2_pgd(struct kvm *kvm)
{
+ phys_addr_t pgd_phys;
pgd_t *pgd;
if (kvm->arch.pgd != NULL) {
@@ -901,7 +918,12 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
if (!pgd)
return -ENOMEM;
+ pgd_phys = virt_to_phys(pgd);
+ if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)))
+ return -EINVAL;
+
kvm->arch.pgd = pgd;
+ kvm->arch.pgd_phys = pgd_phys;
return 0;
}
@@ -989,6 +1011,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
unmap_stage2_range(kvm, 0, kvm_phys_size(kvm));
pgd = READ_ONCE(kvm->arch.pgd);
kvm->arch.pgd = NULL;
+ kvm->arch.pgd_phys = 0;
}
spin_unlock(&kvm->mmu_lock);
@@ -1022,7 +1045,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
pmd_t *pmd;
pud = stage2_get_pud(kvm, cache, addr);
- if (!pud)
+ if (!pud || stage2_pud_huge(kvm, *pud))
return NULL;
if (stage2_pud_none(kvm, *pud)) {
@@ -1041,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
{
pmd_t *pmd, old_pmd;
+retry:
pmd = stage2_get_pmd(kvm, cache, addr);
VM_BUG_ON(!pmd);
old_pmd = *pmd;
+ /*
+ * Multiple vcpus faulting on the same PMD entry, can
+ * lead to them sequentially updating the PMD with the
+ * same value. Following the break-before-make
+ * (pmd_clear() followed by tlb_flush()) process can
+ * hinder forward progress due to refaults generated
+ * on missing translations.
+ *
+ * Skip updating the page table if the entry is
+ * unchanged.
+ */
+ if (pmd_val(old_pmd) == pmd_val(*new_pmd))
+ return 0;
+
if (pmd_present(old_pmd)) {
/*
- * Multiple vcpus faulting on the same PMD entry, can
- * lead to them sequentially updating the PMD with the
- * same value. Following the break-before-make
- * (pmd_clear() followed by tlb_flush()) process can
- * hinder forward progress due to refaults generated
- * on missing translations.
+ * If we already have PTE level mapping for this block,
+ * we must unmap it to avoid inconsistent TLB state and
+ * leaking the table page. We could end up in this situation
+ * if the memory slot was marked for dirty logging and was
+ * reverted, leaving PTE level mappings for the pages accessed
+ * during the period. So, unmap the PTE level mapping for this
+ * block and retry, as we could have released the upper level
+ * table in the process.
*
- * Skip updating the page table if the entry is
- * unchanged.
+ * Normal THP split/merge follows mmu_notifier callbacks and do
+ * get handled accordingly.
*/
- if (pmd_val(old_pmd) == pmd_val(*new_pmd))
- return 0;
-
+ if (!pmd_thp_or_huge(old_pmd)) {
+ unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE);
+ goto retry;
+ }
/*
* Mapping in huge pages should only happen through a
* fault. If a page is merged into a transparent huge
@@ -1071,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
* should become splitting first, unmapped, merged,
* and mapped back in on-demand.
*/
- VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
-
+ WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd));
pmd_clear(pmd);
kvm_tlb_flush_vmid_ipa(kvm, addr);
} else {
@@ -1083,29 +1123,113 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
return 0;
}
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr, const pud_t *new_pudp)
+{
+ pud_t *pudp, old_pud;
+
+retry:
+ pudp = stage2_get_pud(kvm, cache, addr);
+ VM_BUG_ON(!pudp);
+
+ old_pud = *pudp;
+
+ /*
+ * A large number of vcpus faulting on the same stage 2 entry,
+ * can lead to a refault due to the stage2_pud_clear()/tlb_flush().
+ * Skip updating the page tables if there is no change.
+ */
+ if (pud_val(old_pud) == pud_val(*new_pudp))
+ return 0;
+
+ if (stage2_pud_present(kvm, old_pud)) {
+ /*
+ * If we already have table level mapping for this block, unmap
+ * the range for this block and retry.
+ */
+ if (!stage2_pud_huge(kvm, old_pud)) {
+ unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE);
+ goto retry;
+ }
+
+ WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp));
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pudp));
+ }
+
+ kvm_set_pud(pudp, *new_pudp);
+ return 0;
+}
+
+/*
+ * stage2_get_leaf_entry - walk the stage2 VM page tables and return
+ * true if a valid and present leaf-entry is found. A pointer to the
+ * leaf-entry is returned in the appropriate level variable - pudpp,
+ * pmdpp, ptepp.
+ */
+static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
+ pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
{
+ pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
- pmdp = stage2_get_pmd(kvm, NULL, addr);
+ *pudpp = NULL;
+ *pmdpp = NULL;
+ *ptepp = NULL;
+
+ pudp = stage2_get_pud(kvm, NULL, addr);
+ if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
+ return false;
+
+ if (stage2_pud_huge(kvm, *pudp)) {
+ *pudpp = pudp;
+ return true;
+ }
+
+ pmdp = stage2_pmd_offset(kvm, pudp, addr);
if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
return false;
- if (pmd_thp_or_huge(*pmdp))
- return kvm_s2pmd_exec(pmdp);
+ if (pmd_thp_or_huge(*pmdp)) {
+ *pmdpp = pmdp;
+ return true;
+ }
ptep = pte_offset_kernel(pmdp, addr);
if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
return false;
- return kvm_s2pte_exec(ptep);
+ *ptepp = ptep;
+ return true;
+}
+
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ bool found;
+
+ found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
+ if (!found)
+ return false;
+
+ if (pudp)
+ return kvm_s2pud_exec(pudp);
+ else if (pmdp)
+ return kvm_s2pmd_exec(pmdp);
+ else
+ return kvm_s2pte_exec(ptep);
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr, const pte_t *new_pte,
unsigned long flags)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte, old_pte;
bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
@@ -1114,7 +1238,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
VM_BUG_ON(logging_active && !cache);
/* Create stage-2 page table mapping - Levels 0 and 1 */
- pmd = stage2_get_pmd(kvm, cache, addr);
+ pud = stage2_get_pud(kvm, cache, addr);
+ if (!pud) {
+ /*
+ * Ignore calls from kvm_set_spte_hva for unallocated
+ * address ranges.
+ */
+ return 0;
+ }
+
+ /*
+ * While dirty page logging - dissolve huge PUD, then continue
+ * on to allocate page.
+ */
+ if (logging_active)
+ stage2_dissolve_pud(kvm, addr, pud);
+
+ if (stage2_pud_none(kvm, *pud)) {
+ if (!cache)
+ return 0; /* ignore calls from kvm_set_spte_hva */
+ pmd = mmu_memory_cache_alloc(cache);
+ stage2_pud_populate(kvm, pud, pmd);
+ get_page(virt_to_page(pud));
+ }
+
+ pmd = stage2_pmd_offset(kvm, pud, addr);
if (!pmd) {
/*
* Ignore calls from kvm_set_spte_hva for unallocated
@@ -1182,6 +1330,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}
+static int stage2_pudp_test_and_clear_young(pud_t *pud)
+{
+ return stage2_ptep_test_and_clear_young((pte_t *)pud);
+}
+
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
*
@@ -1202,7 +1355,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
pfn = __phys_to_pfn(pa);
for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
+ pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
if (writable)
pte = kvm_s2pte_mkwrite(pte);
@@ -1234,7 +1387,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
struct page *page = pfn_to_page(pfn);
/*
- * PageTransCompoungMap() returns true for THP and
+ * PageTransCompoundMap() returns true for THP and
* hugetlbfs. Make sure the adjustment is done only for THP
* pages.
*/
@@ -1274,14 +1427,6 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
return false;
}
-static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
-{
- if (kvm_vcpu_trap_is_iabt(vcpu))
- return false;
-
- return kvm_vcpu_dabt_iswrite(vcpu);
-}
-
/**
* stage2_wp_ptes - write protect PMD range
* @pmd: pointer to pmd entry
@@ -1330,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud,
}
/**
- * stage2_wp_puds - write protect PGD range
- * @pgd: pointer to pgd entry
- * @addr: range start address
- * @end: range end address
- *
- * Process PUD entries, for a huge PUD we cause a panic.
- */
+ * stage2_wp_puds - write protect PGD range
+ * @pgd: pointer to pgd entry
+ * @addr: range start address
+ * @end: range end address
+ */
static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
phys_addr_t addr, phys_addr_t end)
{
@@ -1347,9 +1490,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
do {
next = stage2_pud_addr_end(kvm, addr, end);
if (!stage2_pud_none(kvm, *pud)) {
- /* TODO:PUD not supported, revisit later if supported */
- BUG_ON(stage2_pud_huge(kvm, *pud));
- stage2_wp_pmds(kvm, pud, addr, next);
+ if (stage2_pud_huge(kvm, *pud)) {
+ if (!kvm_s2pud_readonly(pud))
+ kvm_set_s2pud_readonly(pud);
+ } else {
+ stage2_wp_pmds(kvm, pud, addr, next);
+ }
}
} while (pud++, addr = next, addr != end);
}
@@ -1392,7 +1538,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
*
* Called to start logging dirty pages after memory region
* KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
- * all present PMD and PTEs are write protected in the memory region.
+ * all present PUD, PMD and PTEs are write protected in the memory region.
* Afterwards read of dirty page log can be called.
*
* Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
@@ -1470,12 +1616,70 @@ static void kvm_send_hwpoison_signal(unsigned long address,
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}
+static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
+ unsigned long hva,
+ unsigned long map_size)
+{
+ gpa_t gpa_start;
+ hva_t uaddr_start, uaddr_end;
+ size_t size;
+
+ size = memslot->npages * PAGE_SIZE;
+
+ gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+ uaddr_start = memslot->userspace_addr;
+ uaddr_end = uaddr_start + size;
+
+ /*
+ * Pages belonging to memslots that don't have the same alignment
+ * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
+ * PMD/PUD entries, because we'll end up mapping the wrong pages.
+ *
+ * Consider a layout like the following:
+ *
+ * memslot->userspace_addr:
+ * +-----+--------------------+--------------------+---+
+ * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz|
+ * +-----+--------------------+--------------------+---+
+ *
+ * memslot->base_gfn << PAGE_SIZE:
+ * +---+--------------------+--------------------+-----+
+ * |abc|def Stage-2 block | Stage-2 block |tvxyz|
+ * +---+--------------------+--------------------+-----+
+ *
+ * If we create those stage-2 blocks, we'll end up with this incorrect
+ * mapping:
+ * d -> f
+ * e -> g
+ * f -> h
+ */
+ if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
+ return false;
+
+ /*
+ * Next, let's make sure we're not trying to map anything not covered
+ * by the memslot. This means we have to prohibit block size mappings
+ * for the beginning and end of a non-block aligned and non-block sized
+ * memory slot (illustrated by the head and tail parts of the
+ * userspace view above containing pages 'abcde' and 'xyz',
+ * respectively).
+ *
+ * Note that it doesn't matter if we do the check using the
+ * userspace_addr or the base_gfn, as both are equally aligned (per
+ * the check above) and equally sized.
+ */
+ return (hva & ~(map_size - 1)) >= uaddr_start &&
+ (hva & ~(map_size - 1)) + map_size <= uaddr_end;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
{
int ret;
- bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
+ bool write_fault, writable, force_pte = false;
+ bool exec_fault, needs_exec;
unsigned long mmu_seq;
gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
@@ -1484,7 +1688,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long flags = 0;
+ unsigned long vma_pagesize, flags = 0;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1504,23 +1708,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
- hugetlb = true;
- gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
- } else {
- /*
- * Pages belonging to memslots that don't have the same
- * alignment for userspace and IPA cannot be mapped using
- * block descriptors even if the pages belong to a THP for
- * the process, because the stage-2 block descriptor will
- * cover more than a single THP and we loose atomicity for
- * unmapping, updates, and splits of the THP or other pages
- * in the stage-2 block range.
- */
- if ((memslot->userspace_addr & ~PMD_MASK) !=
- ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
- force_pte = true;
+ vma_pagesize = vma_kernel_pagesize(vma);
+ if (logging_active ||
+ !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
+ force_pte = true;
+ vma_pagesize = PAGE_SIZE;
}
+
+ /*
+ * The stage2 has a minimum of 2 level table (For arm64 see
+ * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
+ * use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
+ * As for PUD huge maps, we must make sure that we have at least
+ * 3 levels, i.e, PMD is not folded.
+ */
+ if (vma_pagesize == PMD_SIZE ||
+ (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm)))
+ gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
up_read(&current->mm->mmap_sem);
/* We need minimum second+third level pages */
@@ -1558,7 +1762,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* should not be mapped with huge pages (it introduces churn
* and performance degradation), so force a pte mapping.
*/
- force_pte = true;
flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
/*
@@ -1573,50 +1776,73 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
- if (!hugetlb && !force_pte)
- hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+ if (vma_pagesize == PAGE_SIZE && !force_pte) {
+ /*
+ * Only PMD_SIZE transparent hugepages(THP) are
+ * currently supported. This code will need to be
+ * updated to support other THP sizes.
+ *
+ * Make sure the host VA and the guest IPA are sufficiently
+ * aligned and that the block is contained within the memslot.
+ */
+ if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) &&
+ transparent_hugepage_adjust(&pfn, &fault_ipa))
+ vma_pagesize = PMD_SIZE;
+ }
+
+ if (writable)
+ kvm_set_pfn_dirty(pfn);
- if (hugetlb) {
- pmd_t new_pmd = pfn_pmd(pfn, mem_type);
- new_pmd = pmd_mkhuge(new_pmd);
- if (writable) {
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
- kvm_set_pfn_dirty(pfn);
- }
+ if (fault_status != FSC_PERM)
+ clean_dcache_guest_page(pfn, vma_pagesize);
+
+ if (exec_fault)
+ invalidate_icache_guest_page(pfn, vma_pagesize);
+
+ /*
+ * If we took an execution fault we have made the
+ * icache/dcache coherent above and should now let the s2
+ * mapping be executable.
+ *
+ * Write faults (!exec_fault && FSC_PERM) are orthogonal to
+ * execute permissions, and we preserve whatever we have.
+ */
+ needs_exec = exec_fault ||
+ (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
- if (fault_status != FSC_PERM)
- clean_dcache_guest_page(pfn, PMD_SIZE);
+ if (vma_pagesize == PUD_SIZE) {
+ pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
- if (exec_fault) {
+ new_pud = kvm_pud_mkhuge(new_pud);
+ if (writable)
+ new_pud = kvm_s2pud_mkwrite(new_pud);
+
+ if (needs_exec)
+ new_pud = kvm_s2pud_mkexec(new_pud);
+
+ ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
+ } else if (vma_pagesize == PMD_SIZE) {
+ pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
+
+ new_pmd = kvm_pmd_mkhuge(new_pmd);
+
+ if (writable)
+ new_pmd = kvm_s2pmd_mkwrite(new_pmd);
+
+ if (needs_exec)
new_pmd = kvm_s2pmd_mkexec(new_pmd);
- invalidate_icache_guest_page(pfn, PMD_SIZE);
- } else if (fault_status == FSC_PERM) {
- /* Preserve execute if XN was already cleared */
- if (stage2_is_exec(kvm, fault_ipa))
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
- }
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
- pte_t new_pte = pfn_pte(pfn, mem_type);
+ pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
if (writable) {
new_pte = kvm_s2pte_mkwrite(new_pte);
- kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
- if (fault_status != FSC_PERM)
- clean_dcache_guest_page(pfn, PAGE_SIZE);
-
- if (exec_fault) {
+ if (needs_exec)
new_pte = kvm_s2pte_mkexec(new_pte);
- invalidate_icache_guest_page(pfn, PAGE_SIZE);
- } else if (fault_status == FSC_PERM) {
- /* Preserve execute if XN was already cleared */
- if (stage2_is_exec(kvm, fault_ipa))
- new_pte = kvm_s2pte_mkexec(new_pte);
- }
ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
}
@@ -1637,6 +1863,7 @@ out_unlock:
*/
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
kvm_pfn_t pfn;
@@ -1646,24 +1873,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
spin_lock(&vcpu->kvm->mmu_lock);
- pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
goto out;
- if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
+ if (pud) { /* HugeTLB */
+ *pud = kvm_s2pud_mkyoung(*pud);
+ pfn = kvm_pud_pfn(*pud);
+ pfn_valid = true;
+ } else if (pmd) { /* THP, HugeTLB */
*pmd = pmd_mkyoung(*pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
- goto out;
+ } else {
+ *pte = pte_mkyoung(*pte); /* Just a page... */
+ pfn = pte_pfn(*pte);
+ pfn_valid = true;
}
- pte = pte_offset_kernel(pmd, fault_ipa);
- if (pte_none(*pte)) /* Nothing there either */
- goto out;
-
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
out:
spin_unlock(&vcpu->kvm->mmu_lock);
if (pfn_valid)
@@ -1703,7 +1929,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
* For RAS the host kernel may handle this abort.
* There is no need to pass the error into the guest.
*/
- if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+ if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
return 1;
if (unlikely(!is_iabt)) {
@@ -1849,14 +2075,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data
}
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
unsigned long end = hva + PAGE_SIZE;
kvm_pfn_t pfn = pte_pfn(pte);
pte_t stage2_pte;
if (!kvm->arch.pgd)
- return;
+ return 0;
trace_kvm_set_spte_hva(hva);
@@ -1865,48 +2091,46 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = pfn_pte(pfn, PAGE_S2);
+ stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
+
+ return 0;
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
- pmd = stage2_get_pmd(kvm, NULL, gpa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
return 0;
- if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ if (pud)
+ return stage2_pudp_test_and_clear_young(pud);
+ else if (pmd)
return stage2_pmdp_test_and_clear_young(pmd);
-
- pte = pte_offset_kernel(pmd, gpa);
- if (pte_none(*pte))
- return 0;
-
- return stage2_ptep_test_and_clear_young(pte);
+ else
+ return stage2_ptep_test_and_clear_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
- pmd = stage2_get_pmd(kvm, NULL, gpa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
return 0;
- if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ if (pud)
+ return kvm_s2pud_young(*pud);
+ else if (pmd)
return pmd_young(*pmd);
-
- pte = pte_offset_kernel(pmd, gpa);
- if (!pte_none(*pte)) /* Just a page... */
+ else
return pte_young(*pte);
-
- return 0;
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
@@ -2152,7 +2376,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
return 0;
}
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 9b73d3ad918a..34d08ee63747 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -104,12 +104,10 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{
+ struct vcpu_reset_state *reset_state;
struct kvm *kvm = source_vcpu->kvm;
struct kvm_vcpu *vcpu = NULL;
- struct swait_queue_head *wq;
unsigned long cpu_id;
- unsigned long context_id;
- phys_addr_t target_pc;
cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
if (vcpu_mode_is_32bit(source_vcpu))
@@ -130,32 +128,30 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
return PSCI_RET_INVALID_PARAMS;
}
- target_pc = smccc_get_arg2(source_vcpu);
- context_id = smccc_get_arg3(source_vcpu);
+ reset_state = &vcpu->arch.reset_state;
- kvm_reset_vcpu(vcpu);
-
- /* Gracefully handle Thumb2 entry point */
- if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
- target_pc &= ~((phys_addr_t) 1);
- vcpu_set_thumb(vcpu);
- }
+ reset_state->pc = smccc_get_arg2(source_vcpu);
/* Propagate caller endianness */
- if (kvm_vcpu_is_be(source_vcpu))
- kvm_vcpu_set_be(vcpu);
+ reset_state->be = kvm_vcpu_is_be(source_vcpu);
- *vcpu_pc(vcpu) = target_pc;
/*
* NOTE: We always update r0 (or x0) because for PSCI v0.1
* the general puspose registers are undefined upon CPU_ON.
*/
- smccc_set_retval(vcpu, context_id, 0, 0, 0);
- vcpu->arch.power_off = false;
- smp_mb(); /* Make sure the above is visible */
+ reset_state->r0 = smccc_get_arg3(source_vcpu);
+
+ WRITE_ONCE(reset_state->reset, true);
+ kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
- wq = kvm_arch_vcpu_wq(vcpu);
- swake_up_one(wq);
+ /*
+ * Make sure the reset request is observed if the change to
+ * power_state is observed.
+ */
+ smp_wmb();
+
+ vcpu->arch.power_off = false;
+ kvm_vcpu_wake_up(vcpu);
return PSCI_RET_SUCCESS;
}
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
index 57b3edebbb40..204d210d01c2 100644
--- a/virt/kvm/arm/trace.h
+++ b/virt/kvm/arm/trace.h
@@ -2,6 +2,7 @@
#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_H
+#include <kvm/arm_arch_timer.h>
#include <linux/tracepoint.h>
#undef TRACE_SYSTEM
@@ -26,25 +27,25 @@ TRACE_EVENT(kvm_entry,
);
TRACE_EVENT(kvm_exit,
- TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
- TP_ARGS(idx, exit_reason, vcpu_pc),
+ TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
+ TP_ARGS(ret, esr_ec, vcpu_pc),
TP_STRUCT__entry(
- __field( int, idx )
- __field( unsigned int, exit_reason )
+ __field( int, ret )
+ __field( unsigned int, esr_ec )
__field( unsigned long, vcpu_pc )
),
TP_fast_assign(
- __entry->idx = idx;
- __entry->exit_reason = exit_reason;
+ __entry->ret = ARM_EXCEPTION_CODE(ret);
+ __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
__entry->vcpu_pc = vcpu_pc;
),
TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
- __print_symbolic(__entry->idx, kvm_arm_exception_type),
- __entry->exit_reason,
- __print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
+ __print_symbolic(__entry->ret, kvm_arm_exception_type),
+ __entry->esr_ec,
+ __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
__entry->vcpu_pc)
);
@@ -262,10 +263,114 @@ TRACE_EVENT(kvm_timer_update_irq,
__entry->vcpu_id, __entry->irq, __entry->level)
);
+TRACE_EVENT(kvm_get_timer_map,
+ TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
+ TP_ARGS(vcpu_id, map),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, vcpu_id )
+ __field( int, direct_vtimer )
+ __field( int, direct_ptimer )
+ __field( int, emul_ptimer )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer);
+ __entry->direct_ptimer =
+ (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
+ __entry->emul_ptimer =
+ (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
+ ),
+
+ TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d",
+ __entry->vcpu_id,
+ __entry->direct_vtimer,
+ __entry->direct_ptimer,
+ __entry->emul_ptimer)
+);
+
+TRACE_EVENT(kvm_timer_save_state,
+ TP_PROTO(struct arch_timer_context *ctx),
+ TP_ARGS(ctx),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ctl )
+ __field( unsigned long long, cval )
+ __field( int, timer_idx )
+ ),
+
+ TP_fast_assign(
+ __entry->ctl = ctx->cnt_ctl;
+ __entry->cval = ctx->cnt_cval;
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ ),
+
+ TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+ __entry->ctl,
+ __entry->cval,
+ __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_restore_state,
+ TP_PROTO(struct arch_timer_context *ctx),
+ TP_ARGS(ctx),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, ctl )
+ __field( unsigned long long, cval )
+ __field( int, timer_idx )
+ ),
+
+ TP_fast_assign(
+ __entry->ctl = ctx->cnt_ctl;
+ __entry->cval = ctx->cnt_cval;
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ ),
+
+ TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
+ __entry->ctl,
+ __entry->cval,
+ __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_hrtimer_expire,
+ TP_PROTO(struct arch_timer_context *ctx),
+ TP_ARGS(ctx),
+
+ TP_STRUCT__entry(
+ __field( int, timer_idx )
+ ),
+
+ TP_fast_assign(
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ ),
+
+ TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
+);
+
+TRACE_EVENT(kvm_timer_emulate,
+ TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
+ TP_ARGS(ctx, should_fire),
+
+ TP_STRUCT__entry(
+ __field( int, timer_idx )
+ __field( bool, should_fire )
+ ),
+
+ TP_fast_assign(
+ __entry->timer_idx = arch_timer_ctx_index(ctx);
+ __entry->should_fire = should_fire;
+ ),
+
+ TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
+ __entry->timer_idx, __entry->should_fire)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
+#define TRACE_INCLUDE_PATH ../../virt/kvm/arm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
index 07aa900bac56..1f62f2b8065d 100644
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -251,9 +251,9 @@ static int vgic_debug_show(struct seq_file *s, void *v)
return 0;
}
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
print_irq_state(s, irq, vcpu);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(kvm, irq);
return 0;
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index c0c0b88af1d5..3bdb31eaed64 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -64,7 +64,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
struct vgic_dist *dist = &kvm->arch.vgic;
INIT_LIST_HEAD(&dist->lpi_list_head);
- spin_lock_init(&dist->lpi_list_lock);
+ raw_spin_lock_init(&dist->lpi_list_lock);
}
/* CREATION */
@@ -171,7 +171,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
irq->intid = i + VGIC_NR_PRIVATE_IRQS;
INIT_LIST_HEAD(&irq->ap_list);
- spin_lock_init(&irq->irq_lock);
+ raw_spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
kref_init(&irq->refcount);
@@ -206,7 +206,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
vgic_cpu->sgi_iodev.base_addr = VGIC_ADDR_UNDEF;
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
- spin_lock_init(&vgic_cpu->ap_list_lock);
+ raw_spin_lock_init(&vgic_cpu->ap_list_lock);
/*
* Enable and configure all SGIs to be edge-triggered and
@@ -216,7 +216,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
INIT_LIST_HEAD(&irq->ap_list);
- spin_lock_init(&irq->irq_lock);
+ raw_spin_lock_init(&irq->irq_lock);
irq->intid = i;
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
@@ -231,13 +231,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
irq->config = VGIC_CONFIG_LEVEL;
}
- /*
- * GICv3 can only be created via the KVM_DEVICE_CREATE API and
- * so we always know the emulation type at this point as it's
- * either explicitly configured as GICv3, or explicitly
- * configured as GICv2, or not configured yet which also
- * implies GICv2.
- */
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
irq->group = 1;
else
@@ -281,7 +274,7 @@ int vgic_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct kvm_vcpu *vcpu;
- int ret = 0, i;
+ int ret = 0, i, idx;
if (vgic_initialized(kvm))
return 0;
@@ -298,6 +291,19 @@ int vgic_init(struct kvm *kvm)
if (ret)
goto out;
+ /* Initialize groups on CPUs created before the VGIC type was known */
+ kvm_for_each_vcpu(idx, vcpu, kvm) {
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+ struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ irq->group = 1;
+ else
+ irq->group = 0;
+ }
+ }
+
if (vgic_has_its(kvm)) {
ret = vgic_v4_init(kvm);
if (ret)
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index eb2a390a6c86..44ceaccb18cf 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -65,7 +65,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
INIT_LIST_HEAD(&irq->lpi_list);
INIT_LIST_HEAD(&irq->ap_list);
- spin_lock_init(&irq->irq_lock);
+ raw_spin_lock_init(&irq->irq_lock);
irq->config = VGIC_CONFIG_EDGE;
kref_init(&irq->refcount);
@@ -73,7 +73,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
irq->target_vcpu = vcpu;
irq->group = 1;
- spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
/*
* There could be a race with another vgic_add_lpi(), so we need to
@@ -101,7 +101,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
dist->lpi_list_count++;
out_unlock:
- spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
/*
* We "cache" the configuration table entries in our struct vgic_irq's.
@@ -287,7 +287,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
if (ret)
return ret;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
irq->priority = LPI_PROP_PRIORITY(prop);
@@ -299,7 +299,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
}
}
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
if (irq->hw)
return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);
@@ -332,7 +332,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
if (!intids)
return -ENOMEM;
- spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
if (i == irq_count)
break;
@@ -341,7 +341,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
continue;
intids[i++] = irq->intid;
}
- spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
*intid_ptr = intids;
return i;
@@ -352,9 +352,9 @@ static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
int ret = 0;
unsigned long flags;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->target_vcpu = vcpu;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
if (irq->hw) {
struct its_vlpi_map map;
@@ -455,7 +455,7 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
}
irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = pendmask & (1U << bit_nr);
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
vgic_put_irq(vcpu->kvm, irq);
@@ -612,7 +612,7 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
return irq_set_irqchip_state(irq->host_irq,
IRQCHIP_STATE_PENDING, true);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
vgic_queue_irq_unlock(kvm, irq, flags);
@@ -754,8 +754,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
int esz = GITS_BASER_ENTRY_SIZE(baser);
- int index;
+ int index, idx;
gfn_t gfn;
+ bool ret;
switch (type) {
case GITS_BASER_TYPE_DEVICE:
@@ -782,7 +783,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
if (eaddr)
*eaddr = addr;
- return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+ goto out;
}
/* calculate and check the index into the 1st level */
@@ -812,7 +814,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
if (eaddr)
*eaddr = indirect_ptr;
- return kvm_is_visible_gfn(its->dev->kvm, gfn);
+
+out:
+ idx = srcu_read_lock(&its->dev->kvm->srcu);
+ ret = kvm_is_visible_gfn(its->dev->kvm, gfn);
+ srcu_read_unlock(&its->dev->kvm->srcu, idx);
+ return ret;
}
static int vgic_its_alloc_collection(struct vgic_its *its,
@@ -1729,8 +1736,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
kfree(its);
}
-int vgic_its_has_attr_regs(struct kvm_device *dev,
- struct kvm_device_attr *attr)
+static int vgic_its_has_attr_regs(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
{
const struct vgic_register_region *region;
gpa_t offset = attr->attr;
@@ -1750,9 +1757,9 @@ int vgic_its_has_attr_regs(struct kvm_device *dev,
return 0;
}
-int vgic_its_attr_regs_access(struct kvm_device *dev,
- struct kvm_device_attr *attr,
- u64 *reg, bool is_write)
+static int vgic_its_attr_regs_access(struct kvm_device *dev,
+ struct kvm_device_attr *attr,
+ u64 *reg, bool is_write)
{
const struct vgic_register_region *region;
struct vgic_its *its;
@@ -1919,7 +1926,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) |
ite->collection->collection_id;
val = cpu_to_le64(val);
- return kvm_write_guest(kvm, gpa, &val, ite_esz);
+ return kvm_write_guest_lock(kvm, gpa, &val, ite_esz);
}
/**
@@ -2066,7 +2073,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
(itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) |
(dev->num_eventid_bits - 1));
val = cpu_to_le64(val);
- return kvm_write_guest(kvm, ptr, &val, dte_esz);
+ return kvm_write_guest_lock(kvm, ptr, &val, dte_esz);
}
/**
@@ -2246,7 +2253,7 @@ static int vgic_its_save_cte(struct vgic_its *its,
((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) |
collection->collection_id);
val = cpu_to_le64(val);
- return kvm_write_guest(its->dev->kvm, gpa, &val, esz);
+ return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz);
}
static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz)
@@ -2317,7 +2324,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its)
*/
val = 0;
BUG_ON(cte_esz > sizeof(val));
- ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz);
+ ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz);
return ret;
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 738b65d2d0e7..b535fffc7400 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -147,7 +147,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->pending_latch = true;
irq->source |= 1U << source_vcpu->vcpu_id;
@@ -191,13 +191,13 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i);
int target;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->targets = (val >> (i * 8)) & cpu_mask;
target = irq->targets ? __ffs(irq->targets) : 0;
irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -230,13 +230,13 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
for (i = 0; i < len; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->source &= ~((val >> (i * 8)) & 0xff);
if (!irq->source)
irq->pending_latch = false;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -252,7 +252,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
for (i = 0; i < len; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->source |= (val >> (i * 8)) & 0xff;
@@ -260,7 +260,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
irq->pending_latch = true;
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
} else {
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}
vgic_put_irq(vcpu->kvm, irq);
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index b3d1f0985117..9f4843fe9cda 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -169,13 +169,13 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
if (!irq)
return;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
/* We only care about and preserve Aff0, Aff1 and Aff2. */
irq->mpidr = val & GENMASK(23, 0);
irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -200,6 +200,9 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+ if (was_enabled && !vgic_cpu->lpis_enabled)
+ vgic_flush_pending_lpis(vcpu);
+
if (!was_enabled && vgic_cpu->lpis_enabled)
vgic_enable_lpis(vcpu);
}
@@ -281,7 +284,7 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
for (i = 0; i < len * 8; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (test_bit(i, &val)) {
/*
* pending_latch is set irrespective of irq type
@@ -292,7 +295,7 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
} else {
irq->pending_latch = false;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}
vgic_put_irq(vcpu->kvm, irq);
@@ -957,7 +960,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
/*
* An access targetting Group0 SGIs can only generate
@@ -968,7 +971,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
irq->pending_latch = true;
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
} else {
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}
vgic_put_irq(vcpu->kvm, irq);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index f56ff1cf52ec..7de42fba05b5 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -77,7 +77,7 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
for (i = 0; i < len * 8; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->group = !!(val & BIT(i));
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
@@ -120,7 +120,7 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->enabled = true;
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
@@ -139,11 +139,11 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->enabled = false;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -160,10 +160,10 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
unsigned long flags;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq_is_pending(irq))
value |= (1U << i);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -215,7 +215,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw)
vgic_hw_irq_spending(vcpu, irq, is_uaccess);
else
@@ -262,14 +262,14 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw)
vgic_hw_irq_cpending(vcpu, irq, is_uaccess);
else
irq->pending_latch = false;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -311,44 +311,38 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
unsigned long flags;
struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu();
- spin_lock_irqsave(&irq->irq_lock, flags);
-
- /*
- * If this virtual IRQ was written into a list register, we
- * have to make sure the CPU that runs the VCPU thread has
- * synced back the LR state to the struct vgic_irq.
- *
- * As long as the conditions below are true, we know the VCPU thread
- * may be on its way back from the guest (we kicked the VCPU thread in
- * vgic_change_active_prepare) and still has to sync back this IRQ,
- * so we release and re-acquire the spin_lock to let the other thread
- * sync back the IRQ.
- *
- * When accessing VGIC state from user space, requester_vcpu is
- * NULL, which is fine, because we guarantee that no VCPUs are running
- * when accessing VGIC state from user space so irq->vcpu->cpu is
- * always -1.
- */
- while (irq->vcpu && /* IRQ may have state in an LR somewhere */
- irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */
- irq->vcpu->cpu != -1) /* VCPU thread is running */
- cond_resched_lock(&irq->irq_lock);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->hw) {
vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
} else {
u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u8 active_source;
irq->active = active;
+
+ /*
+ * The GICv2 architecture indicates that the source CPUID for
+ * an SGI should be provided during an EOI which implies that
+ * the active state is stored somewhere, but at the same time
+ * this state is not architecturally exposed anywhere and we
+ * have no way of knowing the right source.
+ *
+ * This may lead to a VCPU not being able to receive
+ * additional instances of a particular SGI after migration
+ * for a GICv2 VM on some GIC implementations. Oh well.
+ */
+ active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
+
if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
active && vgic_irq_is_sgi(irq->intid))
- irq->active_source = requester_vcpu->vcpu_id;
+ irq->active_source = active_source;
}
if (irq->active)
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
else
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}
/*
@@ -368,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
*/
static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
{
- if (intid > VGIC_NR_PRIVATE_IRQS)
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid > VGIC_NR_PRIVATE_IRQS)
kvm_arm_halt_guest(vcpu->kvm);
}
/* See vgic_change_active_prepare */
static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
{
- if (intid > VGIC_NR_PRIVATE_IRQS)
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid > VGIC_NR_PRIVATE_IRQS)
kvm_arm_resume_guest(vcpu->kvm);
}
@@ -489,10 +485,10 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
for (i = 0; i < len; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
/* Narrow the priority range to what we actually support */
irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -538,14 +534,14 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
continue;
irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (test_bit(i * 2 + 1, &val))
irq->config = VGIC_CONFIG_EDGE;
else
irq->config = VGIC_CONFIG_LEVEL;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -594,12 +590,12 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
* restore irq config before line level.
*/
new_level = !!(val & (1U << i));
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->line_level = new_level;
if (new_level)
vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
else
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
}
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index 69b892abd7dc..d91a8938aa7c 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -84,7 +84,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
- spin_lock(&irq->irq_lock);
+ raw_spin_lock(&irq->irq_lock);
/* Always preserve the active bit */
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
@@ -127,7 +127,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
vgic_irq_set_phys_active(irq, false);
}
- spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
}
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 9c0dd234ebe8..9f87e58dbd4a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -76,7 +76,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
if (!irq) /* An LPI could have been unmapped. */
continue;
- spin_lock(&irq->irq_lock);
+ raw_spin_lock(&irq->irq_lock);
/* Always preserve the active bit */
irq->active = !!(val & ICH_LR_ACTIVE_BIT);
@@ -119,7 +119,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
vgic_irq_set_phys_active(irq, false);
}
- spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
vgic_put_irq(vcpu->kvm, irq);
}
@@ -347,9 +347,9 @@ retry:
status = val & (1 << bit_nr);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->target_vcpu != vcpu) {
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
goto retry;
}
irq->pending_latch = status;
@@ -358,7 +358,7 @@ retry:
if (status) {
/* clear consumed data */
val &= ~(1 << bit_nr);
- ret = kvm_write_guest(kvm, ptr, &val, 1);
+ ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
}
@@ -409,7 +409,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm)
else
val &= ~(1 << bit_nr);
- ret = kvm_write_guest(kvm, ptr, &val, 1);
+ ret = kvm_write_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
}
@@ -589,7 +589,7 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
*/
int vgic_v3_probe(const struct gic_kvm_info *info)
{
- u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+ u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2);
int ret;
/*
@@ -679,7 +679,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
if (likely(cpu_if->vgic_sre))
- cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr);
+ cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
kvm_call_hyp(__vgic_v3_save_aprs, vcpu);
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 7cfdfbc910e0..191deccf60bf 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -54,11 +54,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* When taking more than one ap_list_lock at the same time, always take the
* lowest numbered VCPU's ap_list_lock first, so:
* vcpuX->vcpu_id < vcpuY->vcpu_id:
- * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
- * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
+ * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
+ * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
*
* Since the VGIC must support injecting virtual interrupts from ISRs, we have
- * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer
+ * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
* spinlocks for any lock that may be taken while injecting an interrupt.
*/
@@ -72,7 +72,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
struct vgic_irq *irq = NULL;
unsigned long flags;
- spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
if (irq->intid != intid)
@@ -88,7 +88,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
irq = NULL;
out_unlock:
- spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
return irq;
}
@@ -103,13 +103,13 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
{
/* SGIs and PPIs */
if (intid <= VGIC_MAX_PRIVATE) {
- intid = array_index_nospec(intid, VGIC_MAX_PRIVATE);
+ intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
return &vcpu->arch.vgic_cpu.private_irqs[intid];
}
/* SPIs */
- if (intid <= VGIC_MAX_SPI) {
- intid = array_index_nospec(intid, VGIC_MAX_SPI);
+ if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+ intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
}
@@ -138,19 +138,40 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
if (irq->intid < VGIC_MIN_LPI)
return;
- spin_lock_irqsave(&dist->lpi_list_lock, flags);
+ raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
if (!kref_put(&irq->refcount, vgic_irq_release)) {
- spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
return;
};
list_del(&irq->lpi_list);
dist->lpi_list_count--;
- spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+ raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
kfree(irq);
}
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_irq *irq, *tmp;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+
+ list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
+ if (irq->intid >= VGIC_MIN_LPI) {
+ raw_spin_lock(&irq->irq_lock);
+ list_del(&irq->ap_list);
+ irq->vcpu = NULL;
+ raw_spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+}
+
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
{
WARN_ON(irq_set_irqchip_state(irq->host_irq,
@@ -196,7 +217,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
*/
static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
{
- DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+ lockdep_assert_held(&irq->irq_lock);
/* If the interrupt is active, it must stay on the current vcpu */
if (irq->active)
@@ -244,8 +265,8 @@ static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
bool penda, pendb;
int ret;
- spin_lock(&irqa->irq_lock);
- spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&irqa->irq_lock);
+ raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
if (irqa->active || irqb->active) {
ret = (int)irqb->active - (int)irqa->active;
@@ -263,8 +284,8 @@ static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b)
/* Both pending and enabled, sort by priority */
ret = irqa->priority - irqb->priority;
out:
- spin_unlock(&irqb->irq_lock);
- spin_unlock(&irqa->irq_lock);
+ raw_spin_unlock(&irqb->irq_lock);
+ raw_spin_unlock(&irqa->irq_lock);
return ret;
}
@@ -273,7 +294,7 @@ static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+ lockdep_assert_held(&vgic_cpu->ap_list_lock);
list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
}
@@ -311,7 +332,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
{
struct kvm_vcpu *vcpu;
- DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+ lockdep_assert_held(&irq->irq_lock);
retry:
vcpu = vgic_target_oracle(irq);
@@ -325,7 +346,7 @@ retry:
* not need to be inserted into an ap_list and there is also
* no more work for us to do.
*/
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
/*
* We have to kick the VCPU here, because we could be
@@ -347,12 +368,12 @@ retry:
* We must unlock the irq lock to take the ap_list_lock where
* we are going to insert this new pending interrupt.
*/
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
/* someone can do stuff here, which we re-check below */
- spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
- spin_lock(&irq->irq_lock);
+ raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+ raw_spin_lock(&irq->irq_lock);
/*
* Did something change behind our backs?
@@ -367,10 +388,11 @@ retry:
*/
if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
- spin_unlock(&irq->irq_lock);
- spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
+ flags);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
goto retry;
}
@@ -382,8 +404,8 @@ retry:
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
- spin_unlock(&irq->irq_lock);
- spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
kvm_vcpu_kick(vcpu);
@@ -430,11 +452,11 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
if (!irq)
return -EINVAL;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (!vgic_validate_injection(irq, level, owner)) {
/* Nothing to see here, move along... */
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(kvm, irq);
return 0;
}
@@ -494,9 +516,9 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
BUG_ON(!irq);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
return ret;
@@ -519,11 +541,11 @@ void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
if (!irq->hw)
goto out;
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
irq->active = false;
irq->pending_latch = false;
irq->line_level = false;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
out:
vgic_put_irq(vcpu->kvm, irq);
}
@@ -539,9 +561,9 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
BUG_ON(!irq);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
kvm_vgic_unmap_irq(irq);
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
return 0;
@@ -571,12 +593,12 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
return -EINVAL;
irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
if (irq->owner && irq->owner != owner)
ret = -EEXIST;
else
irq->owner = owner;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
return ret;
}
@@ -597,13 +619,13 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
retry:
- spin_lock(&vgic_cpu->ap_list_lock);
+ raw_spin_lock(&vgic_cpu->ap_list_lock);
list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
bool target_vcpu_needs_kick = false;
- spin_lock(&irq->irq_lock);
+ raw_spin_lock(&irq->irq_lock);
BUG_ON(vcpu != irq->vcpu);
@@ -616,7 +638,7 @@ retry:
*/
list_del(&irq->ap_list);
irq->vcpu = NULL;
- spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
/*
* This vgic_put_irq call matches the
@@ -631,14 +653,14 @@ retry:
if (target_vcpu == vcpu) {
/* We're on the right CPU */
- spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
continue;
}
/* This interrupt looks like it has to be migrated. */
- spin_unlock(&irq->irq_lock);
- spin_unlock(&vgic_cpu->ap_list_lock);
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&vgic_cpu->ap_list_lock);
/*
* Ensure locking order by always locking the smallest
@@ -652,10 +674,10 @@ retry:
vcpuB = vcpu;
}
- spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
- spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
- SINGLE_DEPTH_NESTING);
- spin_lock(&irq->irq_lock);
+ raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+ raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
+ SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&irq->irq_lock);
/*
* If the affinity has been preserved, move the
@@ -675,9 +697,9 @@ retry:
target_vcpu_needs_kick = true;
}
- spin_unlock(&irq->irq_lock);
- spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
- spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
+ raw_spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
+ raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);
if (target_vcpu_needs_kick) {
kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
@@ -687,7 +709,7 @@ retry:
goto retry;
}
- spin_unlock(&vgic_cpu->ap_list_lock);
+ raw_spin_unlock(&vgic_cpu->ap_list_lock);
}
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
@@ -702,7 +724,7 @@ static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
struct vgic_irq *irq, int lr)
{
- DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock));
+ lockdep_assert_held(&irq->irq_lock);
if (kvm_vgic_global_state.type == VGIC_V2)
vgic_v2_populate_lr(vcpu, irq, lr);
@@ -736,15 +758,15 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
*multi_sgi = false;
- DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+ lockdep_assert_held(&vgic_cpu->ap_list_lock);
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
int w;
- spin_lock(&irq->irq_lock);
+ raw_spin_lock(&irq->irq_lock);
/* GICv2 SGIs can count for more than one... */
w = vgic_irq_get_lr_count(irq);
- spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
count += w;
*multi_sgi |= (w > 1);
@@ -761,7 +783,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
bool multi_sgi;
u8 prio = 0xff;
- DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock));
+ lockdep_assert_held(&vgic_cpu->ap_list_lock);
count = compute_ap_list_depth(vcpu, &multi_sgi);
if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
@@ -770,7 +792,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
count = 0;
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- spin_lock(&irq->irq_lock);
+ raw_spin_lock(&irq->irq_lock);
/*
* If we have multi-SGIs in the pipeline, we need to
@@ -780,7 +802,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
* the AP list has been sorted already.
*/
if (multi_sgi && irq->priority > prio) {
- spin_unlock(&irq->irq_lock);
+ _raw_spin_unlock(&irq->irq_lock);
break;
}
@@ -791,7 +813,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
prio = irq->priority;
}
- spin_unlock(&irq->irq_lock);
+ raw_spin_unlock(&irq->irq_lock);
if (count == kvm_vgic_global_state.nr_lr) {
if (!list_is_last(&irq->ap_list,
@@ -866,15 +888,21 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
* either observe the new interrupt before or after doing this check,
* and introducing additional synchronization mechanism doesn't change
* this.
+ *
+ * Note that we still need to go through the whole thing if anything
+ * can be directly injected (GICv4).
*/
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
+ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
+ !vgic_supports_direct_msis(vcpu->kvm))
return;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
- vgic_flush_lr_state(vcpu);
- spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
+ raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ vgic_flush_lr_state(vcpu);
+ raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ }
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
@@ -908,6 +936,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
struct vgic_irq *irq;
bool pending = false;
unsigned long flags;
+ struct vgic_vmcr vmcr;
if (!vcpu->kvm->arch.vgic.enabled)
return false;
@@ -915,18 +944,22 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
return true;
- spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
+ vgic_get_vmcr(vcpu, &vmcr);
+
+ raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- spin_lock(&irq->irq_lock);
- pending = irq_is_pending(irq) && irq->enabled;
- spin_unlock(&irq->irq_lock);
+ raw_spin_lock(&irq->irq_lock);
+ pending = irq_is_pending(irq) && irq->enabled &&
+ !irq->active &&
+ irq->priority < vmcr.pmr;
+ raw_spin_unlock(&irq->irq_lock);
if (pending)
break;
}
- spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
+ raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags);
return pending;
}
@@ -958,11 +991,10 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
return false;
irq = vgic_get_irq(vcpu->kvm, vcpu, vintid);
- spin_lock_irqsave(&irq->irq_lock, flags);
+ raw_spin_lock_irqsave(&irq->irq_lock, flags);
map_is_active = irq->hw && irq->active;
- spin_unlock_irqrestore(&irq->irq_lock, flags);
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
vgic_put_irq(vcpu->kvm, irq);
return map_is_active;
}
-
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index a90024718ca4..abeeffabc456 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -238,6 +238,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu);
bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 23c2519c5b32..110cbe3f74f8 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -82,7 +82,7 @@ static void async_pf_execute(struct work_struct *work)
might_sleep();
/*
- * This work is run asynchromously to the task which owns
+ * This work is run asynchronously to the task which owns
* mm and might be done in another context, so we must
* access remotely.
*/
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 6855cce3e528..5294abb3f178 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -144,7 +144,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
if (zone->pio != 1 && zone->pio != 0)
return -EINVAL;
- dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+ dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev),
+ GFP_KERNEL_ACCOUNT);
if (!dev)
return -ENOMEM;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b20b751286fc..001aeda4c154 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -214,9 +214,9 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
if (flags & EPOLLHUP) {
/* The eventfd is closing, detach from KVM */
- unsigned long flags;
+ unsigned long iflags;
- spin_lock_irqsave(&kvm->irqfds.lock, flags);
+ spin_lock_irqsave(&kvm->irqfds.lock, iflags);
/*
* We must check if someone deactivated the irqfd before
@@ -230,7 +230,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
if (irqfd_is_active(irqfd))
irqfd_deactivate(irqfd);
- spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
+ spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
}
return 0;
@@ -297,7 +297,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
if (!kvm_arch_intc_initialized(kvm))
return -EAGAIN;
- irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+ irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
if (!irqfd)
return -ENOMEM;
@@ -345,7 +345,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
}
if (!irqfd->resampler) {
- resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+ resampler = kzalloc(sizeof(*resampler),
+ GFP_KERNEL_ACCOUNT);
if (!resampler) {
ret = -ENOMEM;
mutex_unlock(&kvm->irqfds.resampler_lock);
@@ -797,7 +798,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
if (!p) {
ret = -ENOMEM;
goto fail;
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index b1286c4e0712..79e59e4fa3dc 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -144,18 +144,19 @@ static int setup_routing_entry(struct kvm *kvm,
{
struct kvm_kernel_irq_routing_entry *ei;
int r;
+ u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES);
/*
* Do not allow GSI to be mapped to the same irqchip more than once.
* Allow only one to one mapping between GSI and non-irqchip routing.
*/
- hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
+ hlist_for_each_entry(ei, &rt->map[gsi], link)
if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
ue->u.irqchip.irqchip == ei->irqchip.irqchip)
return -EINVAL;
- e->gsi = ue->gsi;
+ e->gsi = gsi;
e->type = ue->type;
r = kvm_set_routing_entry(kvm, e, ue);
if (r)
@@ -196,7 +197,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
nr_rt_entries += 1;
new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!new)
return -ENOMEM;
@@ -208,7 +209,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
for (i = 0; i < nr; ++i) {
r = -ENOMEM;
- e = kzalloc(sizeof(*e), GFP_KERNEL);
+ e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT);
if (!e)
goto out;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2679e476b6c3..a704d1f9bd96 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -81,6 +81,11 @@ unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
+/* The start value to grow halt_poll_ns from */
+unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
+module_param(halt_poll_ns_grow_start, uint, 0644);
+EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
+
/* Default resets per-vcpu halt_poll_ns . */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
@@ -354,16 +359,16 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
- kvm_set_spte_hva(kvm, address, pte);
+
+ if (kvm_set_spte_hva(kvm, address, pte))
+ kvm_flush_remote_tlbs(kvm);
+
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx;
@@ -377,7 +382,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
@@ -385,7 +390,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
spin_unlock(&kvm->mmu_lock);
- ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable);
+ ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
+ range->end, range->blockable);
srcu_read_unlock(&kvm->srcu, idx);
@@ -393,9 +399,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct mmu_notifier_range *range)
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
@@ -526,7 +530,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
int i;
struct kvm_memslots *slots;
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
if (!slots)
return NULL;
@@ -602,12 +606,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
sizeof(*kvm->debugfs_stat_data),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!kvm->debugfs_stat_data)
return -ENOMEM;
for (p = debugfs_entries; p->name; p++) {
- stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+ stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
if (!stat_data)
return -ENOMEM;
@@ -657,12 +661,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
struct kvm_memslots *slots = kvm_alloc_memslots();
if (!slots)
goto out_err_no_srcu;
- /*
- * Generations must be different for each address space.
- * Init kvm generation close to the maximum to easily test the
- * code of handling generation number wrap-around.
- */
- slots->generation = i * 2 - 150;
+ /* Generations must be different for each address space. */
+ slots->generation = i;
rcu_assign_pointer(kvm->memslots[i], slots);
}
@@ -672,7 +672,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
goto out_err_no_irq_srcu;
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
- kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
+ kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
if (!kvm->buses[i])
goto out_err;
}
@@ -790,7 +790,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
- memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
+ memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
if (!memslot->dirty_bitmap)
return -ENOMEM;
@@ -875,31 +875,34 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
int as_id, struct kvm_memslots *slots)
{
struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+ u64 gen = old_memslots->generation;
- /*
- * Set the low bit in the generation, which disables SPTE caching
- * until the end of synchronize_srcu_expedited.
- */
- WARN_ON(old_memslots->generation & 1);
- slots->generation = old_memslots->generation + 1;
+ WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+ slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
rcu_assign_pointer(kvm->memslots[as_id], slots);
synchronize_srcu_expedited(&kvm->srcu);
/*
- * Increment the new memslot generation a second time. This prevents
- * vm exits that race with memslot updates from caching a memslot
- * generation that will (potentially) be valid forever.
- *
+ * Increment the new memslot generation a second time, dropping the
+ * update in-progress flag and incrementing then generation based on
+ * the number of address spaces. This provides a unique and easily
+ * identifiable generation number while the memslots are in flux.
+ */
+ gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
+
+ /*
* Generations must be unique even across address spaces. We do not need
* a global counter for that, instead the generation space is evenly split
* across address spaces. For example, with two address spaces, address
- * space 0 will use generations 0, 4, 8, ... while * address space 1 will
- * use generations 2, 6, 10, 14, ...
+ * space 0 will use generations 0, 2, 4, ... while address space 1 will
+ * use generations 1, 3, 5, ...
*/
- slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
+ gen += KVM_ADDRESS_SPACE_NUM;
- kvm_arch_memslots_updated(kvm, slots);
+ kvm_arch_memslots_updated(kvm, gen);
+
+ slots->generation = gen;
return old_memslots;
}
@@ -940,8 +943,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
/* We can read the guest memory with __xxx_user() later on. */
if ((id < KVM_USER_MEM_SLOTS) &&
((mem->userspace_addr & (PAGE_SIZE - 1)) ||
- !access_ok(VERIFY_WRITE,
- (void __user *)(unsigned long)mem->userspace_addr,
+ !access_ok((void __user *)(unsigned long)mem->userspace_addr,
mem->memory_size)))
goto out;
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
@@ -1020,7 +1022,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
goto out_free;
}
- slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
if (!slots)
goto out_free;
memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
@@ -1133,7 +1135,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
* kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
- * are dirty write protect them for next write.
+ * and reenable dirty page tracking for the corresponding pages.
* @kvm: pointer to kvm instance
* @log: slot id and address to which we copy the log
* @is_dirty: flag set if any page is dirty
@@ -1154,7 +1156,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
*
*/
int kvm_get_dirty_log_protect(struct kvm *kvm,
- struct kvm_dirty_log *log, bool *is_dirty)
+ struct kvm_dirty_log *log, bool *flush)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
@@ -1176,37 +1178,118 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
return -ENOENT;
n = kvm_dirty_bitmap_bytes(memslot);
+ *flush = false;
+ if (kvm->manual_dirty_log_protect) {
+ /*
+ * Unlike kvm_get_dirty_log, we always return false in *flush,
+ * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
+ * is some code duplication between this function and
+ * kvm_get_dirty_log, but hopefully all architecture
+ * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
+ * can be eliminated.
+ */
+ dirty_bitmap_buffer = dirty_bitmap;
+ } else {
+ dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+ memset(dirty_bitmap_buffer, 0, n);
- dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
- memset(dirty_bitmap_buffer, 0, n);
-
- spin_lock(&kvm->mmu_lock);
- *is_dirty = false;
- for (i = 0; i < n / sizeof(long); i++) {
- unsigned long mask;
- gfn_t offset;
-
- if (!dirty_bitmap[i])
- continue;
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < n / sizeof(long); i++) {
+ unsigned long mask;
+ gfn_t offset;
- *is_dirty = true;
+ if (!dirty_bitmap[i])
+ continue;
- mask = xchg(&dirty_bitmap[i], 0);
- dirty_bitmap_buffer[i] = mask;
+ *flush = true;
+ mask = xchg(&dirty_bitmap[i], 0);
+ dirty_bitmap_buffer[i] = mask;
- if (mask) {
offset = i * BITS_PER_LONG;
kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
offset, mask);
}
+ spin_unlock(&kvm->mmu_lock);
}
- spin_unlock(&kvm->mmu_lock);
if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
+
+/**
+ * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
+ * and reenable dirty page tracking for the corresponding pages.
+ * @kvm: pointer to kvm instance
+ * @log: slot id and address from which to fetch the bitmap of dirty pages
+ */
+int kvm_clear_dirty_log_protect(struct kvm *kvm,
+ struct kvm_clear_dirty_log *log, bool *flush)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int as_id, id;
+ gfn_t offset;
+ unsigned long i, n;
+ unsigned long *dirty_bitmap;
+ unsigned long *dirty_bitmap_buffer;
+
+ as_id = log->slot >> 16;
+ id = (u16)log->slot;
+ if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
+ return -EINVAL;
+
+ if (log->first_page & 63)
+ return -EINVAL;
+
+ slots = __kvm_memslots(kvm, as_id);
+ memslot = id_to_memslot(slots, id);
+
+ dirty_bitmap = memslot->dirty_bitmap;
+ if (!dirty_bitmap)
+ return -ENOENT;
+
+ n = kvm_dirty_bitmap_bytes(memslot);
+
+ if (log->first_page > memslot->npages ||
+ log->num_pages > memslot->npages - log->first_page ||
+ (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
+ return -EINVAL;
+
+ *flush = false;
+ dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
+ if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
+ return -EFAULT;
+
+ spin_lock(&kvm->mmu_lock);
+ for (offset = log->first_page,
+ i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--;
+ i++, offset += BITS_PER_LONG) {
+ unsigned long mask = *dirty_bitmap_buffer++;
+ atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
+ if (!mask)
+ continue;
+
+ mask &= atomic_long_fetch_andnot(mask, p);
+
+ /*
+ * mask contains the bits that really have been cleared. This
+ * never includes any bits beyond the length of the memslot (if
+ * the length is not aligned to 64 pages), therefore it is not
+ * a problem if userspace sets them in log->dirty_bitmap.
+ */
+ if (mask) {
+ *flush = true;
+ kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
+ offset, mask);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
#endif
bool kvm_largepages_enabled(void)
@@ -1928,32 +2011,33 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
gfn_t nr_pages_avail;
+ int r = start_gfn <= end_gfn ? 0 : -EINVAL;
ghc->gpa = gpa;
ghc->generation = slots->generation;
ghc->len = len;
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
- if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
+ ghc->hva = KVM_HVA_ERR_BAD;
+
+ /*
+ * If the requested region crosses two memslots, we still
+ * verify that the entire region is valid here.
+ */
+ while (!r && start_gfn <= end_gfn) {
+ ghc->memslot = __gfn_to_memslot(slots, start_gfn);
+ ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
+ &nr_pages_avail);
+ if (kvm_is_error_hva(ghc->hva))
+ r = -EFAULT;
+ start_gfn += nr_pages_avail;
+ }
+
+ /* Use the slow path for cross page reads and writes. */
+ if (!r && nr_pages_needed == 1)
ghc->hva += offset;
- } else {
- /*
- * If the requested region crosses two memslots, we still
- * verify that the entire region is valid here.
- */
- while (start_gfn <= end_gfn) {
- nr_pages_avail = 0;
- ghc->memslot = __gfn_to_memslot(slots, start_gfn);
- ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
- &nr_pages_avail);
- if (kvm_is_error_hva(ghc->hva))
- return -EFAULT;
- start_gfn += nr_pages_avail;
- }
- /* Use the slow path for cross page reads and writes. */
+ else
ghc->memslot = NULL;
- }
- return 0;
+
+ return r;
}
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
@@ -1965,7 +2049,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
- void *data, int offset, unsigned long len)
+ void *data, unsigned int offset,
+ unsigned long len)
{
struct kvm_memslots *slots = kvm_memslots(kvm);
int r;
@@ -2103,20 +2188,23 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
{
- unsigned int old, val, grow;
+ unsigned int old, val, grow, grow_start;
old = val = vcpu->halt_poll_ns;
+ grow_start = READ_ONCE(halt_poll_ns_grow_start);
grow = READ_ONCE(halt_poll_ns_grow);
- /* 10us base */
- if (val == 0 && grow)
- val = 10000;
- else
- val *= grow;
+ if (!grow)
+ goto out;
+
+ val *= grow;
+ if (val < grow_start)
+ val = grow_start;
if (val > halt_poll_ns)
val = halt_poll_ns;
vcpu->halt_poll_ns = val;
+out:
trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
}
@@ -2601,7 +2689,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
struct kvm_regs *kvm_regs;
r = -ENOMEM;
- kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
+ kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
if (!kvm_regs)
goto out;
r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
@@ -2629,7 +2717,8 @@ out_free1:
break;
}
case KVM_GET_SREGS: {
- kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
+ kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
+ GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!kvm_sregs)
goto out;
@@ -2721,7 +2810,7 @@ out_free1:
break;
}
case KVM_GET_FPU: {
- fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
+ fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!fpu)
goto out;
@@ -2817,6 +2906,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
{
struct kvm_device *dev = filp->private_data;
+ if (dev->kvm->mm != current->mm)
+ return -EIO;
+
switch (ioctl) {
case KVM_SET_DEVICE_ATTR:
return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
@@ -2886,19 +2978,21 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
struct kvm_device_ops *ops = NULL;
struct kvm_device *dev;
bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+ int type;
int ret;
if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
return -ENODEV;
- ops = kvm_device_ops_table[cd->type];
+ type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
+ ops = kvm_device_ops_table[type];
if (ops == NULL)
return -ENODEV;
if (test)
return 0;
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
if (!dev)
return -ENOMEM;
@@ -2906,7 +3000,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
dev->kvm = kvm;
mutex_lock(&kvm->lock);
- ret = ops->create(dev, cd->type);
+ ret = ops->create(dev, type);
if (ret < 0) {
mutex_unlock(&kvm->lock);
kfree(dev);
@@ -2918,8 +3012,10 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
if (ops->init)
ops->init(dev);
+ kvm_get_kvm(kvm);
ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
if (ret < 0) {
+ kvm_put_kvm(kvm);
mutex_lock(&kvm->lock);
list_del(&dev->vm_node);
mutex_unlock(&kvm->lock);
@@ -2927,7 +3023,6 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
return ret;
}
- kvm_get_kvm(kvm);
cd->fd = ret;
return 0;
}
@@ -2948,6 +3043,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
case KVM_CAP_CHECK_EXTENSION_VM:
+ case KVM_CAP_ENABLE_CAP_VM:
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+#endif
return 1;
#ifdef CONFIG_KVM_MMIO
case KVM_CAP_COALESCED_MMIO:
@@ -2971,6 +3070,28 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
return kvm_vm_ioctl_check_extension(kvm, arg);
}
+int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ return -EINVAL;
+}
+
+static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ switch (cap->cap) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT:
+ if (cap->flags || (cap->args[0] & ~1))
+ return -EINVAL;
+ kvm->manual_dirty_log_protect = cap->args[0];
+ return 0;
+#endif
+ default:
+ return kvm_vm_ioctl_enable_cap(kvm, cap);
+ }
+}
+
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2984,6 +3105,15 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_CREATE_VCPU:
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
break;
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
+ break;
+ }
case KVM_SET_USER_MEMORY_REGION: {
struct kvm_userspace_memory_region kvm_userspace_mem;
@@ -3004,6 +3134,17 @@ static long kvm_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
break;
}
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ case KVM_CLEAR_DIRTY_LOG: {
+ struct kvm_clear_dirty_log log;
+
+ r = -EFAULT;
+ if (copy_from_user(&log, argp, sizeof(log)))
+ goto out;
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+ break;
+ }
+#endif
#ifdef CONFIG_KVM_MMIO
case KVM_REGISTER_COALESCED_MMIO: {
struct kvm_coalesced_mmio_zone zone;
@@ -3496,6 +3637,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
r = __kvm_io_bus_write(vcpu, bus, &range, val);
return r < 0 ? r : 0;
}
+EXPORT_SYMBOL_GPL(kvm_io_bus_write);
/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
@@ -3546,7 +3688,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
return -EOPNOTSUPP;
}
-EXPORT_SYMBOL_GPL(kvm_io_bus_write);
/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
@@ -3568,7 +3709,6 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
return r < 0 ? r : 0;
}
-
/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, struct kvm_io_device *dev)
@@ -3585,8 +3725,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
return -ENOSPC;
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
- sizeof(struct kvm_io_range)), GFP_KERNEL);
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
+ GFP_KERNEL_ACCOUNT);
if (!new_bus)
return -ENOMEM;
@@ -3631,8 +3771,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
if (i == bus->dev_count)
return;
- new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
- sizeof(struct kvm_io_range)), GFP_KERNEL);
+ new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
+ GFP_KERNEL_ACCOUNT);
if (!new_bus) {
pr_err("kvm: failed to shrink bus, removing it completely\n");
goto broken;
@@ -3900,7 +4040,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
active = kvm_active_vms;
spin_unlock(&kvm_lock);
- env = kzalloc(sizeof(*env), GFP_KERNEL);
+ env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
if (!env)
return;
@@ -3915,8 +4055,8 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
}
add_uevent_var(env, "PID=%d", kvm->userspace_pid);
- if (kvm->debugfs_dentry) {
- char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
+ char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
if (p) {
tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
@@ -3955,7 +4095,7 @@ static int kvm_suspend(void)
static void kvm_resume(void)
{
if (kvm_usage_count) {
- WARN_ON(raw_spin_is_locked(&kvm_count_lock));
+ lockdep_assert_held(&kvm_count_lock);
hardware_enable_nolock(NULL);
}
}
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index d99850c462a1..524cbd20379f 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -219,7 +219,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
}
}
- kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
+ kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
if (!kvg) {
mutex_unlock(&kv->lock);
kvm_vfio_group_put_external_user(vfio_group);
@@ -405,7 +405,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
if (tmp->ops == &kvm_vfio_ops)
return -EBUSY;
- kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+ kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT);
if (!kv)
return -ENOMEM;