diff options
| author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2019-05-10 11:43:46 -0700 |
|---|---|---|
| committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2019-05-10 11:43:46 -0700 |
| commit | 2a267e7c41aa88215de2b542de797d03d16ecdfd (patch) | |
| tree | b949270835e304c8b60a40cde1b2c2e19c13b33a /virt | |
| parent | 0981949da8f7498b96c6e0ae60680865ca283bf1 (diff) | |
| parent | e93c9c99a629c61837d5a7fc2120cd2b6c70dbdd (diff) | |
| download | lwn-2a267e7c41aa88215de2b542de797d03d16ecdfd.tar.gz lwn-2a267e7c41aa88215de2b542de797d03d16ecdfd.zip | |
Merge tag 'v5.1' into next
Sync up with mainline to bring in the latest APIs.
Diffstat (limited to 'virt')
| -rw-r--r-- | virt/kvm/arm/arch_timer.c | 640 | ||||
| -rw-r--r-- | virt/kvm/arm/arm.c | 132 | ||||
| -rw-r--r-- | virt/kvm/arm/hyp/vgic-v3-sr.c | 12 | ||||
| -rw-r--r-- | virt/kvm/arm/mmio.c | 11 | ||||
| -rw-r--r-- | virt/kvm/arm/mmu.c | 490 | ||||
| -rw-r--r-- | virt/kvm/arm/psci.c | 36 | ||||
| -rw-r--r-- | virt/kvm/arm/trace.h | 125 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-debug.c | 4 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-init.c | 30 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-its.c | 53 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-mmio-v2.c | 14 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-mmio-v3.c | 15 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-mmio.c | 78 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-v2.c | 4 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic-v3.c | 16 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic.c | 174 | ||||
| -rw-r--r-- | virt/kvm/arm/vgic/vgic.h | 1 | ||||
| -rw-r--r-- | virt/kvm/async_pf.c | 2 | ||||
| -rw-r--r-- | virt/kvm/coalesced_mmio.c | 3 | ||||
| -rw-r--r-- | virt/kvm/eventfd.c | 13 | ||||
| -rw-r--r-- | virt/kvm/irqchip.c | 9 | ||||
| -rw-r--r-- | virt/kvm/kvm_main.c | 340 | ||||
| -rw-r--r-- | virt/kvm/vfio.c | 4 |
23 files changed, 1479 insertions, 727 deletions
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 17cecc96f735..7fc272ecae16 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -25,6 +25,7 @@ #include <clocksource/arm_arch_timer.h> #include <asm/arch_timer.h> +#include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <kvm/arm_vgic.h> @@ -34,7 +35,9 @@ static struct timecounter *timecounter; static unsigned int host_vtimer_irq; +static unsigned int host_ptimer_irq; static u32 host_vtimer_irq_flags; +static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); @@ -52,12 +55,34 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx); static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val); +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg); u64 kvm_phys_timer_read(void) { return timecounter->cc->read(timecounter->cc); } +static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +{ + if (has_vhe()) { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = vcpu_ptimer(vcpu); + map->emul_ptimer = NULL; + } else { + map->direct_vtimer = vcpu_vtimer(vcpu); + map->direct_ptimer = NULL; + map->emul_ptimer = vcpu_ptimer(vcpu); + } + + trace_kvm_get_timer_map(vcpu->vcpu_id, map); +} + static inline bool userspace_irqchip(struct kvm *kvm) { return static_branch_unlikely(&userspace_irqchip_in_use) && @@ -70,30 +95,35 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns) HRTIMER_MODE_ABS); } -static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) +static void soft_timer_cancel(struct hrtimer *hrt) { hrtimer_cancel(hrt); - if (work) - cancel_work_sync(work); } static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; - struct arch_timer_context *vtimer; + struct arch_timer_context *ctx; + struct timer_map map; /* * We may see a timer interrupt after vcpu_put() has been called which * sets the CPU's vcpu pointer to NULL, because even though the timer - * has been disabled in vtimer_save_state(), the hardware interrupt + * has been disabled in timer_save_state(), the hardware interrupt * signal may not have been retired from the interrupt controller yet. */ if (!vcpu) return IRQ_HANDLED; - vtimer = vcpu_vtimer(vcpu); - if (kvm_timer_should_fire(vtimer)) - kvm_timer_update_irq(vcpu, true, vtimer); + get_timer_map(vcpu, &map); + + if (irq == host_vtimer_irq) + ctx = map.direct_vtimer; + else + ctx = map.direct_ptimer; + + if (kvm_timer_should_fire(ctx)) + kvm_timer_update_irq(vcpu, true, ctx); if (userspace_irqchip(vcpu->kvm) && !static_branch_unlikely(&has_gic_active_state)) @@ -102,23 +132,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) return IRQ_HANDLED; } -/* - * Work function for handling the backup timer that we schedule when a vcpu is - * no longer running, but had a timer programmed to fire in the future. - */ -static void kvm_timer_inject_irq_work(struct work_struct *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired); - - /* - * If the vcpu is blocked we want to wake it up so that it will see - * the timer has expired when entering the guest. - */ - kvm_vcpu_wake_up(vcpu); -} - static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) { u64 cval, now; @@ -141,7 +154,9 @@ static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx) static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) { - return !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && + WARN_ON(timer_ctx && timer_ctx->loaded); + return timer_ctx && + !(timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_IT_MASK) && (timer_ctx->cnt_ctl & ARCH_TIMER_CTRL_ENABLE); } @@ -151,21 +166,22 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) */ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) { - u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + u64 min_delta = ULLONG_MAX; + int i; - if (kvm_timer_irq_can_fire(vtimer)) - min_virt = kvm_timer_compute_delta(vtimer); + for (i = 0; i < NR_KVM_TIMERS; i++) { + struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i]; - if (kvm_timer_irq_can_fire(ptimer)) - min_phys = kvm_timer_compute_delta(ptimer); + WARN(ctx->loaded, "timer %d loaded\n", i); + if (kvm_timer_irq_can_fire(ctx)) + min_delta = min(min_delta, kvm_timer_compute_delta(ctx)); + } /* If none of timers can fire, then return 0 */ - if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) + if (min_delta == ULLONG_MAX) return 0; - return min(min_virt, min_phys); + return min_delta; } static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) @@ -188,45 +204,62 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) return HRTIMER_RESTART; } - schedule_work(&timer->expired); + kvm_vcpu_wake_up(vcpu); return HRTIMER_NORESTART; } -static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) +static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt) { - struct arch_timer_context *ptimer; - struct arch_timer_cpu *timer; + struct arch_timer_context *ctx; struct kvm_vcpu *vcpu; u64 ns; - timer = container_of(hrt, struct arch_timer_cpu, phys_timer); - vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); - ptimer = vcpu_ptimer(vcpu); + ctx = container_of(hrt, struct arch_timer_context, hrtimer); + vcpu = ctx->vcpu; + + trace_kvm_timer_hrtimer_expire(ctx); /* * Check that the timer has really expired from the guest's * PoV (NTP on the host may have forced it to expire * early). If not ready, schedule for a later time. */ - ns = kvm_timer_compute_delta(ptimer); + ns = kvm_timer_compute_delta(ctx); if (unlikely(ns)) { hrtimer_forward_now(hrt, ns_to_ktime(ns)); return HRTIMER_RESTART; } - kvm_timer_update_irq(vcpu, true, ptimer); + kvm_timer_update_irq(vcpu, true, ctx); return HRTIMER_NORESTART; } static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { + enum kvm_arch_timers index; u64 cval, now; + if (!timer_ctx) + return false; + + index = arch_timer_ctx_index(timer_ctx); + if (timer_ctx->loaded) { - u32 cnt_ctl; + u32 cnt_ctl = 0; + + switch (index) { + case TIMER_VTIMER: + cnt_ctl = read_sysreg_el0(cntv_ctl); + break; + case TIMER_PTIMER: + cnt_ctl = read_sysreg_el0(cntp_ctl); + break; + case NR_KVM_TIMERS: + /* GCC is braindead */ + cnt_ctl = 0; + break; + } - /* Only the virtual timer can be loaded so far */ - cnt_ctl = read_sysreg_el0(cntv_ctl); return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); @@ -243,13 +276,13 @@ static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct timer_map map; - if (kvm_timer_should_fire(vtimer)) - return true; + get_timer_map(vcpu, &map); - return kvm_timer_should_fire(ptimer); + return kvm_timer_should_fire(map.direct_vtimer) || + kvm_timer_should_fire(map.direct_ptimer) || + kvm_timer_should_fire(map.emul_ptimer); } /* @@ -288,77 +321,70 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, } } -/* Schedule the background timer for the emulated timer. */ -static void phys_timer_emulate(struct kvm_vcpu *vcpu) +static void timer_emulate(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + bool should_fire = kvm_timer_should_fire(ctx); + + trace_kvm_timer_emulate(ctx, should_fire); + + if (should_fire) { + kvm_timer_update_irq(ctx->vcpu, true, ctx); + return; + } /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, * then we also don't need a soft timer. */ - if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { - soft_timer_cancel(&timer->phys_timer, NULL); + if (!kvm_timer_irq_can_fire(ctx)) { + soft_timer_cancel(&ctx->hrtimer); return; } - soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); + soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx)); } -/* - * Check if there was a change in the timer state, so that we should either - * raise or lower the line level to the GIC or schedule a background timer to - * emulate the physical timer. - */ -static void kvm_timer_update_state(struct kvm_vcpu *vcpu) +static void timer_save_state(struct arch_timer_context *ctx) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - bool level; + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); + unsigned long flags; - if (unlikely(!timer->enabled)) + if (!timer->enabled) return; - /* - * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part - * of its lifecycle is offloaded to the hardware, and we therefore may - * not have lowered the irq.level value before having to signal a new - * interrupt, but have to signal an interrupt every time the level is - * asserted. - */ - level = kvm_timer_should_fire(vtimer); - kvm_timer_update_irq(vcpu, level, vtimer); + local_irq_save(flags); - phys_timer_emulate(vcpu); + if (!ctx->loaded) + goto out; - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); -} + switch (index) { + case TIMER_VTIMER: + ctx->cnt_ctl = read_sysreg_el0(cntv_ctl); + ctx->cnt_cval = read_sysreg_el0(cntv_cval); -static void vtimer_save_state(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - unsigned long flags; + /* Disable the timer */ + write_sysreg_el0(0, cntv_ctl); + isb(); - local_irq_save(flags); + break; + case TIMER_PTIMER: + ctx->cnt_ctl = read_sysreg_el0(cntp_ctl); + ctx->cnt_cval = read_sysreg_el0(cntp_cval); - if (!vtimer->loaded) - goto out; + /* Disable the timer */ + write_sysreg_el0(0, cntp_ctl); + isb(); - if (timer->enabled) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); + break; + case NR_KVM_TIMERS: + BUG(); } - /* Disable the virtual timer */ - write_sysreg_el0(0, cntv_ctl); - isb(); + trace_kvm_timer_save_state(ctx); - vtimer->loaded = false; + ctx->loaded = false; out: local_irq_restore(flags); } @@ -368,67 +394,72 @@ out: * thread is removed from its waitqueue and made runnable when there's a timer * interrupt to handle. */ -void kvm_timer_schedule(struct kvm_vcpu *vcpu) +static void kvm_timer_blocking(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; - vtimer_save_state(vcpu); + get_timer_map(vcpu, &map); /* - * No need to schedule a background timer if any guest timer has - * already expired, because kvm_vcpu_block will return before putting - * the thread to sleep. - */ - if (kvm_timer_should_fire(vtimer) || kvm_timer_should_fire(ptimer)) - return; - - /* - * If both timers are not capable of raising interrupts (disabled or + * If no timers are capable of raising interrupts (disabled or * masked), then there's no more work for us to do. */ - if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) + if (!kvm_timer_irq_can_fire(map.direct_vtimer) && + !kvm_timer_irq_can_fire(map.direct_ptimer) && + !kvm_timer_irq_can_fire(map.emul_ptimer)) return; /* - * The guest timers have not yet expired, schedule a background timer. + * At least one guest time will expire. Schedule a background timer. * Set the earliest expiration time among the guest timers. */ soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); } -static void vtimer_restore_state(struct kvm_vcpu *vcpu) +static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + + soft_timer_cancel(&timer->bg_timer); +} + +static void timer_restore_state(struct arch_timer_context *ctx) +{ + struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu); + enum kvm_arch_timers index = arch_timer_ctx_index(ctx); unsigned long flags; + if (!timer->enabled) + return; + local_irq_save(flags); - if (vtimer->loaded) + if (ctx->loaded) goto out; - if (timer->enabled) { - write_sysreg_el0(vtimer->cnt_cval, cntv_cval); + switch (index) { + case TIMER_VTIMER: + write_sysreg_el0(ctx->cnt_cval, cntv_cval); + isb(); + write_sysreg_el0(ctx->cnt_ctl, cntv_ctl); + break; + case TIMER_PTIMER: + write_sysreg_el0(ctx->cnt_cval, cntp_cval); isb(); - write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); + write_sysreg_el0(ctx->cnt_ctl, cntp_ctl); + break; + case NR_KVM_TIMERS: + BUG(); } - vtimer->loaded = true; + trace_kvm_timer_restore_state(ctx); + + ctx->loaded = true; out: local_irq_restore(flags); } -void kvm_timer_unschedule(struct kvm_vcpu *vcpu) -{ - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - vtimer_restore_state(vcpu); - - soft_timer_cancel(&timer->bg_timer, &timer->expired); -} - static void set_cntvoff(u64 cntvoff) { u32 low = lower_32_bits(cntvoff); @@ -444,23 +475,32 @@ static void set_cntvoff(u64 cntvoff) kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); } -static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) +static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active) { int r; - r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); + r = irq_set_irqchip_state(ctx->host_timer_irq, IRQCHIP_STATE_ACTIVE, active); WARN_ON(r); } -static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) +static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - bool phys_active; + struct kvm_vcpu *vcpu = ctx->vcpu; + bool phys_active = false; + + /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx); if (irqchip_in_kernel(vcpu->kvm)) - phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); - else - phys_active = vtimer->irq.level; - set_vtimer_irq_phys_active(vcpu, phys_active); + phys_active = kvm_vgic_map_is_active(vcpu, ctx->irq.irq); + + phys_active |= ctx->irq.level; + + set_timer_irq_phys_active(ctx, phys_active); } static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) @@ -468,6 +508,14 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); /* + * Update the timer output so that it is likely to match the + * state we're about to restore. If the timer expires between + * this point and the register restoration, we'll take the + * interrupt anyway. + */ + kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vtimer), vtimer); + + /* * When using a userspace irqchip with the architected timers and a * host interrupt controller that doesn't support an active state, we * must still prevent continuously exiting from the guest, and @@ -485,28 +533,32 @@ static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; if (unlikely(!timer->enabled)) return; - if (static_branch_likely(&has_gic_active_state)) - kvm_timer_vcpu_load_gic(vcpu); - else + get_timer_map(vcpu, &map); + + if (static_branch_likely(&has_gic_active_state)) { + kvm_timer_vcpu_load_gic(map.direct_vtimer); + if (map.direct_ptimer) + kvm_timer_vcpu_load_gic(map.direct_ptimer); + } else { kvm_timer_vcpu_load_nogic(vcpu); + } - set_cntvoff(vtimer->cntvoff); + set_cntvoff(map.direct_vtimer->cntvoff); - vtimer_restore_state(vcpu); + kvm_timer_unblocking(vcpu); - /* Set the background timer for the physical timer emulation. */ - phys_timer_emulate(vcpu); + timer_restore_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_restore_state(map.direct_ptimer); - /* If the timer fired while we weren't running, inject it now */ - if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) - kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + if (map.emul_ptimer) + timer_emulate(map.emul_ptimer); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) @@ -528,15 +580,20 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; if (unlikely(!timer->enabled)) return; - vtimer_save_state(vcpu); + get_timer_map(vcpu, &map); + + timer_save_state(map.direct_vtimer); + if (map.direct_ptimer) + timer_save_state(map.direct_ptimer); /* - * Cancel the physical timer emulation, because the only case where we + * Cancel soft timer emulation, because the only case where we * need it after a vcpu_put is in the context of a sleeping VCPU, and * in that case we already factor in the deadline for the physical * timer when scheduling the bg_timer. @@ -544,7 +601,11 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * In any case, we re-schedule the hrtimer for the physical timer when * coming back to the VCPU thread in kvm_timer_vcpu_load(). */ - soft_timer_cancel(&timer->phys_timer, NULL); + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); + + if (swait_active(kvm_arch_vcpu_wq(vcpu))) + kvm_timer_blocking(vcpu); /* * The kernel may decide to run userspace after calling vcpu_put, so @@ -553,8 +614,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) * counter of non-VHE case. For VHE, the virtual counter uses a fixed * virtual offset of zero, so no need to zero CNTVOFF_EL2 register. */ - if (!has_vhe()) - set_cntvoff(0); + set_cntvoff(0); } /* @@ -569,7 +629,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) if (!kvm_timer_should_fire(vtimer)) { kvm_timer_update_irq(vcpu, false, vtimer); if (static_branch_likely(&has_gic_active_state)) - set_vtimer_irq_phys_active(vcpu, false); + set_timer_irq_phys_active(vtimer, false); else enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } @@ -577,7 +637,7 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); if (unlikely(!timer->enabled)) return; @@ -588,9 +648,10 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; + + get_timer_map(vcpu, &map); /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -598,12 +659,22 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) * resets the timer to be disabled and unmasked and is compliant with * the ARMv7 architecture. */ - vtimer->cnt_ctl = 0; - ptimer->cnt_ctl = 0; - kvm_timer_update_state(vcpu); + vcpu_vtimer(vcpu)->cnt_ctl = 0; + vcpu_ptimer(vcpu)->cnt_ctl = 0; - if (timer->enabled && irqchip_in_kernel(vcpu->kvm)) - kvm_vgic_reset_mapped_irq(vcpu, vtimer->irq.irq); + if (timer->enabled) { + kvm_timer_update_irq(vcpu, false, vcpu_vtimer(vcpu)); + kvm_timer_update_irq(vcpu, false, vcpu_ptimer(vcpu)); + + if (irqchip_in_kernel(vcpu->kvm)) { + kvm_vgic_reset_mapped_irq(vcpu, map.direct_vtimer->irq.irq); + if (map.direct_ptimer) + kvm_vgic_reset_mapped_irq(vcpu, map.direct_ptimer->irq.irq); + } + } + + if (map.emul_ptimer) + soft_timer_cancel(&map.emul_ptimer->hrtimer); return 0; } @@ -629,57 +700,71 @@ static void update_vtimer_cntvoff(struct kvm_vcpu *vcpu, u64 cntvoff) void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_cpu *timer = vcpu_timer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); /* Synchronize cntvoff across all vtimers of a VM. */ update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); - vcpu_ptimer(vcpu)->cntvoff = 0; + ptimer->cntvoff = 0; - INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->bg_timer.function = kvm_bg_timer_expire; - hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->phys_timer.function = kvm_phys_timer_expire; + hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + vtimer->hrtimer.function = kvm_hrtimer_expire; + ptimer->hrtimer.function = kvm_hrtimer_expire; vtimer->irq.irq = default_vtimer_irq.irq; ptimer->irq.irq = default_ptimer_irq.irq; + + vtimer->host_timer_irq = host_vtimer_irq; + ptimer->host_timer_irq = host_ptimer_irq; + + vtimer->host_timer_irq_flags = host_vtimer_irq_flags; + ptimer->host_timer_irq_flags = host_ptimer_irq_flags; + + vtimer->vcpu = vcpu; + ptimer->vcpu = vcpu; } static void kvm_timer_init_interrupt(void *info) { enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct arch_timer_context *timer; switch (regid) { case KVM_REG_ARM_TIMER_CTL: - vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_TIMER_CNT: + timer = vcpu_vtimer(vcpu); update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); break; case KVM_REG_ARM_TIMER_CVAL: - vtimer->cnt_cval = value; + timer = vcpu_vtimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; case KVM_REG_ARM_PTIMER_CTL: - ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CTL, value); break; case KVM_REG_ARM_PTIMER_CVAL: - ptimer->cnt_cval = value; + timer = vcpu_ptimer(vcpu); + kvm_arm_timer_write(vcpu, timer, TIMER_REG_CVAL, value); break; default: return -1; } - kvm_timer_update_state(vcpu); return 0; } @@ -699,26 +784,113 @@ static u64 read_timer_ctl(struct arch_timer_context *timer) u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) { - struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - switch (regid) { case KVM_REG_ARM_TIMER_CTL: - return read_timer_ctl(vtimer); + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_TIMER_CNT: - return kvm_phys_timer_read() - vtimer->cntvoff; + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); case KVM_REG_ARM_TIMER_CVAL: - return vtimer->cnt_cval; + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CVAL); case KVM_REG_ARM_PTIMER_CTL: - return read_timer_ctl(ptimer); - case KVM_REG_ARM_PTIMER_CVAL: - return ptimer->cnt_cval; + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CTL); case KVM_REG_ARM_PTIMER_CNT: - return kvm_phys_timer_read(); + return kvm_arm_timer_read(vcpu, + vcpu_vtimer(vcpu), TIMER_REG_CNT); + case KVM_REG_ARM_PTIMER_CVAL: + return kvm_arm_timer_read(vcpu, + vcpu_ptimer(vcpu), TIMER_REG_CVAL); } return (u64)-1; } +static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + switch (treg) { + case TIMER_REG_TVAL: + val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff; + break; + + case TIMER_REG_CTL: + val = read_timer_ctl(timer); + break; + + case TIMER_REG_CVAL: + val = timer->cnt_cval; + break; + + case TIMER_REG_CNT: + val = kvm_phys_timer_read() - timer->cntvoff; + break; + + default: + BUG(); + } + + return val; +} + +u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg) +{ + u64 val; + + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); + + return val; +} + +static void kvm_arm_timer_write(struct kvm_vcpu *vcpu, + struct arch_timer_context *timer, + enum kvm_arch_timer_regs treg, + u64 val) +{ + switch (treg) { + case TIMER_REG_TVAL: + timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val; + break; + + case TIMER_REG_CTL: + timer->cnt_ctl = val & ~ARCH_TIMER_CTRL_IT_STAT; + break; + + case TIMER_REG_CVAL: + timer->cnt_cval = val; + break; + + default: + BUG(); + } +} + +void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu, + enum kvm_arch_timers tmr, + enum kvm_arch_timer_regs treg, + u64 val) +{ + preempt_disable(); + kvm_timer_vcpu_put(vcpu); + + kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val); + + kvm_timer_vcpu_load(vcpu); + preempt_enable(); +} + static int kvm_timer_starting_cpu(unsigned int cpu) { kvm_timer_init_interrupt(NULL); @@ -744,6 +916,8 @@ int kvm_timer_hyp_init(bool has_gic) return -ENODEV; } + /* First, do the virtual EL1 timer irq */ + if (info->virtual_irq <= 0) { kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n", info->virtual_irq); @@ -754,15 +928,15 @@ int kvm_timer_hyp_init(bool has_gic) host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { - kvm_err("Invalid trigger for IRQ%d, assuming level low\n", + kvm_err("Invalid trigger for vtimer IRQ%d, assuming level low\n", host_vtimer_irq); host_vtimer_irq_flags = IRQF_TRIGGER_LOW; } err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, - "kvm guest timer", kvm_get_running_vcpus()); + "kvm guest vtimer", kvm_get_running_vcpus()); if (err) { - kvm_err("kvm_arch_timer: can't request interrupt %d (%d)\n", + kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n", host_vtimer_irq, err); return err; } @@ -780,6 +954,43 @@ int kvm_timer_hyp_init(bool has_gic) kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq); + /* Now let's do the physical EL1 timer irq */ + + if (info->physical_irq > 0) { + host_ptimer_irq = info->physical_irq; + host_ptimer_irq_flags = irq_get_trigger_type(host_ptimer_irq); + if (host_ptimer_irq_flags != IRQF_TRIGGER_HIGH && + host_ptimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for ptimer IRQ%d, assuming level low\n", + host_ptimer_irq); + host_ptimer_irq_flags = IRQF_TRIGGER_LOW; + } + + err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler, + "kvm guest ptimer", kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", + host_ptimer_irq, err); + return err; + } + + if (has_gic) { + err = irq_set_vcpu_affinity(host_ptimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + } + + kvm_debug("physical timer IRQ%d\n", host_ptimer_irq); + } else if (has_vhe()) { + kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", + info->physical_irq); + err = -ENODEV; + goto out_free_irq; + } + cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, "kvm/arm/timer:starting", kvm_timer_starting_cpu, kvm_timer_dying_cpu); @@ -791,12 +1002,9 @@ out_free_irq: void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(vcpu); - soft_timer_cancel(&timer->bg_timer, &timer->expired); - soft_timer_cancel(&timer->phys_timer, NULL); - kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); + soft_timer_cancel(&timer->bg_timer); } static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) @@ -830,16 +1038,18 @@ bool kvm_arch_timer_get_input_level(int vintid) if (vintid == vcpu_vtimer(vcpu)->irq.irq) timer = vcpu_vtimer(vcpu); + else if (vintid == vcpu_ptimer(vcpu)->irq.irq) + timer = vcpu_ptimer(vcpu); else - BUG(); /* We only map the vtimer so far */ + BUG(); return kvm_timer_should_fire(timer); } int kvm_timer_enable(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_cpu *timer = vcpu_timer(vcpu); + struct timer_map map; int ret; if (timer->enabled) @@ -857,19 +1067,33 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, + get_timer_map(vcpu, &map); + + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_vtimer->host_timer_irq, + map.direct_vtimer->irq.irq, kvm_arch_timer_get_input_level); if (ret) return ret; + if (map.direct_ptimer) { + ret = kvm_vgic_map_phys_irq(vcpu, + map.direct_ptimer->host_timer_irq, + map.direct_ptimer->irq.irq, + kvm_arch_timer_get_input_level); + } + + if (ret) + return ret; + no_vgic: timer->enabled = 1; return 0; } /* - * On VHE system, we only need to configure trap on physical timer and counter - * accesses in EL0 and EL1 once, not for every world switch. + * On VHE system, we only need to configure the EL2 timer trap register once, + * not for every world switch. * The host kernel runs at EL2 with HCR_EL2.TGE == 1, * and this makes those bits have no effect for the host kernel execution. */ @@ -880,11 +1104,11 @@ void kvm_timer_init_vhe(void) u64 val; /* - * Disallow physical timer access for the guest. - * Physical counter access is allowed. + * VHE systems allow the guest direct access to the EL1 physical + * timer/counter. */ val = read_sysreg(cnthctl_el2); - val &= ~(CNTHCTL_EL1PCEN << cnthctl_shift); + val |= (CNTHCTL_EL1PCEN << cnthctl_shift); val |= (CNTHCTL_EL1PCTEN << cnthctl_shift); write_sysreg(val, cnthctl_el2); } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 23774970c9df..f412ebc90610 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -65,8 +65,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); /* The VMID used in the VTTBR */ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1); static u32 kvm_next_vmid; -static unsigned int kvm_vmid_bits __read_mostly; -static DEFINE_RWLOCK(kvm_vmid_lock); +static DEFINE_SPINLOCK(kvm_vmid_lock); static bool vgic_present; @@ -142,7 +141,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_vgic_early_init(kvm); /* Mark the initial VMID generation invalid */ - kvm->arch.vmid_gen = 0; + kvm->arch.vmid.vmid_gen = 0; /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->arch.max_vcpus = vgic_present ? @@ -336,13 +335,11 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - kvm_timer_schedule(vcpu); kvm_vgic_v4_enable_doorbell(vcpu); } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - kvm_timer_unschedule(vcpu); kvm_vgic_v4_disable_doorbell(vcpu); } @@ -472,51 +469,42 @@ void force_vm_exit(const cpumask_t *mask) /** * need_new_vmid_gen - check that the VMID is still valid - * @kvm: The VM's VMID to check + * @vmid: The VMID to check * * return true if there is a new generation of VMIDs being used * - * The hardware supports only 256 values with the value zero reserved for the - * host, so we check if an assigned value belongs to a previous generation, - * which which requires us to assign a new value. If we're the first to use a - * VMID for the new generation, we must flush necessary caches and TLBs on all - * CPUs. + * The hardware supports a limited set of values with the value zero reserved + * for the host, so we check if an assigned value belongs to a previous + * generation, which which requires us to assign a new value. If we're the + * first to use a VMID for the new generation, we must flush necessary caches + * and TLBs on all CPUs. */ -static bool need_new_vmid_gen(struct kvm *kvm) +static bool need_new_vmid_gen(struct kvm_vmid *vmid) { - return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen)); + u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen); + smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */ + return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen); } /** - * update_vttbr - Update the VTTBR with a valid VMID before the guest runs - * @kvm The guest that we are about to run - * - * Called from kvm_arch_vcpu_ioctl_run before entering the guest to ensure the - * VM has a valid VMID, otherwise assigns a new one and flushes corresponding - * caches and TLBs. + * update_vmid - Update the vmid with a valid VMID for the current generation + * @kvm: The guest that struct vmid belongs to + * @vmid: The stage-2 VMID information struct */ -static void update_vttbr(struct kvm *kvm) +static void update_vmid(struct kvm_vmid *vmid) { - phys_addr_t pgd_phys; - u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0; - bool new_gen; - - read_lock(&kvm_vmid_lock); - new_gen = need_new_vmid_gen(kvm); - read_unlock(&kvm_vmid_lock); - - if (!new_gen) + if (!need_new_vmid_gen(vmid)) return; - write_lock(&kvm_vmid_lock); + spin_lock(&kvm_vmid_lock); /* * We need to re-check the vmid_gen here to ensure that if another vcpu * already allocated a valid vmid for this vm, then this vcpu should * use the same vmid. */ - if (!need_new_vmid_gen(kvm)) { - write_unlock(&kvm_vmid_lock); + if (!need_new_vmid_gen(vmid)) { + spin_unlock(&kvm_vmid_lock); return; } @@ -539,18 +527,14 @@ static void update_vttbr(struct kvm *kvm) kvm_call_hyp(__kvm_flush_vm_context); } - kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen); - kvm->arch.vmid = kvm_next_vmid; + vmid->vmid = kvm_next_vmid; kvm_next_vmid++; - kvm_next_vmid &= (1 << kvm_vmid_bits) - 1; + kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; - /* update vttbr to be used with the new vmid */ - pgd_phys = virt_to_phys(kvm->arch.pgd); - BUG_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm)); - vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); - kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp; + smp_wmb(); + WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen)); - write_unlock(&kvm_vmid_lock); + spin_unlock(&kvm_vmid_lock); } static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) @@ -627,6 +611,13 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu) /* Awaken to handle a signal, request we sleep again later. */ kvm_make_request(KVM_REQ_SLEEP, vcpu); } + + /* + * Make sure we will observe a potential reset request if we've + * observed a change to the power state. Pairs with the smp_wmb() in + * kvm_psci_vcpu_on(). + */ + smp_rmb(); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) @@ -640,6 +631,9 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) vcpu_req_sleep(vcpu); + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + /* * Clear IRQ_PENDING requests that were made to guarantee * that a VCPU sees new virtual interrupts. @@ -674,8 +668,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) return ret; - if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) - return 0; } if (run->immediate_exit) @@ -693,7 +685,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ cond_resched(); - update_vttbr(vcpu->kvm); + update_vmid(&vcpu->kvm->arch.vmid); check_vcpu_requests(vcpu); @@ -742,7 +734,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || + if (ret <= 0 || need_new_vmid_gen(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ @@ -768,7 +760,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_vcpu_run_vhe(vcpu); kvm_arm_vhe_guest_exit(); } else { - ret = kvm_call_hyp(__kvm_vcpu_run_nvhe, vcpu); + ret = kvm_call_hyp_ret(__kvm_vcpu_run_nvhe, vcpu); } vcpu->mode = OUTSIDE_GUEST_MODE; @@ -942,7 +934,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { - unsigned int i; + unsigned int i, ret; int phys_target = kvm_target_cpu(); if (init->target != phys_target) @@ -977,9 +969,14 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, vcpu->arch.target = phys_target; /* Now we know what it is, we can reset it. */ - return kvm_reset_vcpu(vcpu); -} + ret = kvm_reset_vcpu(vcpu); + if (ret) { + vcpu->arch.target = -1; + bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + } + return ret; +} static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) @@ -1205,14 +1202,30 @@ long kvm_arch_vcpu_ioctl(struct file *filp, */ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - bool is_dirty = false; + bool flush = false; + int r; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log_protect(kvm, log, &flush); + + if (flush) + kvm_flush_remote_tlbs(kvm); + + mutex_unlock(&kvm->slots_lock); + return r; +} + +int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +{ + bool flush = false; int r; mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); + r = kvm_clear_dirty_log_protect(kvm, log, &flush); - if (is_dirty) + if (flush) kvm_flush_remote_tlbs(kvm); mutex_unlock(&kvm->slots_lock); @@ -1404,10 +1417,6 @@ static inline void hyp_cpu_pm_exit(void) static int init_common_resources(void) { - /* set size of VMID supported by CPU */ - kvm_vmid_bits = kvm_get_vmid_bits(); - kvm_info("%d-bit VMID\n", kvm_vmid_bits); - kvm_set_ipa_limit(); return 0; @@ -1548,6 +1557,7 @@ static int init_hyp_mode(void) kvm_cpu_context_t *cpu_ctxt; cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); + kvm_init_host_cpu_context(cpu_ctxt, cpu); err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { @@ -1558,7 +1568,7 @@ static int init_hyp_mode(void) err = hyp_map_aux_data(); if (err) - kvm_err("Cannot map host auxilary data: %d\n", err); + kvm_err("Cannot map host auxiliary data: %d\n", err); return 0; @@ -1640,8 +1650,10 @@ int kvm_arch_init(void *opaque) return -ENODEV; } - if (!kvm_arch_check_sve_has_vhe()) { - kvm_pr_unimpl("SVE system without VHE unsupported. Broken cpu?"); + in_hyp_mode = is_kernel_in_hyp_mode(); + + if (!in_hyp_mode && kvm_arch_requires_vhe()) { + kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); return -ENODEV; } @@ -1657,8 +1669,6 @@ int kvm_arch_init(void *opaque) if (err) return err; - in_hyp_mode = is_kernel_in_hyp_mode(); - if (!in_hyp_mode) { err = init_hyp_mode(); if (err) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 616e5a433ab0..370bd6c5e6cb 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -222,11 +222,11 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) } } - if (used_lrs) { + if (used_lrs || cpu_if->its_vpe.its_vm) { int i; u32 elrsr; - elrsr = read_gicreg(ICH_ELSR_EL2); + elrsr = read_gicreg(ICH_ELRSR_EL2); write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EN, ICH_HCR_EL2); @@ -247,7 +247,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; int i; - if (used_lrs) { + if (used_lrs || cpu_if->its_vpe.its_vm) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); for (i = 0; i < used_lrs; i++) @@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) esr = kvm_vcpu_get_hsr(vcpu); if (vcpu_mode_is_32bit(vcpu)) { - if (!kvm_condition_valid(vcpu)) + if (!kvm_condition_valid(vcpu)) { + __kvm_skip_instr(vcpu); return 1; + } sysreg = esr_cp15_to_sysreg(esr); } else { @@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu) rt = kvm_vcpu_sys_get_rt(vcpu); fn(vcpu, vmcr, rt); + __kvm_skip_instr(vcpu); + return 1; } diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index dac7ceb1a677..08443a15e6be 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } + /* + * The MMIO instruction is emulated and should not be re-executed + * in the guest. + */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 0; } @@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len) vcpu->arch.mmio_decode.sign_extend = sign_extend; vcpu->arch.mmio_decode.rt = rt; - /* - * The MMIO instruction is emulated and should not be re-executed - * in the guest. - */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 0; } diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 5eca48bdb1a6..a39dcfdbcc65 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -27,10 +27,10 @@ #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> #include <asm/kvm_mmio.h> +#include <asm/kvm_ras.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/virt.h> -#include <asm/system_misc.h> #include "trace.h" @@ -102,8 +102,7 @@ static bool kvm_is_device_pfn(unsigned long pfn) * @addr: IPA * @pmd: pmd pointer for IPA * - * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. Marks all - * pages in the range dirty. + * Function clears a PMD entry, flushes addr 1st and 2nd stage TLBs. */ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) { @@ -115,6 +114,24 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd) put_page(virt_to_page(pmd)); } +/** + * stage2_dissolve_pud() - clear and flush huge PUD entry + * @kvm: pointer to kvm structure. + * @addr: IPA + * @pud: pud pointer for IPA + * + * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. + */ +static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) +{ + if (!stage2_pud_huge(kvm, *pudp)) + return; + + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + put_page(virt_to_page(pudp)); +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, int min, int max) { @@ -607,7 +624,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start, addr = start; do { pte = pte_offset_kernel(pmd, addr); - kvm_set_pte(pte, pfn_pte(pfn, prot)); + kvm_set_pte(pte, kvm_pfn_pte(pfn, prot)); get_page(virt_to_page(pte)); pfn++; } while (addr += PAGE_SIZE, addr != end); @@ -628,7 +645,7 @@ static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start, BUG_ON(pmd_sect(*pmd)); if (pmd_none(*pmd)) { - pte = pte_alloc_one_kernel(NULL, addr); + pte = pte_alloc_one_kernel(NULL); if (!pte) { kvm_err("Cannot allocate Hyp pte\n"); return -ENOMEM; @@ -880,15 +897,15 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation. * @kvm: The KVM struct pointer for the VM. * - * Allocates only the stage-2 HW PGD level table(s) (can support either full - * 40-bit input addresses or limited to 32-bit input addresses). Clears the - * allocated pages. + * Allocates only the stage-2 HW PGD level table(s) of size defined by + * stage2_pgd_size(kvm). * * Note we don't need locking here as this is only called when the VM is * created, which can only be done once. */ int kvm_alloc_stage2_pgd(struct kvm *kvm) { + phys_addr_t pgd_phys; pgd_t *pgd; if (kvm->arch.pgd != NULL) { @@ -901,7 +918,12 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm) if (!pgd) return -ENOMEM; + pgd_phys = virt_to_phys(pgd); + if (WARN_ON(pgd_phys & ~kvm_vttbr_baddr_mask(kvm))) + return -EINVAL; + kvm->arch.pgd = pgd; + kvm->arch.pgd_phys = pgd_phys; return 0; } @@ -989,6 +1011,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) unmap_stage2_range(kvm, 0, kvm_phys_size(kvm)); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; + kvm->arch.pgd_phys = 0; } spin_unlock(&kvm->mmu_lock); @@ -1022,7 +1045,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); - if (!pud) + if (!pud || stage2_pud_huge(kvm, *pud)) return NULL; if (stage2_pud_none(kvm, *pud)) { @@ -1041,25 +1064,43 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache { pmd_t *pmd, old_pmd; +retry: pmd = stage2_get_pmd(kvm, cache, addr); VM_BUG_ON(!pmd); old_pmd = *pmd; + /* + * Multiple vcpus faulting on the same PMD entry, can + * lead to them sequentially updating the PMD with the + * same value. Following the break-before-make + * (pmd_clear() followed by tlb_flush()) process can + * hinder forward progress due to refaults generated + * on missing translations. + * + * Skip updating the page table if the entry is + * unchanged. + */ + if (pmd_val(old_pmd) == pmd_val(*new_pmd)) + return 0; + if (pmd_present(old_pmd)) { /* - * Multiple vcpus faulting on the same PMD entry, can - * lead to them sequentially updating the PMD with the - * same value. Following the break-before-make - * (pmd_clear() followed by tlb_flush()) process can - * hinder forward progress due to refaults generated - * on missing translations. + * If we already have PTE level mapping for this block, + * we must unmap it to avoid inconsistent TLB state and + * leaking the table page. We could end up in this situation + * if the memory slot was marked for dirty logging and was + * reverted, leaving PTE level mappings for the pages accessed + * during the period. So, unmap the PTE level mapping for this + * block and retry, as we could have released the upper level + * table in the process. * - * Skip updating the page table if the entry is - * unchanged. + * Normal THP split/merge follows mmu_notifier callbacks and do + * get handled accordingly. */ - if (pmd_val(old_pmd) == pmd_val(*new_pmd)) - return 0; - + if (!pmd_thp_or_huge(old_pmd)) { + unmap_stage2_range(kvm, addr & S2_PMD_MASK, S2_PMD_SIZE); + goto retry; + } /* * Mapping in huge pages should only happen through a * fault. If a page is merged into a transparent huge @@ -1071,8 +1112,7 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache * should become splitting first, unmapped, merged, * and mapped back in on-demand. */ - VM_BUG_ON(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); - + WARN_ON_ONCE(pmd_pfn(old_pmd) != pmd_pfn(*new_pmd)); pmd_clear(pmd); kvm_tlb_flush_vmid_ipa(kvm, addr); } else { @@ -1083,29 +1123,113 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } -static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, + phys_addr_t addr, const pud_t *new_pudp) +{ + pud_t *pudp, old_pud; + +retry: + pudp = stage2_get_pud(kvm, cache, addr); + VM_BUG_ON(!pudp); + + old_pud = *pudp; + + /* + * A large number of vcpus faulting on the same stage 2 entry, + * can lead to a refault due to the stage2_pud_clear()/tlb_flush(). + * Skip updating the page tables if there is no change. + */ + if (pud_val(old_pud) == pud_val(*new_pudp)) + return 0; + + if (stage2_pud_present(kvm, old_pud)) { + /* + * If we already have table level mapping for this block, unmap + * the range for this block and retry. + */ + if (!stage2_pud_huge(kvm, old_pud)) { + unmap_stage2_range(kvm, addr & S2_PUD_MASK, S2_PUD_SIZE); + goto retry; + } + + WARN_ON_ONCE(kvm_pud_pfn(old_pud) != kvm_pud_pfn(*new_pudp)); + stage2_pud_clear(kvm, pudp); + kvm_tlb_flush_vmid_ipa(kvm, addr); + } else { + get_page(virt_to_page(pudp)); + } + + kvm_set_pud(pudp, *new_pudp); + return 0; +} + +/* + * stage2_get_leaf_entry - walk the stage2 VM page tables and return + * true if a valid and present leaf-entry is found. A pointer to the + * leaf-entry is returned in the appropriate level variable - pudpp, + * pmdpp, ptepp. + */ +static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr, + pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp) { + pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - pmdp = stage2_get_pmd(kvm, NULL, addr); + *pudpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pudp = stage2_get_pud(kvm, NULL, addr); + if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp)) + return false; + + if (stage2_pud_huge(kvm, *pudp)) { + *pudpp = pudp; + return true; + } + + pmdp = stage2_pmd_offset(kvm, pudp, addr); if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) return false; - if (pmd_thp_or_huge(*pmdp)) - return kvm_s2pmd_exec(pmdp); + if (pmd_thp_or_huge(*pmdp)) { + *pmdpp = pmdp; + return true; + } ptep = pte_offset_kernel(pmdp, addr); if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) return false; - return kvm_s2pte_exec(ptep); + *ptepp = ptep; + return true; +} + +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + bool found; + + found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep); + if (!found) + return false; + + if (pudp) + return kvm_s2pud_exec(pudp); + else if (pmdp) + return kvm_s2pmd_exec(pmdp); + else + return kvm_s2pte_exec(ptep); } static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr, const pte_t *new_pte, unsigned long flags) { + pud_t *pud; pmd_t *pmd; pte_t *pte, old_pte; bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP; @@ -1114,7 +1238,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, VM_BUG_ON(logging_active && !cache); /* Create stage-2 page table mapping - Levels 0 and 1 */ - pmd = stage2_get_pmd(kvm, cache, addr); + pud = stage2_get_pud(kvm, cache, addr); + if (!pud) { + /* + * Ignore calls from kvm_set_spte_hva for unallocated + * address ranges. + */ + return 0; + } + + /* + * While dirty page logging - dissolve huge PUD, then continue + * on to allocate page. + */ + if (logging_active) + stage2_dissolve_pud(kvm, addr, pud); + + if (stage2_pud_none(kvm, *pud)) { + if (!cache) + return 0; /* ignore calls from kvm_set_spte_hva */ + pmd = mmu_memory_cache_alloc(cache); + stage2_pud_populate(kvm, pud, pmd); + get_page(virt_to_page(pud)); + } + + pmd = stage2_pmd_offset(kvm, pud, addr); if (!pmd) { /* * Ignore calls from kvm_set_spte_hva for unallocated @@ -1182,6 +1330,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) return stage2_ptep_test_and_clear_young((pte_t *)pmd); } +static int stage2_pudp_test_and_clear_young(pud_t *pud) +{ + return stage2_ptep_test_and_clear_young((pte_t *)pud); +} + /** * kvm_phys_addr_ioremap - map a device range to guest IPA * @@ -1202,7 +1355,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, pfn = __phys_to_pfn(pa); for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) { - pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE); + pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE); if (writable) pte = kvm_s2pte_mkwrite(pte); @@ -1234,7 +1387,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) struct page *page = pfn_to_page(pfn); /* - * PageTransCompoungMap() returns true for THP and + * PageTransCompoundMap() returns true for THP and * hugetlbfs. Make sure the adjustment is done only for THP * pages. */ @@ -1274,14 +1427,6 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap) return false; } -static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_trap_is_iabt(vcpu)) - return false; - - return kvm_vcpu_dabt_iswrite(vcpu); -} - /** * stage2_wp_ptes - write protect PMD range * @pmd: pointer to pmd entry @@ -1330,13 +1475,11 @@ static void stage2_wp_pmds(struct kvm *kvm, pud_t *pud, } /** - * stage2_wp_puds - write protect PGD range - * @pgd: pointer to pgd entry - * @addr: range start address - * @end: range end address - * - * Process PUD entries, for a huge PUD we cause a panic. - */ + * stage2_wp_puds - write protect PGD range + * @pgd: pointer to pgd entry + * @addr: range start address + * @end: range end address + */ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr, phys_addr_t end) { @@ -1347,9 +1490,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd, do { next = stage2_pud_addr_end(kvm, addr, end); if (!stage2_pud_none(kvm, *pud)) { - /* TODO:PUD not supported, revisit later if supported */ - BUG_ON(stage2_pud_huge(kvm, *pud)); - stage2_wp_pmds(kvm, pud, addr, next); + if (stage2_pud_huge(kvm, *pud)) { + if (!kvm_s2pud_readonly(pud)) + kvm_set_s2pud_readonly(pud); + } else { + stage2_wp_pmds(kvm, pud, addr, next); + } } } while (pud++, addr = next, addr != end); } @@ -1392,7 +1538,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) * * Called to start logging dirty pages after memory region * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns - * all present PMD and PTEs are write protected in the memory region. + * all present PUD, PMD and PTEs are write protected in the memory region. * Afterwards read of dirty page log can be called. * * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, @@ -1470,12 +1616,70 @@ static void kvm_send_hwpoison_signal(unsigned long address, send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current); } +static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva, + unsigned long map_size) +{ + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + size_t size; + + size = memslot->npages * PAGE_SIZE; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2 + * PMD/PUD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh Stage-1 block | Stage-1 block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SIZE: + * +---+--------------------+--------------------+-----+ + * |abc|def Stage-2 block | Stage-2 block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those stage-2 blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva & ~(map_size - 1)) >= uaddr_start && + (hva & ~(map_size - 1)) + map_size <= uaddr_end; +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) { int ret; - bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; + bool write_fault, writable, force_pte = false; + bool exec_fault, needs_exec; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1484,7 +1688,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_pfn_t pfn; pgprot_t mem_type = PAGE_S2; bool logging_active = memslot_is_logging(memslot); - unsigned long flags = 0; + unsigned long vma_pagesize, flags = 0; write_fault = kvm_is_write_fault(vcpu); exec_fault = kvm_vcpu_trap_is_iabt(vcpu); @@ -1504,23 +1708,23 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { - hugetlb = true; - gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; - } else { - /* - * Pages belonging to memslots that don't have the same - * alignment for userspace and IPA cannot be mapped using - * block descriptors even if the pages belong to a THP for - * the process, because the stage-2 block descriptor will - * cover more than a single THP and we loose atomicity for - * unmapping, updates, and splits of the THP or other pages - * in the stage-2 block range. - */ - if ((memslot->userspace_addr & ~PMD_MASK) != - ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK)) - force_pte = true; + vma_pagesize = vma_kernel_pagesize(vma); + if (logging_active || + !fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) { + force_pte = true; + vma_pagesize = PAGE_SIZE; } + + /* + * The stage2 has a minimum of 2 level table (For arm64 see + * kvm_arm_setup_stage2()). Hence, we are guaranteed that we can + * use PMD_SIZE huge mappings (even when the PMD is folded into PGD). + * As for PUD huge maps, we must make sure that we have at least + * 3 levels, i.e, PMD is not folded. + */ + if (vma_pagesize == PMD_SIZE || + (vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) + gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT; up_read(¤t->mm->mmap_sem); /* We need minimum second+third level pages */ @@ -1558,7 +1762,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * should not be mapped with huge pages (it introduces churn * and performance degradation), so force a pte mapping. */ - force_pte = true; flags |= KVM_S2_FLAG_LOGGING_ACTIVE; /* @@ -1573,50 +1776,73 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (mmu_notifier_retry(kvm, mmu_seq)) goto out_unlock; - if (!hugetlb && !force_pte) - hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !force_pte) { + /* + * Only PMD_SIZE transparent hugepages(THP) are + * currently supported. This code will need to be + * updated to support other THP sizes. + * + * Make sure the host VA and the guest IPA are sufficiently + * aligned and that the block is contained within the memslot. + */ + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && + transparent_hugepage_adjust(&pfn, &fault_ipa)) + vma_pagesize = PMD_SIZE; + } + + if (writable) + kvm_set_pfn_dirty(pfn); - if (hugetlb) { - pmd_t new_pmd = pfn_pmd(pfn, mem_type); - new_pmd = pmd_mkhuge(new_pmd); - if (writable) { - new_pmd = kvm_s2pmd_mkwrite(new_pmd); - kvm_set_pfn_dirty(pfn); - } + if (fault_status != FSC_PERM) + clean_dcache_guest_page(pfn, vma_pagesize); + + if (exec_fault) + invalidate_icache_guest_page(pfn, vma_pagesize); + + /* + * If we took an execution fault we have made the + * icache/dcache coherent above and should now let the s2 + * mapping be executable. + * + * Write faults (!exec_fault && FSC_PERM) are orthogonal to + * execute permissions, and we preserve whatever we have. + */ + needs_exec = exec_fault || + (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa)); - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PMD_SIZE); + if (vma_pagesize == PUD_SIZE) { + pud_t new_pud = kvm_pfn_pud(pfn, mem_type); - if (exec_fault) { + new_pud = kvm_pud_mkhuge(new_pud); + if (writable) + new_pud = kvm_s2pud_mkwrite(new_pud); + + if (needs_exec) + new_pud = kvm_s2pud_mkexec(new_pud); + + ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud); + } else if (vma_pagesize == PMD_SIZE) { + pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type); + + new_pmd = kvm_pmd_mkhuge(new_pmd); + + if (writable) + new_pmd = kvm_s2pmd_mkwrite(new_pmd); + + if (needs_exec) new_pmd = kvm_s2pmd_mkexec(new_pmd); - invalidate_icache_guest_page(pfn, PMD_SIZE); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pmd = kvm_s2pmd_mkexec(new_pmd); - } ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { - pte_t new_pte = pfn_pte(pfn, mem_type); + pte_t new_pte = kvm_pfn_pte(pfn, mem_type); if (writable) { new_pte = kvm_s2pte_mkwrite(new_pte); - kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - if (fault_status != FSC_PERM) - clean_dcache_guest_page(pfn, PAGE_SIZE); - - if (exec_fault) { + if (needs_exec) new_pte = kvm_s2pte_mkexec(new_pte); - invalidate_icache_guest_page(pfn, PAGE_SIZE); - } else if (fault_status == FSC_PERM) { - /* Preserve execute if XN was already cleared */ - if (stage2_is_exec(kvm, fault_ipa)) - new_pte = kvm_s2pte_mkexec(new_pte); - } ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } @@ -1637,6 +1863,7 @@ out_unlock: */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + pud_t *pud; pmd_t *pmd; pte_t *pte; kvm_pfn_t pfn; @@ -1646,24 +1873,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) spin_lock(&vcpu->kvm->mmu_lock); - pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte)) goto out; - if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */ + if (pud) { /* HugeTLB */ + *pud = kvm_s2pud_mkyoung(*pud); + pfn = kvm_pud_pfn(*pud); + pfn_valid = true; + } else if (pmd) { /* THP, HugeTLB */ *pmd = pmd_mkyoung(*pmd); pfn = pmd_pfn(*pmd); pfn_valid = true; - goto out; + } else { + *pte = pte_mkyoung(*pte); /* Just a page... */ + pfn = pte_pfn(*pte); + pfn_valid = true; } - pte = pte_offset_kernel(pmd, fault_ipa); - if (pte_none(*pte)) /* Nothing there either */ - goto out; - - *pte = pte_mkyoung(*pte); /* Just a page... */ - pfn = pte_pfn(*pte); - pfn_valid = true; out: spin_unlock(&vcpu->kvm->mmu_lock); if (pfn_valid) @@ -1703,7 +1929,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) * For RAS the host kernel may handle this abort. * There is no need to pass the error into the guest. */ - if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) + if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu))) return 1; if (unlikely(!is_iabt)) { @@ -1849,14 +2075,14 @@ static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data } -void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { unsigned long end = hva + PAGE_SIZE; kvm_pfn_t pfn = pte_pfn(pte); pte_t stage2_pte; if (!kvm->arch.pgd) - return; + return 0; trace_kvm_set_spte_hva(hva); @@ -1865,48 +2091,46 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) * just like a translation fault and clean the cache to the PoC. */ clean_dcache_guest_page(pfn, PAGE_SIZE); - stage2_pte = pfn_pte(pfn, PAGE_S2); + stage2_pte = kvm_pfn_pte(pfn, PAGE_S2); handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); + + return 0; } static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return stage2_pudp_test_and_clear_young(pud); + else if (pmd) return stage2_pmdp_test_and_clear_young(pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (pte_none(*pte)) - return 0; - - return stage2_ptep_test_and_clear_young(pte); + else + return stage2_ptep_test_and_clear_young(pte); } static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { + pud_t *pud; pmd_t *pmd; pte_t *pte; - WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); - pmd = stage2_get_pmd(kvm, NULL, gpa); - if (!pmd || pmd_none(*pmd)) /* Nothing there */ + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte)) return 0; - if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */ + if (pud) + return kvm_s2pud_young(*pud); + else if (pmd) return pmd_young(*pmd); - - pte = pte_offset_kernel(pmd, gpa); - if (!pte_none(*pte)) /* Just a page... */ + else return pte_young(*pte); - - return 0; } int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) @@ -2152,7 +2376,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, return 0; } -void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) +void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { } diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index 9b73d3ad918a..34d08ee63747 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -104,12 +104,10 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { + struct vcpu_reset_state *reset_state; struct kvm *kvm = source_vcpu->kvm; struct kvm_vcpu *vcpu = NULL; - struct swait_queue_head *wq; unsigned long cpu_id; - unsigned long context_id; - phys_addr_t target_pc; cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; if (vcpu_mode_is_32bit(source_vcpu)) @@ -130,32 +128,30 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) return PSCI_RET_INVALID_PARAMS; } - target_pc = smccc_get_arg2(source_vcpu); - context_id = smccc_get_arg3(source_vcpu); + reset_state = &vcpu->arch.reset_state; - kvm_reset_vcpu(vcpu); - - /* Gracefully handle Thumb2 entry point */ - if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { - target_pc &= ~((phys_addr_t) 1); - vcpu_set_thumb(vcpu); - } + reset_state->pc = smccc_get_arg2(source_vcpu); /* Propagate caller endianness */ - if (kvm_vcpu_is_be(source_vcpu)) - kvm_vcpu_set_be(vcpu); + reset_state->be = kvm_vcpu_is_be(source_vcpu); - *vcpu_pc(vcpu) = target_pc; /* * NOTE: We always update r0 (or x0) because for PSCI v0.1 * the general puspose registers are undefined upon CPU_ON. */ - smccc_set_retval(vcpu, context_id, 0, 0, 0); - vcpu->arch.power_off = false; - smp_mb(); /* Make sure the above is visible */ + reset_state->r0 = smccc_get_arg3(source_vcpu); + + WRITE_ONCE(reset_state->reset, true); + kvm_make_request(KVM_REQ_VCPU_RESET, vcpu); - wq = kvm_arch_vcpu_wq(vcpu); - swake_up_one(wq); + /* + * Make sure the reset request is observed if the change to + * power_state is observed. + */ + smp_wmb(); + + vcpu->arch.power_off = false; + kvm_vcpu_wake_up(vcpu); return PSCI_RET_SUCCESS; } diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 57b3edebbb40..204d210d01c2 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -2,6 +2,7 @@ #if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KVM_H +#include <kvm/arm_arch_timer.h> #include <linux/tracepoint.h> #undef TRACE_SYSTEM @@ -26,25 +27,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(idx, exit_reason, vcpu_pc), + TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc), + TP_ARGS(ret, esr_ec, vcpu_pc), TP_STRUCT__entry( - __field( int, idx ) - __field( unsigned int, exit_reason ) + __field( int, ret ) + __field( unsigned int, esr_ec ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( - __entry->idx = idx; - __entry->exit_reason = exit_reason; + __entry->ret = ARM_EXCEPTION_CODE(ret); + __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0; __entry->vcpu_pc = vcpu_pc; ), TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", - __print_symbolic(__entry->idx, kvm_arm_exception_type), - __entry->exit_reason, - __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), + __print_symbolic(__entry->ret, kvm_arm_exception_type), + __entry->esr_ec, + __print_symbolic(__entry->esr_ec, kvm_arm_exception_class), __entry->vcpu_pc) ); @@ -262,10 +263,114 @@ TRACE_EVENT(kvm_timer_update_irq, __entry->vcpu_id, __entry->irq, __entry->level) ); +TRACE_EVENT(kvm_get_timer_map, + TP_PROTO(unsigned long vcpu_id, struct timer_map *map), + TP_ARGS(vcpu_id, map), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( int, direct_vtimer ) + __field( int, direct_ptimer ) + __field( int, emul_ptimer ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->direct_vtimer = arch_timer_ctx_index(map->direct_vtimer); + __entry->direct_ptimer = + (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1; + __entry->emul_ptimer = + (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1; + ), + + TP_printk("VCPU: %ld, dv: %d, dp: %d, ep: %d", + __entry->vcpu_id, + __entry->direct_vtimer, + __entry->direct_ptimer, + __entry->emul_ptimer) +); + +TRACE_EVENT(kvm_timer_save_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk(" CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_restore_state, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( unsigned long, ctl ) + __field( unsigned long long, cval ) + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->ctl = ctx->cnt_ctl; + __entry->cval = ctx->cnt_cval; + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d", + __entry->ctl, + __entry->cval, + __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_hrtimer_expire, + TP_PROTO(struct arch_timer_context *ctx), + TP_ARGS(ctx), + + TP_STRUCT__entry( + __field( int, timer_idx ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + ), + + TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx) +); + +TRACE_EVENT(kvm_timer_emulate, + TP_PROTO(struct arch_timer_context *ctx, bool should_fire), + TP_ARGS(ctx, should_fire), + + TP_STRUCT__entry( + __field( int, timer_idx ) + __field( bool, should_fire ) + ), + + TP_fast_assign( + __entry->timer_idx = arch_timer_ctx_index(ctx); + __entry->should_fire = should_fire; + ), + + TP_printk("arch_timer_ctx_index: %d (should_fire: %d)", + __entry->timer_idx, __entry->should_fire) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm +#define TRACE_INCLUDE_PATH ../../virt/kvm/arm #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c index 07aa900bac56..1f62f2b8065d 100644 --- a/virt/kvm/arm/vgic/vgic-debug.c +++ b/virt/kvm/arm/vgic/vgic-debug.c @@ -251,9 +251,9 @@ static int vgic_debug_show(struct seq_file *s, void *v) return 0; } - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); print_irq_state(s, irq, vcpu); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); return 0; diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index c0c0b88af1d5..3bdb31eaed64 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -64,7 +64,7 @@ void kvm_vgic_early_init(struct kvm *kvm) struct vgic_dist *dist = &kvm->arch.vgic; INIT_LIST_HEAD(&dist->lpi_list_head); - spin_lock_init(&dist->lpi_list_lock); + raw_spin_lock_init(&dist->lpi_list_lock); } /* CREATION */ @@ -171,7 +171,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) irq->intid = i + VGIC_NR_PRIVATE_IRQS; INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); + raw_spin_lock_init(&irq->irq_lock); irq->vcpu = NULL; irq->target_vcpu = vcpu0; kref_init(&irq->refcount); @@ -206,7 +206,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) vgic_cpu->sgi_iodev.base_addr = VGIC_ADDR_UNDEF; INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - spin_lock_init(&vgic_cpu->ap_list_lock); + raw_spin_lock_init(&vgic_cpu->ap_list_lock); /* * Enable and configure all SGIs to be edge-triggered and @@ -216,7 +216,7 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); + raw_spin_lock_init(&irq->irq_lock); irq->intid = i; irq->vcpu = NULL; irq->target_vcpu = vcpu; @@ -231,13 +231,6 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) irq->config = VGIC_CONFIG_LEVEL; } - /* - * GICv3 can only be created via the KVM_DEVICE_CREATE API and - * so we always know the emulation type at this point as it's - * either explicitly configured as GICv3, or explicitly - * configured as GICv2, or not configured yet which also - * implies GICv2. - */ if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) irq->group = 1; else @@ -281,7 +274,7 @@ int vgic_init(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; struct kvm_vcpu *vcpu; - int ret = 0, i; + int ret = 0, i, idx; if (vgic_initialized(kvm)) return 0; @@ -298,6 +291,19 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; + /* Initialize groups on CPUs created before the VGIC type was known */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + irq->group = 1; + else + irq->group = 0; + } + } + if (vgic_has_its(kvm)) { ret = vgic_v4_init(kvm); if (ret) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index eb2a390a6c86..44ceaccb18cf 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -65,7 +65,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, INIT_LIST_HEAD(&irq->lpi_list); INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); + raw_spin_lock_init(&irq->irq_lock); irq->config = VGIC_CONFIG_EDGE; kref_init(&irq->refcount); @@ -73,7 +73,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, irq->target_vcpu = vcpu; irq->group = 1; - spin_lock_irqsave(&dist->lpi_list_lock, flags); + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); /* * There could be a race with another vgic_add_lpi(), so we need to @@ -101,7 +101,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid, dist->lpi_list_count++; out_unlock: - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); /* * We "cache" the configuration table entries in our struct vgic_irq's. @@ -287,7 +287,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, if (ret) return ret; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { irq->priority = LPI_PROP_PRIORITY(prop); @@ -299,7 +299,7 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, } } - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); if (irq->hw) return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); @@ -332,7 +332,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) if (!intids) return -ENOMEM; - spin_lock_irqsave(&dist->lpi_list_lock, flags); + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { if (i == irq_count) break; @@ -341,7 +341,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr) continue; intids[i++] = irq->intid; } - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); *intid_ptr = intids; return i; @@ -352,9 +352,9 @@ static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) int ret = 0; unsigned long flags; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->target_vcpu = vcpu; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); if (irq->hw) { struct its_vlpi_map map; @@ -455,7 +455,7 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) } irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = pendmask & (1U << bit_nr); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); @@ -612,7 +612,7 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, return irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_PENDING, true); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; vgic_queue_irq_unlock(kvm, irq, flags); @@ -754,8 +754,9 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, u64 indirect_ptr, type = GITS_BASER_TYPE(baser); phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser); int esz = GITS_BASER_ENTRY_SIZE(baser); - int index; + int index, idx; gfn_t gfn; + bool ret; switch (type) { case GITS_BASER_TYPE_DEVICE: @@ -782,7 +783,8 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (eaddr) *eaddr = addr; - return kvm_is_visible_gfn(its->dev->kvm, gfn); + + goto out; } /* calculate and check the index into the 1st level */ @@ -812,7 +814,12 @@ static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id, if (eaddr) *eaddr = indirect_ptr; - return kvm_is_visible_gfn(its->dev->kvm, gfn); + +out: + idx = srcu_read_lock(&its->dev->kvm->srcu); + ret = kvm_is_visible_gfn(its->dev->kvm, gfn); + srcu_read_unlock(&its->dev->kvm->srcu, idx); + return ret; } static int vgic_its_alloc_collection(struct vgic_its *its, @@ -1729,8 +1736,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev) kfree(its); } -int vgic_its_has_attr_regs(struct kvm_device *dev, - struct kvm_device_attr *attr) +static int vgic_its_has_attr_regs(struct kvm_device *dev, + struct kvm_device_attr *attr) { const struct vgic_register_region *region; gpa_t offset = attr->attr; @@ -1750,9 +1757,9 @@ int vgic_its_has_attr_regs(struct kvm_device *dev, return 0; } -int vgic_its_attr_regs_access(struct kvm_device *dev, - struct kvm_device_attr *attr, - u64 *reg, bool is_write) +static int vgic_its_attr_regs_access(struct kvm_device *dev, + struct kvm_device_attr *attr, + u64 *reg, bool is_write) { const struct vgic_register_region *region; struct vgic_its *its; @@ -1919,7 +1926,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest(kvm, gpa, &val, ite_esz); + return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); } /** @@ -2066,7 +2073,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest(kvm, ptr, &val, dte_esz); + return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); } /** @@ -2246,7 +2253,7 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest(its->dev->kvm, gpa, &val, esz); + return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); } static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) @@ -2317,7 +2324,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) */ val = 0; BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest(its->dev->kvm, gpa, &val, cte_esz); + ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); return ret; } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index 738b65d2d0e7..b535fffc7400 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -147,7 +147,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; irq->source |= 1U << source_vcpu->vcpu_id; @@ -191,13 +191,13 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); int target; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->targets = (val >> (i * 8)) & cpu_mask; target = irq->targets ? __ffs(irq->targets) : 0; irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -230,13 +230,13 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->source &= ~((val >> (i * 8)) & 0xff); if (!irq->source) irq->pending_latch = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -252,7 +252,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->source |= (val >> (i * 8)) & 0xff; @@ -260,7 +260,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } else { - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } vgic_put_irq(vcpu->kvm, irq); } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index b3d1f0985117..9f4843fe9cda 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -169,13 +169,13 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, if (!irq) return; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); /* We only care about and preserve Aff0, Aff1 and Aff2. */ irq->mpidr = val & GENMASK(23, 0); irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -200,6 +200,9 @@ static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu, vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS; + if (was_enabled && !vgic_cpu->lpis_enabled) + vgic_flush_pending_lpis(vcpu); + if (!was_enabled && vgic_cpu->lpis_enabled) vgic_enable_lpis(vcpu); } @@ -281,7 +284,7 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i, &val)) { /* * pending_latch is set irrespective of irq type @@ -292,7 +295,7 @@ static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } else { irq->pending_latch = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } vgic_put_irq(vcpu->kvm, irq); @@ -957,7 +960,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); /* * An access targetting Group0 SGIs can only generate @@ -968,7 +971,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1) irq->pending_latch = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } else { - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } vgic_put_irq(vcpu->kvm, irq); diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index f56ff1cf52ec..7de42fba05b5 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -77,7 +77,7 @@ void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr, for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->group = !!(val & BIT(i)); vgic_queue_irq_unlock(vcpu->kvm, irq, flags); @@ -120,7 +120,7 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = true; vgic_queue_irq_unlock(vcpu->kvm, irq, flags); @@ -139,11 +139,11 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -160,10 +160,10 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); unsigned long flags; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq_is_pending(irq)) value |= (1U << i); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -215,7 +215,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw) vgic_hw_irq_spending(vcpu, irq, is_uaccess); else @@ -262,14 +262,14 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw) vgic_hw_irq_cpending(vcpu, irq, is_uaccess); else irq->pending_latch = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -311,44 +311,38 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, unsigned long flags; struct kvm_vcpu *requester_vcpu = vgic_get_mmio_requester_vcpu(); - spin_lock_irqsave(&irq->irq_lock, flags); - - /* - * If this virtual IRQ was written into a list register, we - * have to make sure the CPU that runs the VCPU thread has - * synced back the LR state to the struct vgic_irq. - * - * As long as the conditions below are true, we know the VCPU thread - * may be on its way back from the guest (we kicked the VCPU thread in - * vgic_change_active_prepare) and still has to sync back this IRQ, - * so we release and re-acquire the spin_lock to let the other thread - * sync back the IRQ. - * - * When accessing VGIC state from user space, requester_vcpu is - * NULL, which is fine, because we guarantee that no VCPUs are running - * when accessing VGIC state from user space so irq->vcpu->cpu is - * always -1. - */ - while (irq->vcpu && /* IRQ may have state in an LR somewhere */ - irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ - irq->vcpu->cpu != -1) /* VCPU thread is running */ - cond_resched_lock(&irq->irq_lock); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->hw) { vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); } else { u32 model = vcpu->kvm->arch.vgic.vgic_model; + u8 active_source; irq->active = active; + + /* + * The GICv2 architecture indicates that the source CPUID for + * an SGI should be provided during an EOI which implies that + * the active state is stored somewhere, but at the same time + * this state is not architecturally exposed anywhere and we + * have no way of knowing the right source. + * + * This may lead to a VCPU not being able to receive + * additional instances of a particular SGI after migration + * for a GICv2 VM on some GIC implementations. Oh well. + */ + active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0; + if (model == KVM_DEV_TYPE_ARM_VGIC_V2 && active && vgic_irq_is_sgi(irq->intid)) - irq->active_source = requester_vcpu->vcpu_id; + irq->active_source = active_source; } if (irq->active) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); } /* @@ -368,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, */ static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_halt_guest(vcpu->kvm); } /* See vgic_change_active_prepare */ static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid) { - if (intid > VGIC_NR_PRIVATE_IRQS) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 || + intid > VGIC_NR_PRIVATE_IRQS) kvm_arm_resume_guest(vcpu->kvm); } @@ -489,10 +485,10 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); /* Narrow the priority range to what we actually support */ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -538,14 +534,14 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, continue; irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i * 2 + 1, &val)) irq->config = VGIC_CONFIG_EDGE; else irq->config = VGIC_CONFIG_LEVEL; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -594,12 +590,12 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, * restore irq config before line level. */ new_level = !!(val & (1U << i)); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->line_level = new_level; if (new_level) vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 69b892abd7dc..d91a8938aa7c 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -84,7 +84,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - spin_lock(&irq->irq_lock); + raw_spin_lock(&irq->irq_lock); /* Always preserve the active bit */ irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -127,7 +127,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) vgic_irq_set_phys_active(irq, false); } - spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 9c0dd234ebe8..9f87e58dbd4a 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -76,7 +76,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (!irq) /* An LPI could have been unmapped. */ continue; - spin_lock(&irq->irq_lock); + raw_spin_lock(&irq->irq_lock); /* Always preserve the active bit */ irq->active = !!(val & ICH_LR_ACTIVE_BIT); @@ -119,7 +119,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) vgic_irq_set_phys_active(irq, false); } - spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } @@ -347,9 +347,9 @@ retry: status = val & (1 << bit_nr); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->target_vcpu != vcpu) { - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); goto retry; } irq->pending_latch = status; @@ -358,7 +358,7 @@ retry: if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); - ret = kvm_write_guest(kvm, ptr, &val, 1); + ret = kvm_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } @@ -409,7 +409,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) else val &= ~(1 << bit_nr); - ret = kvm_write_guest(kvm, ptr, &val, 1); + ret = kvm_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } @@ -589,7 +589,7 @@ early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); */ int vgic_v3_probe(const struct gic_kvm_info *info) { - u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2); + u32 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_ich_vtr_el2); int ret; /* @@ -679,7 +679,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; if (likely(cpu_if->vgic_sre)) - cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); + cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr); kvm_call_hyp(__vgic_v3_save_aprs, vcpu); diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7cfdfbc910e0..191deccf60bf 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -54,11 +54,11 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * When taking more than one ap_list_lock at the same time, always take the * lowest numbered VCPU's ap_list_lock first, so: * vcpuX->vcpu_id < vcpuY->vcpu_id: - * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); - * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); + * raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); + * raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); * * Since the VGIC must support injecting virtual interrupts from ISRs, we have - * to use the spin_lock_irqsave/spin_unlock_irqrestore versions of outer + * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer * spinlocks for any lock that may be taken while injecting an interrupt. */ @@ -72,7 +72,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) struct vgic_irq *irq = NULL; unsigned long flags; - spin_lock_irqsave(&dist->lpi_list_lock, flags); + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { if (irq->intid != intid) @@ -88,7 +88,7 @@ static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid) irq = NULL; out_unlock: - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); return irq; } @@ -103,13 +103,13 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, { /* SGIs and PPIs */ if (intid <= VGIC_MAX_PRIVATE) { - intid = array_index_nospec(intid, VGIC_MAX_PRIVATE); + intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1); return &vcpu->arch.vgic_cpu.private_irqs[intid]; } /* SPIs */ - if (intid <= VGIC_MAX_SPI) { - intid = array_index_nospec(intid, VGIC_MAX_SPI); + if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) { + intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS); return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS]; } @@ -138,19 +138,40 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) if (irq->intid < VGIC_MIN_LPI) return; - spin_lock_irqsave(&dist->lpi_list_lock, flags); + raw_spin_lock_irqsave(&dist->lpi_list_lock, flags); if (!kref_put(&irq->refcount, vgic_irq_release)) { - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); return; }; list_del(&irq->lpi_list); dist->lpi_list_count--; - spin_unlock_irqrestore(&dist->lpi_list_lock, flags); + raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags); kfree(irq); } +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu) +{ + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_irq *irq, *tmp; + unsigned long flags; + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + + list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { + if (irq->intid >= VGIC_MIN_LPI) { + raw_spin_lock(&irq->irq_lock); + list_del(&irq->ap_list); + irq->vcpu = NULL; + raw_spin_unlock(&irq->irq_lock); + vgic_put_irq(vcpu->kvm, irq); + } + } + + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); +} + void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) { WARN_ON(irq_set_irqchip_state(irq->host_irq, @@ -196,7 +217,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) */ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq) { - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + lockdep_assert_held(&irq->irq_lock); /* If the interrupt is active, it must stay on the current vcpu */ if (irq->active) @@ -244,8 +265,8 @@ static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) bool penda, pendb; int ret; - spin_lock(&irqa->irq_lock); - spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&irqa->irq_lock); + raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING); if (irqa->active || irqb->active) { ret = (int)irqb->active - (int)irqa->active; @@ -263,8 +284,8 @@ static int vgic_irq_cmp(void *priv, struct list_head *a, struct list_head *b) /* Both pending and enabled, sort by priority */ ret = irqa->priority - irqb->priority; out: - spin_unlock(&irqb->irq_lock); - spin_unlock(&irqa->irq_lock); + raw_spin_unlock(&irqb->irq_lock); + raw_spin_unlock(&irqa->irq_lock); return ret; } @@ -273,7 +294,7 @@ static void vgic_sort_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + lockdep_assert_held(&vgic_cpu->ap_list_lock); list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp); } @@ -311,7 +332,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, { struct kvm_vcpu *vcpu; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + lockdep_assert_held(&irq->irq_lock); retry: vcpu = vgic_target_oracle(irq); @@ -325,7 +346,7 @@ retry: * not need to be inserted into an ap_list and there is also * no more work for us to do. */ - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); /* * We have to kick the VCPU here, because we could be @@ -347,12 +368,12 @@ retry: * We must unlock the irq lock to take the ap_list_lock where * we are going to insert this new pending interrupt. */ - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); /* someone can do stuff here, which we re-check below */ - spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - spin_lock(&irq->irq_lock); + raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + raw_spin_lock(&irq->irq_lock); /* * Did something change behind our backs? @@ -367,10 +388,11 @@ retry: */ if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { - spin_unlock(&irq->irq_lock); - spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, + flags); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); goto retry; } @@ -382,8 +404,8 @@ retry: list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head); irq->vcpu = vcpu; - spin_unlock(&irq->irq_lock); - spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); kvm_vcpu_kick(vcpu); @@ -430,11 +452,11 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, if (!irq) return -EINVAL; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (!vgic_validate_injection(irq, level, owner)) { /* Nothing to see here, move along... */ - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); return 0; } @@ -494,9 +516,9 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, BUG_ON(!irq); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); return ret; @@ -519,11 +541,11 @@ void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid) if (!irq->hw) goto out; - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); irq->active = false; irq->pending_latch = false; irq->line_level = false; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); out: vgic_put_irq(vcpu->kvm, irq); } @@ -539,9 +561,9 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid) irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); BUG_ON(!irq); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); kvm_vgic_unmap_irq(irq); - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); return 0; @@ -571,12 +593,12 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) return -EINVAL; irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); if (irq->owner && irq->owner != owner) ret = -EEXIST; else irq->owner = owner; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); return ret; } @@ -597,13 +619,13 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); retry: - spin_lock(&vgic_cpu->ap_list_lock); + raw_spin_lock(&vgic_cpu->ap_list_lock); list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; bool target_vcpu_needs_kick = false; - spin_lock(&irq->irq_lock); + raw_spin_lock(&irq->irq_lock); BUG_ON(vcpu != irq->vcpu); @@ -616,7 +638,7 @@ retry: */ list_del(&irq->ap_list); irq->vcpu = NULL; - spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); /* * This vgic_put_irq call matches the @@ -631,14 +653,14 @@ retry: if (target_vcpu == vcpu) { /* We're on the right CPU */ - spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); continue; } /* This interrupt looks like it has to be migrated. */ - spin_unlock(&irq->irq_lock); - spin_unlock(&vgic_cpu->ap_list_lock); + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&vgic_cpu->ap_list_lock); /* * Ensure locking order by always locking the smallest @@ -652,10 +674,10 @@ retry: vcpuB = vcpu; } - spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); - spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, - SINGLE_DEPTH_NESTING); - spin_lock(&irq->irq_lock); + raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); + raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, + SINGLE_DEPTH_NESTING); + raw_spin_lock(&irq->irq_lock); /* * If the affinity has been preserved, move the @@ -675,9 +697,9 @@ retry: target_vcpu_needs_kick = true; } - spin_unlock(&irq->irq_lock); - spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); - spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); + raw_spin_unlock(&irq->irq_lock); + raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); + raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); if (target_vcpu_needs_kick) { kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu); @@ -687,7 +709,7 @@ retry: goto retry; } - spin_unlock(&vgic_cpu->ap_list_lock); + raw_spin_unlock(&vgic_cpu->ap_list_lock); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) @@ -702,7 +724,7 @@ static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) static inline void vgic_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) { - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&irq->irq_lock)); + lockdep_assert_held(&irq->irq_lock); if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_populate_lr(vcpu, irq, lr); @@ -736,15 +758,15 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, *multi_sgi = false; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + lockdep_assert_held(&vgic_cpu->ap_list_lock); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { int w; - spin_lock(&irq->irq_lock); + raw_spin_lock(&irq->irq_lock); /* GICv2 SGIs can count for more than one... */ w = vgic_irq_get_lr_count(irq); - spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); count += w; *multi_sgi |= (w > 1); @@ -761,7 +783,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) bool multi_sgi; u8 prio = 0xff; - DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); + lockdep_assert_held(&vgic_cpu->ap_list_lock); count = compute_ap_list_depth(vcpu, &multi_sgi); if (count > kvm_vgic_global_state.nr_lr || multi_sgi) @@ -770,7 +792,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) count = 0; list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); + raw_spin_lock(&irq->irq_lock); /* * If we have multi-SGIs in the pipeline, we need to @@ -780,7 +802,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) * the AP list has been sorted already. */ if (multi_sgi && irq->priority > prio) { - spin_unlock(&irq->irq_lock); + _raw_spin_unlock(&irq->irq_lock); break; } @@ -791,7 +813,7 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) prio = irq->priority; } - spin_unlock(&irq->irq_lock); + raw_spin_unlock(&irq->irq_lock); if (count == kvm_vgic_global_state.nr_lr) { if (!list_is_last(&irq->ap_list, @@ -866,15 +888,21 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) * either observe the new interrupt before or after doing this check, * and introducing additional synchronization mechanism doesn't change * this. + * + * Note that we still need to go through the whole thing if anything + * can be directly injected (GICv4). */ - if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) && + !vgic_supports_direct_msis(vcpu->kvm)) return; DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); - spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); - vgic_flush_lr_state(vcpu); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) { + raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + vgic_flush_lr_state(vcpu); + raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + } if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); @@ -908,6 +936,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) struct vgic_irq *irq; bool pending = false; unsigned long flags; + struct vgic_vmcr vmcr; if (!vcpu->kvm->arch.vgic.enabled) return false; @@ -915,18 +944,22 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) return true; - spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); + vgic_get_vmcr(vcpu, &vmcr); + + raw_spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { - spin_lock(&irq->irq_lock); - pending = irq_is_pending(irq) && irq->enabled; - spin_unlock(&irq->irq_lock); + raw_spin_lock(&irq->irq_lock); + pending = irq_is_pending(irq) && irq->enabled && + !irq->active && + irq->priority < vmcr.pmr; + raw_spin_unlock(&irq->irq_lock); if (pending) break; } - spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); + raw_spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); return pending; } @@ -958,11 +991,10 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) return false; irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); - spin_lock_irqsave(&irq->irq_lock, flags); + raw_spin_lock_irqsave(&irq->irq_lock, flags); map_is_active = irq->hw && irq->active; - spin_unlock_irqrestore(&irq->irq_lock, flags); + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); return map_is_active; } - diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index a90024718ca4..abeeffabc456 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -238,6 +238,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu *vcpu); +void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu); int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi); int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 23c2519c5b32..110cbe3f74f8 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -82,7 +82,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); /* - * This work is run asynchromously to the task which owns + * This work is run asynchronously to the task which owns * mm and might be done in another context, so we must * access remotely. */ diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 6855cce3e528..5294abb3f178 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -144,7 +144,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm, if (zone->pio != 1 && zone->pio != 0) return -EINVAL; - dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL); + dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), + GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index b20b751286fc..001aeda4c154 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -214,9 +214,9 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) if (flags & EPOLLHUP) { /* The eventfd is closing, detach from KVM */ - unsigned long flags; + unsigned long iflags; - spin_lock_irqsave(&kvm->irqfds.lock, flags); + spin_lock_irqsave(&kvm->irqfds.lock, iflags); /* * We must check if someone deactivated the irqfd before @@ -230,7 +230,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) if (irqfd_is_active(irqfd)) irqfd_deactivate(irqfd); - spin_unlock_irqrestore(&kvm->irqfds.lock, flags); + spin_unlock_irqrestore(&kvm->irqfds.lock, iflags); } return 0; @@ -297,7 +297,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) if (!kvm_arch_intc_initialized(kvm)) return -EAGAIN; - irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); + irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT); if (!irqfd) return -ENOMEM; @@ -345,7 +345,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) } if (!irqfd->resampler) { - resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); + resampler = kzalloc(sizeof(*resampler), + GFP_KERNEL_ACCOUNT); if (!resampler) { ret = -ENOMEM; mutex_unlock(&kvm->irqfds.resampler_lock); @@ -797,7 +798,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, if (IS_ERR(eventfd)) return PTR_ERR(eventfd); - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT); if (!p) { ret = -ENOMEM; goto fail; diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index b1286c4e0712..79e59e4fa3dc 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -144,18 +144,19 @@ static int setup_routing_entry(struct kvm *kvm, { struct kvm_kernel_irq_routing_entry *ei; int r; + u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ - hlist_for_each_entry(ei, &rt->map[ue->gsi], link) + hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; - e->gsi = ue->gsi; + e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) @@ -196,7 +197,7 @@ int kvm_set_irq_routing(struct kvm *kvm, nr_rt_entries += 1; new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; @@ -208,7 +209,7 @@ int kvm_set_irq_routing(struct kvm *kvm, for (i = 0; i < nr; ++i) { r = -ENOMEM; - e = kzalloc(sizeof(*e), GFP_KERNEL); + e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2679e476b6c3..a704d1f9bd96 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -81,6 +81,11 @@ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +/* The start value to grow halt_poll_ns from */ +unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ +module_param(halt_poll_ns_grow_start, uint, 0644); +EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); + /* Default resets per-vcpu halt_poll_ns . */ unsigned int halt_poll_ns_shrink; module_param(halt_poll_ns_shrink, uint, 0644); @@ -354,16 +359,16 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); kvm->mmu_notifier_seq++; - kvm_set_spte_hva(kvm, address, pte); + + if (kvm_set_spte_hva(kvm, address, pte)) + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - bool blockable) + const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; @@ -377,7 +382,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, * count is also read inside the mmu_lock critical section. */ kvm->mmu_notifier_count++; - need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); + need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); need_tlb_flush |= kvm->tlbs_dirty; /* we've to flush the tlb before the pages can be freed */ if (need_tlb_flush) @@ -385,7 +390,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, spin_unlock(&kvm->mmu_lock); - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable); + ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, + range->end, range->blockable); srcu_read_unlock(&kvm->srcu, idx); @@ -393,9 +399,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end) + const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); @@ -526,7 +530,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) int i; struct kvm_memslots *slots; - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); if (!slots) return NULL; @@ -602,12 +606,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, sizeof(*kvm->debugfs_stat_data), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!kvm->debugfs_stat_data) return -ENOMEM; for (p = debugfs_entries; p->name; p++) { - stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); + stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); if (!stat_data) return -ENOMEM; @@ -657,12 +661,8 @@ static struct kvm *kvm_create_vm(unsigned long type) struct kvm_memslots *slots = kvm_alloc_memslots(); if (!slots) goto out_err_no_srcu; - /* - * Generations must be different for each address space. - * Init kvm generation close to the maximum to easily test the - * code of handling generation number wrap-around. - */ - slots->generation = i * 2 - 150; + /* Generations must be different for each address space. */ + slots->generation = i; rcu_assign_pointer(kvm->memslots[i], slots); } @@ -672,7 +672,7 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { rcu_assign_pointer(kvm->buses[i], - kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); if (!kvm->buses[i]) goto out_err; } @@ -790,7 +790,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); + memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -875,31 +875,34 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, int as_id, struct kvm_memslots *slots) { struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); + u64 gen = old_memslots->generation; - /* - * Set the low bit in the generation, which disables SPTE caching - * until the end of synchronize_srcu_expedited. - */ - WARN_ON(old_memslots->generation & 1); - slots->generation = old_memslots->generation + 1; + WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); + slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; rcu_assign_pointer(kvm->memslots[as_id], slots); synchronize_srcu_expedited(&kvm->srcu); /* - * Increment the new memslot generation a second time. This prevents - * vm exits that race with memslot updates from caching a memslot - * generation that will (potentially) be valid forever. - * + * Increment the new memslot generation a second time, dropping the + * update in-progress flag and incrementing then generation based on + * the number of address spaces. This provides a unique and easily + * identifiable generation number while the memslots are in flux. + */ + gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + + /* * Generations must be unique even across address spaces. We do not need * a global counter for that, instead the generation space is evenly split * across address spaces. For example, with two address spaces, address - * space 0 will use generations 0, 4, 8, ... while * address space 1 will - * use generations 2, 6, 10, 14, ... + * space 0 will use generations 0, 2, 4, ... while address space 1 will + * use generations 1, 3, 5, ... */ - slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; + gen += KVM_ADDRESS_SPACE_NUM; - kvm_arch_memslots_updated(kvm, slots); + kvm_arch_memslots_updated(kvm, gen); + + slots->generation = gen; return old_memslots; } @@ -940,8 +943,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* We can read the guest memory with __xxx_user() later on. */ if ((id < KVM_USER_MEM_SLOTS) && ((mem->userspace_addr & (PAGE_SIZE - 1)) || - !access_ok(VERIFY_WRITE, - (void __user *)(unsigned long)mem->userspace_addr, + !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size))) goto out; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) @@ -1020,7 +1022,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); @@ -1133,7 +1135,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT /** * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages - * are dirty write protect them for next write. + * and reenable dirty page tracking for the corresponding pages. * @kvm: pointer to kvm instance * @log: slot id and address to which we copy the log * @is_dirty: flag set if any page is dirty @@ -1154,7 +1156,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log); * */ int kvm_get_dirty_log_protect(struct kvm *kvm, - struct kvm_dirty_log *log, bool *is_dirty) + struct kvm_dirty_log *log, bool *flush) { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; @@ -1176,37 +1178,118 @@ int kvm_get_dirty_log_protect(struct kvm *kvm, return -ENOENT; n = kvm_dirty_bitmap_bytes(memslot); + *flush = false; + if (kvm->manual_dirty_log_protect) { + /* + * Unlike kvm_get_dirty_log, we always return false in *flush, + * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There + * is some code duplication between this function and + * kvm_get_dirty_log, but hopefully all architecture + * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log + * can be eliminated. + */ + dirty_bitmap_buffer = dirty_bitmap; + } else { + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + memset(dirty_bitmap_buffer, 0, n); - dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); - memset(dirty_bitmap_buffer, 0, n); - - spin_lock(&kvm->mmu_lock); - *is_dirty = false; - for (i = 0; i < n / sizeof(long); i++) { - unsigned long mask; - gfn_t offset; - - if (!dirty_bitmap[i]) - continue; + spin_lock(&kvm->mmu_lock); + for (i = 0; i < n / sizeof(long); i++) { + unsigned long mask; + gfn_t offset; - *is_dirty = true; + if (!dirty_bitmap[i]) + continue; - mask = xchg(&dirty_bitmap[i], 0); - dirty_bitmap_buffer[i] = mask; + *flush = true; + mask = xchg(&dirty_bitmap[i], 0); + dirty_bitmap_buffer[i] = mask; - if (mask) { offset = i * BITS_PER_LONG; kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); } + spin_unlock(&kvm->mmu_lock); } - spin_unlock(&kvm->mmu_lock); if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) return -EFAULT; return 0; } EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); + +/** + * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap + * and reenable dirty page tracking for the corresponding pages. + * @kvm: pointer to kvm instance + * @log: slot id and address from which to fetch the bitmap of dirty pages + */ +int kvm_clear_dirty_log_protect(struct kvm *kvm, + struct kvm_clear_dirty_log *log, bool *flush) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int as_id, id; + gfn_t offset; + unsigned long i, n; + unsigned long *dirty_bitmap; + unsigned long *dirty_bitmap_buffer; + + as_id = log->slot >> 16; + id = (u16)log->slot; + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return -EINVAL; + + if (log->first_page & 63) + return -EINVAL; + + slots = __kvm_memslots(kvm, as_id); + memslot = id_to_memslot(slots, id); + + dirty_bitmap = memslot->dirty_bitmap; + if (!dirty_bitmap) + return -ENOENT; + + n = kvm_dirty_bitmap_bytes(memslot); + + if (log->first_page > memslot->npages || + log->num_pages > memslot->npages - log->first_page || + (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) + return -EINVAL; + + *flush = false; + dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); + if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) + return -EFAULT; + + spin_lock(&kvm->mmu_lock); + for (offset = log->first_page, + i = offset / BITS_PER_LONG, n = log->num_pages / BITS_PER_LONG; n--; + i++, offset += BITS_PER_LONG) { + unsigned long mask = *dirty_bitmap_buffer++; + atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; + if (!mask) + continue; + + mask &= atomic_long_fetch_andnot(mask, p); + + /* + * mask contains the bits that really have been cleared. This + * never includes any bits beyond the length of the memslot (if + * the length is not aligned to 64 pages), therefore it is not + * a problem if userspace sets them in log->dirty_bitmap. + */ + if (mask) { + *flush = true; + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, + offset, mask); + } + } + spin_unlock(&kvm->mmu_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect); #endif bool kvm_largepages_enabled(void) @@ -1928,32 +2011,33 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; gfn_t nr_pages_needed = end_gfn - start_gfn + 1; gfn_t nr_pages_avail; + int r = start_gfn <= end_gfn ? 0 : -EINVAL; ghc->gpa = gpa; ghc->generation = slots->generation; ghc->len = len; - ghc->memslot = __gfn_to_memslot(slots, start_gfn); - ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); - if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { + ghc->hva = KVM_HVA_ERR_BAD; + + /* + * If the requested region crosses two memslots, we still + * verify that the entire region is valid here. + */ + while (!r && start_gfn <= end_gfn) { + ghc->memslot = __gfn_to_memslot(slots, start_gfn); + ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, + &nr_pages_avail); + if (kvm_is_error_hva(ghc->hva)) + r = -EFAULT; + start_gfn += nr_pages_avail; + } + + /* Use the slow path for cross page reads and writes. */ + if (!r && nr_pages_needed == 1) ghc->hva += offset; - } else { - /* - * If the requested region crosses two memslots, we still - * verify that the entire region is valid here. - */ - while (start_gfn <= end_gfn) { - nr_pages_avail = 0; - ghc->memslot = __gfn_to_memslot(slots, start_gfn); - ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, - &nr_pages_avail); - if (kvm_is_error_hva(ghc->hva)) - return -EFAULT; - start_gfn += nr_pages_avail; - } - /* Use the slow path for cross page reads and writes. */ + else ghc->memslot = NULL; - } - return 0; + + return r; } int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, @@ -1965,7 +2049,8 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) + void *data, unsigned int offset, + unsigned long len) { struct kvm_memslots *slots = kvm_memslots(kvm); int r; @@ -2103,20 +2188,23 @@ void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { - unsigned int old, val, grow; + unsigned int old, val, grow, grow_start; old = val = vcpu->halt_poll_ns; + grow_start = READ_ONCE(halt_poll_ns_grow_start); grow = READ_ONCE(halt_poll_ns_grow); - /* 10us base */ - if (val == 0 && grow) - val = 10000; - else - val *= grow; + if (!grow) + goto out; + + val *= grow; + if (val < grow_start) + val = grow_start; if (val > halt_poll_ns) val = halt_poll_ns; vcpu->halt_poll_ns = val; +out: trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } @@ -2601,7 +2689,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_regs *kvm_regs; r = -ENOMEM; - kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); + kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); if (!kvm_regs) goto out; r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); @@ -2629,7 +2717,8 @@ out_free1: break; } case KVM_GET_SREGS: { - kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); + kvm_sregs = kzalloc(sizeof(struct kvm_sregs), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!kvm_sregs) goto out; @@ -2721,7 +2810,7 @@ out_free1: break; } case KVM_GET_FPU: { - fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); + fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!fpu) goto out; @@ -2817,6 +2906,9 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; + if (dev->kvm->mm != current->mm) + return -EIO; + switch (ioctl) { case KVM_SET_DEVICE_ATTR: return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); @@ -2886,19 +2978,21 @@ static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_device_ops *ops = NULL; struct kvm_device *dev; bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int type; int ret; if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) return -ENODEV; - ops = kvm_device_ops_table[cd->type]; + type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); + ops = kvm_device_ops_table[type]; if (ops == NULL) return -ENODEV; if (test) return 0; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); if (!dev) return -ENOMEM; @@ -2906,7 +3000,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm, dev->kvm = kvm; mutex_lock(&kvm->lock); - ret = ops->create(dev, cd->type); + ret = ops->create(dev, type); if (ret < 0) { mutex_unlock(&kvm->lock); kfree(dev); @@ -2918,8 +3012,10 @@ static int kvm_ioctl_create_device(struct kvm *kvm, if (ops->init) ops->init(dev); + kvm_get_kvm(kvm); ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); if (ret < 0) { + kvm_put_kvm(kvm); mutex_lock(&kvm->lock); list_del(&dev->vm_node); mutex_unlock(&kvm->lock); @@ -2927,7 +3023,6 @@ static int kvm_ioctl_create_device(struct kvm *kvm, return ret; } - kvm_get_kvm(kvm); cd->fd = ret; return 0; } @@ -2948,6 +3043,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: + case KVM_CAP_ENABLE_CAP_VM: +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: +#endif return 1; #ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: @@ -2971,6 +3070,28 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return kvm_vm_ioctl_check_extension(kvm, arg); } +int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + return -EINVAL; +} + +static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + switch (cap->cap) { +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT: + if (cap->flags || (cap->args[0] & ~1)) + return -EINVAL; + kvm->manual_dirty_log_protect = cap->args[0]; + return 0; +#endif + default: + return kvm_vm_ioctl_enable_cap(kvm, cap); + } +} + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2984,6 +3105,15 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CREATE_VCPU: r = kvm_vm_ioctl_create_vcpu(kvm, arg); break; + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); + break; + } case KVM_SET_USER_MEMORY_REGION: { struct kvm_userspace_memory_region kvm_userspace_mem; @@ -3004,6 +3134,17 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } +#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT + case KVM_CLEAR_DIRTY_LOG: { + struct kvm_clear_dirty_log log; + + r = -EFAULT; + if (copy_from_user(&log, argp, sizeof(log))) + goto out; + r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); + break; + } +#endif #ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; @@ -3496,6 +3637,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } +EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, @@ -3546,7 +3688,6 @@ static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); /* kvm_io_bus_read - called under kvm->slots_lock */ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, @@ -3568,7 +3709,6 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, return r < 0 ? r : 0; } - /* Caller must hold slots_lock. */ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) @@ -3585,8 +3725,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), + GFP_KERNEL_ACCOUNT); if (!new_bus) return -ENOMEM; @@ -3631,8 +3771,8 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, if (i == bus->dev_count) return; - new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * - sizeof(struct kvm_io_range)), GFP_KERNEL); + new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), + GFP_KERNEL_ACCOUNT); if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); goto broken; @@ -3900,7 +4040,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) active = kvm_active_vms; spin_unlock(&kvm_lock); - env = kzalloc(sizeof(*env), GFP_KERNEL); + env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); if (!env) return; @@ -3915,8 +4055,8 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } add_uevent_var(env, "PID=%d", kvm->userspace_pid); - if (kvm->debugfs_dentry) { - char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); + if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { + char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); if (p) { tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); @@ -3955,7 +4095,7 @@ static int kvm_suspend(void) static void kvm_resume(void) { if (kvm_usage_count) { - WARN_ON(raw_spin_is_locked(&kvm_count_lock)); + lockdep_assert_held(&kvm_count_lock); hardware_enable_nolock(NULL); } } diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d99850c462a1..524cbd20379f 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -219,7 +219,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) } } - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL); + kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT); if (!kvg) { mutex_unlock(&kv->lock); kvm_vfio_group_put_external_user(vfio_group); @@ -405,7 +405,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) if (tmp->ops == &kvm_vfio_ops) return -EBUSY; - kv = kzalloc(sizeof(*kv), GFP_KERNEL); + kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT); if (!kv) return -ENOMEM; |
