diff --git a/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt new file mode 100644 index 000000000000..38bca2835278 --- /dev/null +++ b/Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt @@ -0,0 +1,187 @@ +KVM/ARM VGIC Forwarded Physical Interrupts +========================================== + +The KVM/ARM code implements software support for the ARM Generic +Interrupt Controller's (GIC's) hardware support for virtualization by +allowing software to inject virtual interrupts to a VM, which the guest +OS sees as regular interrupts. The code is famously known as the VGIC. + +Some of these virtual interrupts, however, correspond to physical +interrupts from real physical devices. One example could be the +architected timer, which itself supports virtualization, and therefore +lets a guest OS program the hardware device directly to raise an +interrupt at some point in time. When such an interrupt is raised, the +host OS initially handles the interrupt and must somehow signal this +event as a virtual interrupt to the guest. Another example could be a +passthrough device, where the physical interrupts are initially handled +by the host, but the device driver for the device lives in the guest OS +and KVM must therefore somehow inject a virtual interrupt on behalf of +the physical one to the guest OS. + +These virtual interrupts corresponding to a physical interrupt on the +host are called forwarded physical interrupts, but are also sometimes +referred to as 'virtualized physical interrupts' and 'mapped interrupts'. + +Forwarded physical interrupts are handled slightly differently compared +to virtual interrupts generated purely by a software emulated device. + + +The HW bit +---------- +Virtual interrupts are signalled to the guest by programming the List +Registers (LRs) on the GIC before running a VCPU. The LR is programmed +with the virtual IRQ number and the state of the interrupt (Pending, +Active, or Pending+Active). When the guest ACKs and EOIs a virtual +interrupt, the LR state moves from Pending to Active, and finally to +inactive. + +The LRs include an extra bit, called the HW bit. When this bit is set, +KVM must also program an additional field in the LR, the physical IRQ +number, to link the virtual with the physical IRQ. + +When the HW bit is set, KVM must EITHER set the Pending OR the Active +bit, never both at the same time. + +Setting the HW bit causes the hardware to deactivate the physical +interrupt on the physical distributor when the guest deactivates the +corresponding virtual interrupt. + + +Forwarded Physical Interrupts Life Cycle +---------------------------------------- + +The state of forwarded physical interrupts is managed in the following way: + + - The physical interrupt is acked by the host, and becomes active on + the physical distributor (*). + - KVM sets the LR.Pending bit, because this is the only way the GICV + interface is going to present it to the guest. + - LR.Pending will stay set as long as the guest has not acked the interrupt. + - LR.Pending transitions to LR.Active on the guest read of the IAR, as + expected. + - On guest EOI, the *physical distributor* active bit gets cleared, + but the LR.Active is left untouched (set). + - KVM clears the LR on VM exits when the physical distributor + active state has been cleared. + +(*): The host handling is slightly more complicated. For some forwarded +interrupts (shared), KVM directly sets the active state on the physical +distributor before entering the guest, because the interrupt is never actually +handled on the host (see details on the timer as an example below). For other +forwarded interrupts (non-shared) the host does not deactivate the interrupt +when the host ISR completes, but leaves the interrupt active until the guest +deactivates it. Leaving the interrupt active is allowed, because Linux +configures the physical GIC with EOIMode=1, which causes EOI operations to +perform a priority drop allowing the GIC to receive other interrupts of the +default priority. + + +Forwarded Edge and Level Triggered PPIs and SPIs +------------------------------------------------ +Forwarded physical interrupts injected should always be active on the +physical distributor when injected to a guest. + +Level-triggered interrupts will keep the interrupt line to the GIC +asserted, typically until the guest programs the device to deassert the +line. This means that the interrupt will remain pending on the physical +distributor until the guest has reprogrammed the device. Since we +always run the VM with interrupts enabled on the CPU, a pending +interrupt will exit the guest as soon as we switch into the guest, +preventing the guest from ever making progress as the process repeats +over and over. Therefore, the active state on the physical distributor +must be set when entering the guest, preventing the GIC from forwarding +the pending interrupt to the CPU. As soon as the guest deactivates the +interrupt, the physical line is sampled by the hardware again and the host +takes a new interrupt if and only if the physical line is still asserted. + +Edge-triggered interrupts do not exhibit the same problem with +preventing guest execution that level-triggered interrupts do. One +option is to not use HW bit at all, and inject edge-triggered interrupts +from a physical device as pure virtual interrupts. But that would +potentially slow down handling of the interrupt in the guest, because a +physical interrupt occurring in the middle of the guest ISR would +preempt the guest for the host to handle the interrupt. Additionally, +if you configure the system to handle interrupts on a separate physical +core from that running your VCPU, you still have to interrupt the VCPU +to queue the pending state onto the LR, even though the guest won't use +this information until the guest ISR completes. Therefore, the HW +bit should always be set for forwarded edge-triggered interrupts. With +the HW bit set, the virtual interrupt is injected and additional +physical interrupts occurring before the guest deactivates the interrupt +simply mark the state on the physical distributor as Pending+Active. As +soon as the guest deactivates the interrupt, the host takes another +interrupt if and only if there was a physical interrupt between injecting +the forwarded interrupt to the guest and the guest deactivating the +interrupt. + +Consequently, whenever we schedule a VCPU with one or more LRs with the +HW bit set, the interrupt must also be active on the physical +distributor. + + +Forwarded LPIs +-------------- +LPIs, introduced in GICv3, are always edge-triggered and do not have an +active state. They become pending when a device signal them, and as +soon as they are acked by the CPU, they are inactive again. + +It therefore doesn't make sense, and is not supported, to set the HW bit +for physical LPIs that are forwarded to a VM as virtual interrupts, +typically virtual SPIs. + +For LPIs, there is no other choice than to preempt the VCPU thread if +necessary, and queue the pending state onto the LR. + + +Putting It Together: The Architected Timer +------------------------------------------ +The architected timer is a device that signals interrupts with level +triggered semantics. The timer hardware is directly accessed by VCPUs +which program the timer to fire at some point in time. Each VCPU on a +system programs the timer to fire at different times, and therefore the +hardware is multiplexed between multiple VCPUs. This is implemented by +context-switching the timer state along with each VCPU thread. + +However, this means that a scenario like the following is entirely +possible, and in fact, typical: + +1. KVM runs the VCPU +2. The guest programs the time to fire in T+100 +3. The guest is idle and calls WFI (wait-for-interrupts) +4. The hardware traps to the host +5. KVM stores the timer state to memory and disables the hardware timer +6. KVM schedules a soft timer to fire in T+(100 - time since step 2) +7. KVM puts the VCPU thread to sleep (on a waitqueue) +8. The soft timer fires, waking up the VCPU thread +9. KVM reprograms the timer hardware with the VCPU's values +10. KVM marks the timer interrupt as active on the physical distributor +11. KVM injects a forwarded physical interrupt to the guest +12. KVM runs the VCPU + +Notice that KVM injects a forwarded physical interrupt in step 11 without +the corresponding interrupt having actually fired on the host. That is +exactly why we mark the timer interrupt as active in step 10, because +the active state on the physical distributor is part of the state +belonging to the timer hardware, which is context-switched along with +the VCPU thread. + +If the guest does not idle because it is busy, the flow looks like this +instead: + +1. KVM runs the VCPU +2. The guest programs the time to fire in T+100 +4. At T+100 the timer fires and a physical IRQ causes the VM to exit + (note that this initially only traps to EL2 and does not run the host ISR + until KVM has returned to the host). +5. With interrupts still disabled on the CPU coming back from the guest, KVM + stores the virtual timer state to memory and disables the virtual hw timer. +6. KVM looks at the timer state (in memory) and injects a forwarded physical + interrupt because it concludes the timer has expired. +7. KVM marks the timer interrupt as active on the physical distributor +7. KVM enables the timer, enables interrupts, and runs the VCPU + +Notice that again the forwarded physical interrupt is injected to the +guest without having actually been handled on the host. In this case it +is because the physical interrupt is never actually seen by the host because the +timer is disabled upon guest return, and the virtual forwarded interrupt is +injected on the KVM guest entry path. diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 3fb905429e8a..59541d49e15c 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -44,28 +44,29 @@ Groups: Attributes: The attr field of kvm_device_attr encodes two values: bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | cpu id | offset | + values: | reserved | vcpu_index | offset | All distributor regs are (rw, 32-bit) The offset is relative to the "Distributor base address" as defined in the GICv2 specs. Getting or setting such a register has the same effect as - reading or writing the register on the actual hardware from the cpu - specified with cpu id field. Note that most distributor fields are not - banked, but return the same value regardless of the cpu id used to access - the register. + reading or writing the register on the actual hardware from the cpu whose + index is specified with the vcpu_index field. Note that most distributor + fields are not banked, but return the same value regardless of the + vcpu_index used to access the register. Limitations: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. Errors: - -ENODEV: Getting or setting this register is not yet supported + -ENXIO: Getting or setting this register is not yet supported -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied KVM_DEV_ARM_VGIC_GRP_CPU_REGS Attributes: The attr field of kvm_device_attr encodes two values: bits: | 63 .... 40 | 39 .. 32 | 31 .... 0 | - values: | reserved | cpu id | offset | + values: | reserved | vcpu_index | offset | All CPU interface regs are (rw, 32-bit) @@ -91,8 +92,9 @@ Groups: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. Errors: - -ENODEV: Getting or setting this register is not yet supported + -ENXIO: Getting or setting this register is not yet supported -EBUSY: One or more VCPUs are running + -EINVAL: Invalid vcpu_index supplied KVM_DEV_ARM_VGIC_GRP_NR_IRQS Attributes: diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h index d995821f1698..dc641ddf0784 100644 --- a/arch/arm/include/asm/kvm_arm.h +++ b/arch/arm/include/asm/kvm_arm.h @@ -218,4 +218,24 @@ #define HSR_DABT_CM (1U << 8) #define HSR_DABT_EA (1U << 9) +#define kvm_arm_exception_type \ + {0, "RESET" }, \ + {1, "UNDEFINED" }, \ + {2, "SOFTWARE" }, \ + {3, "PREF_ABORT" }, \ + {4, "DATA_ABORT" }, \ + {5, "IRQ" }, \ + {6, "FIQ" }, \ + {7, "HVC" } + +#define HSRECN(x) { HSR_EC_##x, #x } + +#define kvm_arm_exception_class \ + HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \ + HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \ + HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \ + HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \ + HSRECN(DABT), HSRECN(DABT_HYP) + + #endif /* __ARM_KVM_ARM_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..6692982c9b57 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -126,7 +126,10 @@ struct kvm_vcpu_arch { * here. */ - /* Don't run the guest on this vcpu */ + /* vcpu power-off state */ + bool power_off; + + /* Don't run the guest (internal implementation need) */ bool pause; /* IO related fields */ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 210eccadb69a..95a000515e43 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM depends on MMU && OF select PREEMPT_NOTIFIERS select ANON_INODES + select ARM_GIC select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO @@ -45,4 +46,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +source drivers/vhost/Kconfig + endif # VIRTUALIZATION diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dc017adfddc8..eab83b2435b8 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvm_timer_should_fire(vcpu); } +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + kvm_timer_schedule(vcpu); +} + +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + kvm_timer_unschedule(vcpu); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { /* Force users to call KVM_ARM_VCPU_INIT */ @@ -308,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (vcpu->arch.pause) + if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; @@ -321,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: - vcpu->arch.pause = false; + vcpu->arch.power_off = false; break; case KVM_MP_STATE_STOPPED: - vcpu->arch.pause = true; + vcpu->arch.power_off = true; break; default: return -EINVAL; @@ -342,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, */ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v); + return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v)) + && !v->arch.power_off && !v->arch.pause); } /* Just ensure a guest exit from a particular CPU */ @@ -468,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return vgic_initialized(kvm); } -static void vcpu_pause(struct kvm_vcpu *vcpu) +static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused; +static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused; + +static void kvm_arm_halt_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + vcpu->arch.pause = true; + force_vm_exit(cpu_all_mask); +} + +static void kvm_arm_resume_guest(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); + + vcpu->arch.pause = false; + wake_up_interruptible(wq); + } +} + +static void vcpu_sleep(struct kvm_vcpu *vcpu) { wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, !vcpu->arch.pause); + wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && + (!vcpu->arch.pause))); } static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu) @@ -522,8 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) update_vttbr(vcpu->kvm); - if (vcpu->arch.pause) - vcpu_pause(vcpu); + if (vcpu->arch.power_off || vcpu->arch.pause) + vcpu_sleep(vcpu); /* * Disarming the background timer must be done in a @@ -549,11 +587,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) run->exit_reason = KVM_EXIT_INTR; } - if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) { + if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || + vcpu->arch.power_off || vcpu->arch.pause) { local_irq_enable(); + kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); preempt_enable(); - kvm_timer_sync_hwstate(vcpu); continue; } @@ -596,14 +635,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * guest time. */ kvm_guest_exit(); - trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + + /* + * We must sync the timer state before the vgic state so that + * the vgic can properly sample the updated state of the + * interrupt line. + */ + kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); preempt_enable(); - kvm_timer_sync_hwstate(vcpu); - ret = handle_exit(vcpu, run, ret); } @@ -765,12 +809,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, vcpu_reset_hcr(vcpu); /* - * Handle the "start in power-off" case by marking the VCPU as paused. + * Handle the "start in power-off" case. */ if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) - vcpu->arch.pause = true; + vcpu->arch.power_off = true; else - vcpu->arch.pause = false; + vcpu->arch.power_off = false; return 0; } @@ -1080,7 +1124,7 @@ static int init_hyp_mode(void) */ err = kvm_timer_hyp_init(); if (err) - goto out_free_mappings; + goto out_free_context; #ifndef CONFIG_HOTPLUG_CPU free_boot_hyp_pgd(); diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index ad6f6424f1d1..0b556968a6da 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu) static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) { - vcpu->arch.pause = true; + vcpu->arch.power_off = true; } static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) @@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) */ if (!vcpu) return PSCI_RET_INVALID_PARAMS; - if (!vcpu->arch.pause) { + if (!vcpu->arch.power_off) { if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1) return PSCI_RET_ALREADY_ON; else @@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) * the general puspose registers are undefined upon CPU_ON. */ *vcpu_reg(vcpu, 0) = context_id; - vcpu->arch.pause = false; + vcpu->arch.power_off = false; smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); @@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) mpidr = kvm_vcpu_get_mpidr_aff(tmp); if ((mpidr & target_affinity_mask) == target_affinity) { matching_cpus++; - if (!tmp->arch.pause) + if (!tmp->arch.power_off) return PSCI_0_2_AFFINITY_LEVEL_ON; } } @@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type) * re-initialized. */ kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - tmp->arch.pause = true; + tmp->arch.power_off = true; kvm_vcpu_kick(tmp); } diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h index 0ec35392d208..c25a88598eb0 100644 --- a/arch/arm/kvm/trace.h +++ b/arch/arm/kvm/trace.h @@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry, ); TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc), - TP_ARGS(exit_reason, vcpu_pc), + TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc), + TP_ARGS(idx, exit_reason, vcpu_pc), TP_STRUCT__entry( + __field( int, idx ) __field( unsigned int, exit_reason ) __field( unsigned long, vcpu_pc ) ), TP_fast_assign( + __entry->idx = idx; __entry->exit_reason = exit_reason; __entry->vcpu_pc = vcpu_pc; ), - TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx", + TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx", + __print_symbolic(__entry->idx, kvm_arm_exception_type), __entry->exit_reason, + __print_symbolic(__entry->exit_reason, kvm_arm_exception_class), __entry->vcpu_pc) ); diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 9694f2654593..5e6857b6bdc4 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -200,4 +200,20 @@ /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) +#define kvm_arm_exception_type \ + {0, "IRQ" }, \ + {1, "TRAP" } + +#define ECN(x) { ESR_ELx_EC_##x, #x } + +#define kvm_arm_exception_class \ + ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \ + ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \ + ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \ + ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \ + ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \ + ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \ + ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ + ECN(BKPT32), ECN(VECTOR32), ECN(BRK64) + #endif /* __ARM64_KVM_ARM_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..a35ce7266aac 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -149,7 +149,10 @@ struct kvm_vcpu_arch { u32 mdscr_el1; } guest_debug_preserved; - /* Don't run the guest */ + /* vcpu power-off state */ + bool power_off; + + /* Don't run the guest (internal implementation need) */ bool pause; /* IO related fields */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 5c7e920e4861..38102f5d3cbb 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -41,4 +41,6 @@ config KVM_ARM_HOST ---help--- Provides host support for ARM processors. +source drivers/vhost/Kconfig + endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index e5836138ec42..1599701ef044 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -880,6 +880,14 @@ __kvm_hyp_panic: bl __restore_sysregs + /* + * Make sure we have a valid host stack, and don't leave junk in the + * frame pointer that will give us a misleading host stack unwinding. + */ + ldr x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)] + msr sp_el1, x22 + mov x29, xzr + 1: adr x0, __hyp_panic_str adr x1, 2f ldp x2, x3, [x1] diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5a1a882e0a75..6ded8d347af9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 827a38d7a9db..c9f122d00920 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -718,5 +718,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8ced426091e1..72a614c68ed8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 53deb2750bf6..9265196e877f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1261,4 +1261,8 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); + +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index e1e4d7c38dda..1800227af9d6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -51,7 +51,7 @@ struct arch_timer_cpu { bool armed; /* Timer IRQ */ - const struct kvm_irq_level *irq; + struct kvm_irq_level irq; /* VGIC mapping */ struct irq_phys_map *map; @@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value); bool kvm_timer_should_fire(struct kvm_vcpu *vcpu); +void kvm_timer_schedule(struct kvm_vcpu *vcpu); +void kvm_timer_unschedule(struct kvm_vcpu *vcpu); #endif diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 4e14dac282bb..f62addc17dcf 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -112,7 +112,6 @@ struct vgic_vmcr { struct vgic_ops { struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int); void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr); - void (*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr); u64 (*get_elrsr)(const struct kvm_vcpu *vcpu); u64 (*get_eisr)(const struct kvm_vcpu *vcpu); void (*clear_eisr)(struct kvm_vcpu *vcpu); @@ -159,7 +158,6 @@ struct irq_phys_map { u32 virt_irq; u32 phys_irq; u32 irq; - bool active; }; struct irq_phys_map_entry { @@ -296,22 +294,16 @@ struct vgic_v3_cpu_if { }; struct vgic_cpu { - /* per IRQ to LR mapping */ - u8 *vgic_irq_lr_map; - /* Pending/active/both interrupts on this VCPU */ - DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS); - DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS); + DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS); /* Pending/active/both shared interrupts, dynamically sized */ unsigned long *pending_shared; unsigned long *active_shared; unsigned long *pend_act_shared; - /* Bitmap of used/free list registers */ - DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS); - /* Number of list registers on this CPU */ int nr_lr; @@ -354,8 +346,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu); struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int irq); int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map); -bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map); -void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active); #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index eba9caebc9c1..87189a41d904 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -647,6 +647,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 48c6e1ac6827..21a0ab2d8919 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -28,6 +28,8 @@ #include #include +#include "trace.h" + static struct timecounter *timecounter; static struct workqueue_struct *wqueue; static unsigned int host_vtimer_irq; @@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer) } } -static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu) -{ - int ret; - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - - kvm_vgic_set_phys_irq_active(timer->map, true); - ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, - timer->map, - timer->irq->level); - WARN_ON(ret); -} - static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; @@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } +static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) && + (timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE); +} + bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; cycle_t cval, now; - if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) || - !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) || - kvm_vgic_get_phys_irq_active(timer->map)) + if (!kvm_timer_irq_can_fire(vcpu)) return false; cval = timer->cntv_cval; @@ -127,62 +123,143 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu) return cval <= now; } -/** - * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu - * @vcpu: The vcpu pointer - * - * Disarm any pending soft timers, since the world-switch code will write the - * virtual timer state back to the physical CPU. +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level) +{ + int ret; + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + BUG_ON(!vgic_initialized(vcpu->kvm)); + + timer->irq.level = new_level; + trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq, + timer->irq.level); + ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id, + timer->map, + timer->irq.level); + WARN_ON(ret); +} + +/* + * Check if there was a change in the timer state (should we raise or lower + * the line level to the GIC). */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +static void kvm_timer_update_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; /* - * We're about to run this vcpu again, so there is no need to - * keep the background timer running, as we're about to - * populate the CPU timer again. + * If userspace modified the timer registers via SET_ONE_REG before + * the vgic was initialized, we mustn't set the timer->irq.level value + * because the guest would never see the interrupt. Instead wait + * until we call this function from kvm_timer_flush_hwstate. */ - timer_disarm(timer); + if (!vgic_initialized(vcpu->kvm)) + return; + + if (kvm_timer_should_fire(vcpu) != timer->irq.level) + kvm_timer_update_irq(vcpu, !timer->irq.level); +} + +/* + * Schedule the background timer before calling kvm_vcpu_block, so that this + * thread is removed from its waitqueue and made runnable when there's a timer + * interrupt to handle. + */ +void kvm_timer_schedule(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + u64 ns; + cycle_t cval, now; + + BUG_ON(timer_is_armed(timer)); /* - * If the timer expired while we were not scheduled, now is the time - * to inject it. + * No need to schedule a background timer if the guest timer has + * already expired, because kvm_vcpu_block will return before putting + * the thread to sleep. */ if (kvm_timer_should_fire(vcpu)) - kvm_timer_inject_irq(vcpu); + return; + + /* + * If the timer is not capable of raising interrupts (disabled or + * masked), then there's no more work for us to do. + */ + if (!kvm_timer_irq_can_fire(vcpu)) + return; + + /* The timer has not yet expired, schedule a background timer */ + cval = timer->cntv_cval; + now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; + + ns = cyclecounter_cyc2ns(timecounter->cc, + cval - now, + timecounter->mask, + &timecounter->frac); + timer_arm(timer, ns); +} + +void kvm_timer_unschedule(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + timer_disarm(timer); +} + +/** + * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu + * @vcpu: The vcpu pointer + * + * Check if the virtual timer has expired while we were running in the host, + * and inject an interrupt if that was the case. + */ +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + bool phys_active; + int ret; + + kvm_timer_update_state(vcpu); + + /* + * If we enter the guest with the virtual input level to the VGIC + * asserted, then we have already told the VGIC what we need to, and + * we don't need to exit from the guest until the guest deactivates + * the already injected interrupt, so therefore we should set the + * hardware active state to prevent unnecessary exits from the guest. + * + * Conversely, if the virtual input level is deasserted, then always + * clear the hardware active state to ensure that hardware interrupts + * from the timer triggers a guest exit. + */ + if (timer->irq.level) + phys_active = true; + else + phys_active = false; + + ret = irq_set_irqchip_state(timer->map->irq, + IRQCHIP_STATE_ACTIVE, + phys_active); + WARN_ON(ret); } /** * kvm_timer_sync_hwstate - sync timer state from cpu * @vcpu: The vcpu pointer * - * Check if the virtual timer was armed and either schedule a corresponding - * soft timer or inject directly if already expired. + * Check if the virtual timer has expired while we were running in the guest, + * and inject an interrupt if that was the case. */ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - cycle_t cval, now; - u64 ns; BUG_ON(timer_is_armed(timer)); - if (kvm_timer_should_fire(vcpu)) { - /* - * Timer has already expired while we were not - * looking. Inject the interrupt and carry on. - */ - kvm_timer_inject_irq(vcpu); - return; - } - - cval = timer->cntv_cval; - now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff; - - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask, - &timecounter->frac); - timer_arm(timer, ns); + /* + * The guest could have modified the timer registers or the timer + * could have expired, update the timer state. + */ + kvm_timer_update_state(vcpu); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, @@ -197,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * kvm_vcpu_set_target(). To handle this, we determine * vcpu timer irq number when the vcpu is reset. */ - timer->irq = irq; + timer->irq.irq = irq->irq; /* * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8 @@ -206,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, * the ARMv7 architecture. */ timer->cntv_ctl = 0; + kvm_timer_update_state(vcpu); /* * Tell the VGIC that the virtual interrupt is tied to a @@ -250,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) default: return -1; } + + kvm_timer_update_state(vcpu); return 0; } diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h new file mode 100644 index 000000000000..37d8b98867d5 --- /dev/null +++ b/virt/kvm/arm/trace.h @@ -0,0 +1,63 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm + +/* + * Tracepoints for vgic + */ +TRACE_EVENT(vgic_update_irq_pending, + TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( bool, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level: %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +/* + * Tracepoints for arch_timer + */ +TRACE_EVENT(kvm_timer_update_irq, + TP_PROTO(unsigned long vcpu_id, __u32 irq, int level), + TP_ARGS(vcpu_id, irq, level), + + TP_STRUCT__entry( + __field( unsigned long, vcpu_id ) + __field( __u32, irq ) + __field( int, level ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->level = level; + ), + + TP_printk("VCPU: %ld, IRQ %d, level %d", + __entry->vcpu_id, __entry->irq, __entry->level) +); + +#endif /* _TRACE_KVM_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c index 8d7b04db8471..ff02f08df74d 100644 --- a/virt/kvm/arm/vgic-v2.c +++ b/virt/kvm/arm/vgic-v2.c @@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr, lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT); vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val; -} -static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr); else @@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu) * anyway. */ vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0; + vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0; /* Get the show on the road... */ vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN; @@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu) static const struct vgic_ops vgic_v2_ops = { .get_lr = vgic_v2_get_lr, .set_lr = vgic_v2_set_lr, - .sync_lr_elrsr = vgic_v2_sync_lr_elrsr, .get_elrsr = vgic_v2_get_elrsr, .get_eisr = vgic_v2_get_eisr, .clear_eisr = vgic_v2_clear_eisr, diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c index 7dd5d62f10a1..487d6357b7e7 100644 --- a/virt/kvm/arm/vgic-v3.c +++ b/virt/kvm/arm/vgic-v3.c @@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr, } vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val; -} -static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr lr_desc) -{ if (!(lr_desc.state & LR_STATE_MASK)) vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr); else @@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu) * anyway. */ vgic_v3->vgic_vmcr = 0; + vgic_v3->vgic_elrsr = ~0; /* * If we are emulating a GICv3, we do it in an non-GICv2-compatible @@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu) static const struct vgic_ops vgic_v3_ops = { .get_lr = vgic_v3_get_lr, .set_lr = vgic_v3_set_lr, - .sync_lr_elrsr = vgic_v3_sync_lr_elrsr, .get_elrsr = vgic_v3_get_elrsr, .get_eisr = vgic_v3_get_eisr, .clear_eisr = vgic_v3_clear_eisr, diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c index 6bd1c9bf7ae7..fe451d4885ae 100644 --- a/virt/kvm/arm/vgic.c +++ b/virt/kvm/arm/vgic.c @@ -34,6 +34,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include "trace.h" + /* * How the whole thing works (courtesy of Christoffer Dall): * @@ -102,11 +105,13 @@ #include "vgic.h" static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu); -static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu); +static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu); static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr); static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc); +static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu); static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu, int virt_irq); +static int compute_pending_for_cpu(struct kvm_vcpu *vcpu); static const struct vgic_ops *vgic_ops; static const struct vgic_params *vgic; @@ -357,6 +362,11 @@ static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq) struct vgic_dist *dist = &vcpu->kvm->arch.vgic; vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0); + if (!vgic_dist_irq_get_level(vcpu, irq)) { + vgic_dist_irq_clear_pending(vcpu, irq); + if (!compute_pending_for_cpu(vcpu)) + clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); + } } static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq) @@ -654,10 +664,9 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, vgic_reg_access(mmio, &val, offset, ACCESS_READ_VALUE | ACCESS_WRITE_VALUE); if (mmio->is_write) { - if (offset < 8) { - *reg = ~0U; /* Force PPIs/SGIs to 1 */ + /* Ignore writes to read-only SGI and PPI bits */ + if (offset < 8) return false; - } val = vgic_cfg_compress(val); if (offset & 4) { @@ -683,9 +692,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio, void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 elrsr = vgic_get_elrsr(vcpu); + unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); int i; - for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) { + for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) { struct vgic_lr lr = vgic_get_lr(vcpu, i); /* @@ -706,30 +717,14 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu) * interrupt then move the active state to the * distributor tracking bit. */ - if (lr.state & LR_STATE_ACTIVE) { + if (lr.state & LR_STATE_ACTIVE) vgic_irq_set_active(vcpu, lr.irq); - lr.state &= ~LR_STATE_ACTIVE; - } /* * Reestablish the pending state on the distributor and the - * CPU interface. It may have already been pending, but that - * is fine, then we are only setting a few bits that were - * already set. + * CPU interface and mark the LR as free for other use. */ - if (lr.state & LR_STATE_PENDING) { - vgic_dist_irq_set_pending(vcpu, lr.irq); - lr.state &= ~LR_STATE_PENDING; - } - - vgic_set_lr(vcpu, i, lr); - - /* - * Mark the LR as free for other use. - */ - BUG_ON(lr.state & LR_STATE_MASK); - vgic_retire_lr(i, lr.irq, vcpu); - vgic_irq_clear_queued(vcpu, lr.irq); + vgic_retire_lr(i, vcpu); /* Finally update the VGIC state. */ vgic_update_state(vcpu->kvm); @@ -982,6 +977,12 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu) pend_percpu = vcpu->arch.vgic_cpu.pending_percpu; pend_shared = vcpu->arch.vgic_cpu.pending_shared; + if (!dist->enabled) { + bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS); + bitmap_zero(pend_shared, nr_shared); + return 0; + } + pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id); enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id); bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS); @@ -1009,11 +1010,6 @@ void vgic_update_state(struct kvm *kvm) struct kvm_vcpu *vcpu; int c; - if (!dist->enabled) { - set_bit(0, dist->irq_pending_on_cpu); - return; - } - kvm_for_each_vcpu(c, vcpu, kvm) { if (compute_pending_for_cpu(vcpu)) set_bit(c, dist->irq_pending_on_cpu); @@ -1036,12 +1032,6 @@ static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, vgic_ops->set_lr(vcpu, lr, vlr); } -static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr, - struct vgic_lr vlr) -{ - vgic_ops->sync_lr_elrsr(vcpu, lr, vlr); -} - static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu) { return vgic_ops->get_elrsr(vcpu); @@ -1087,16 +1077,23 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu) vgic_ops->enable(vcpu); } -static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) +static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr); + vgic_irq_clear_queued(vcpu, vlr.irq); + + /* + * We must transfer the pending state back to the distributor before + * retiring the LR, otherwise we may loose edge-triggered interrupts. + */ + if (vlr.state & LR_STATE_PENDING) { + vgic_dist_irq_set_pending(vcpu, vlr.irq); + vlr.hwirq = 0; + } + vlr.state = 0; vgic_set_lr(vcpu, lr_nr, vlr); - clear_bit(lr_nr, vgic_cpu->lr_used); - vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY; - vgic_sync_lr_elrsr(vcpu, lr_nr, vlr); } /* @@ -1110,17 +1107,15 @@ static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu) */ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + u64 elrsr = vgic_get_elrsr(vcpu); + unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); int lr; - for_each_set_bit(lr, vgic_cpu->lr_used, vgic->nr_lr) { + for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); - if (!vgic_irq_is_enabled(vcpu, vlr.irq)) { - vgic_retire_lr(lr, vlr.irq, vcpu); - if (vgic_irq_is_queued(vcpu, vlr.irq)) - vgic_irq_clear_queued(vcpu, vlr.irq); - } + if (!vgic_irq_is_enabled(vcpu, vlr.irq)) + vgic_retire_lr(lr, vcpu); } } @@ -1132,7 +1127,8 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state); vgic_irq_clear_active(vcpu, irq); vgic_update_state(vcpu->kvm); - } else if (vgic_dist_irq_is_pending(vcpu, irq)) { + } else { + WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq)); vlr.state |= LR_STATE_PENDING; kvm_debug("Set pending: 0x%x\n", vlr.state); } @@ -1159,7 +1155,6 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, } vgic_set_lr(vcpu, lr_nr, vlr); - vgic_sync_lr_elrsr(vcpu, lr_nr, vlr); } /* @@ -1169,8 +1164,9 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq, */ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; + u64 elrsr = vgic_get_elrsr(vcpu); + unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr); struct vgic_lr vlr; int lr; @@ -1181,28 +1177,22 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq) kvm_debug("Queue IRQ%d\n", irq); - lr = vgic_cpu->vgic_irq_lr_map[irq]; - /* Do we have an active interrupt for the same CPUID? */ - if (lr != LR_EMPTY) { + for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) { vlr = vgic_get_lr(vcpu, lr); - if (vlr.source == sgi_source_id) { + if (vlr.irq == irq && vlr.source == sgi_source_id) { kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq); - BUG_ON(!test_bit(lr, vgic_cpu->lr_used)); vgic_queue_irq_to_lr(vcpu, irq, lr, vlr); return true; } } /* Try to use another LR for this interrupt */ - lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used, - vgic->nr_lr); + lr = find_first_bit(elrsr_ptr, vgic->nr_lr); if (lr >= vgic->nr_lr) return false; kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id); - vgic_cpu->vgic_irq_lr_map[irq] = lr; - set_bit(lr, vgic_cpu->lr_used); vlr.irq = irq; vlr.source = sgi_source_id; @@ -1240,7 +1230,7 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; unsigned long *pa_percpu, *pa_shared; - int i, vcpu_id, lr, ret; + int i, vcpu_id; int overflow = 0; int nr_shared = vgic_nr_shared_irqs(dist); @@ -1295,39 +1285,62 @@ static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) */ clear_bit(vcpu_id, dist->irq_pending_on_cpu); } +} - for (lr = 0; lr < vgic->nr_lr; lr++) { - struct vgic_lr vlr; +static int process_queued_irq(struct kvm_vcpu *vcpu, + int lr, struct vgic_lr vlr) +{ + int pending = 0; - if (!test_bit(lr, vgic_cpu->lr_used)) - continue; + /* + * If the IRQ was EOIed (called from vgic_process_maintenance) or it + * went from active to non-active (called from vgic_sync_hwirq) it was + * also ACKed and we we therefore assume we can clear the soft pending + * state (should it had been set) for this interrupt. + * + * Note: if the IRQ soft pending state was set after the IRQ was + * acked, it actually shouldn't be cleared, but we have no way of + * knowing that unless we start trapping ACKs when the soft-pending + * state is set. + */ + vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); - vlr = vgic_get_lr(vcpu, lr); + /* + * Tell the gic to start sampling this interrupt again. + */ + vgic_irq_clear_queued(vcpu, vlr.irq); - /* - * If we have a mapping, and the virtual interrupt is - * presented to the guest (as pending or active), then we must - * set the state to active in the physical world. See - * Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt. - */ - if (vlr.state & LR_HW) { - struct irq_phys_map *map; - map = vgic_irq_map_search(vcpu, vlr.irq); - - ret = irq_set_irqchip_state(map->irq, - IRQCHIP_STATE_ACTIVE, - true); - WARN_ON(ret); + /* Any additional pending interrupt? */ + if (vgic_irq_is_edge(vcpu, vlr.irq)) { + BUG_ON(!(vlr.state & LR_HW)); + pending = vgic_dist_irq_is_pending(vcpu, vlr.irq); + } else { + if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { + vgic_cpu_irq_set(vcpu, vlr.irq); + pending = 1; + } else { + vgic_dist_irq_clear_pending(vcpu, vlr.irq); + vgic_cpu_irq_clear(vcpu, vlr.irq); } } + + /* + * Despite being EOIed, the LR may not have + * been marked as empty. + */ + vlr.state = 0; + vlr.hwirq = 0; + vgic_set_lr(vcpu, lr, vlr); + + return pending; } static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) { u32 status = vgic_get_interrupt_status(vcpu); struct vgic_dist *dist = &vcpu->kvm->arch.vgic; - bool level_pending = false; struct kvm *kvm = vcpu->kvm; + int level_pending = 0; kvm_debug("STATUS = %08x\n", status); @@ -1342,54 +1355,22 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) { struct vgic_lr vlr = vgic_get_lr(vcpu, lr); + WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq)); - - spin_lock(&dist->lock); - vgic_irq_clear_queued(vcpu, vlr.irq); WARN_ON(vlr.state & LR_STATE_MASK); - vlr.state = 0; - vgic_set_lr(vcpu, lr, vlr); - /* - * If the IRQ was EOIed it was also ACKed and we we - * therefore assume we can clear the soft pending - * state (should it had been set) for this interrupt. - * - * Note: if the IRQ soft pending state was set after - * the IRQ was acked, it actually shouldn't be - * cleared, but we have no way of knowing that unless - * we start trapping ACKs when the soft-pending state - * is set. - */ - vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq); /* * kvm_notify_acked_irq calls kvm_set_irq() - * to reset the IRQ level. Need to release the - * lock for kvm_set_irq to grab it. + * to reset the IRQ level, which grabs the dist->lock + * so we call this before taking the dist->lock. */ - spin_unlock(&dist->lock); - kvm_notify_acked_irq(kvm, 0, vlr.irq - VGIC_NR_PRIVATE_IRQS); + spin_lock(&dist->lock); - - /* Any additional pending interrupt? */ - if (vgic_dist_irq_get_level(vcpu, vlr.irq)) { - vgic_cpu_irq_set(vcpu, vlr.irq); - level_pending = true; - } else { - vgic_dist_irq_clear_pending(vcpu, vlr.irq); - vgic_cpu_irq_clear(vcpu, vlr.irq); - } - + level_pending |= process_queued_irq(vcpu, lr, vlr); spin_unlock(&dist->lock); - - /* - * Despite being EOIed, the LR may not have - * been marked as empty. - */ - vgic_sync_lr_elrsr(vcpu, lr, vlr); } } @@ -1410,40 +1391,40 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu) /* * Save the physical active state, and reset it to inactive. * - * Return 1 if HW interrupt went from active to inactive, and 0 otherwise. + * Return true if there's a pending forwarded interrupt to queue. */ -static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr) +static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr) { + struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct irq_phys_map *map; + bool phys_active; + bool level_pending; int ret; if (!(vlr.state & LR_HW)) - return 0; + return false; map = vgic_irq_map_search(vcpu, vlr.irq); - BUG_ON(!map || !map->active); + BUG_ON(!map); ret = irq_get_irqchip_state(map->irq, IRQCHIP_STATE_ACTIVE, - &map->active); + &phys_active); WARN_ON(ret); - if (map->active) { - ret = irq_set_irqchip_state(map->irq, - IRQCHIP_STATE_ACTIVE, - false); - WARN_ON(ret); + if (phys_active) return 0; - } - return 1; + spin_lock(&dist->lock); + level_pending = process_queued_irq(vcpu, lr, vlr); + spin_unlock(&dist->lock); + return level_pending; } /* Sync back the VGIC state after a guest run */ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_dist *dist = &vcpu->kvm->arch.vgic; u64 elrsr; unsigned long *elrsr_ptr; @@ -1451,40 +1432,18 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) bool level_pending; level_pending = vgic_process_maintenance(vcpu); - elrsr = vgic_get_elrsr(vcpu); - elrsr_ptr = u64_to_bitmask(&elrsr); /* Deal with HW interrupts, and clear mappings for empty LRs */ for (lr = 0; lr < vgic->nr_lr; lr++) { - struct vgic_lr vlr; - - if (!test_bit(lr, vgic_cpu->lr_used)) - continue; - - vlr = vgic_get_lr(vcpu, lr); - if (vgic_sync_hwirq(vcpu, vlr)) { - /* - * So this is a HW interrupt that the guest - * EOI-ed. Clean the LR state and allow the - * interrupt to be sampled again. - */ - vlr.state = 0; - vlr.hwirq = 0; - vgic_set_lr(vcpu, lr, vlr); - vgic_irq_clear_queued(vcpu, vlr.irq); - set_bit(lr, elrsr_ptr); - } - - if (!test_bit(lr, elrsr_ptr)) - continue; - - clear_bit(lr, vgic_cpu->lr_used); + struct vgic_lr vlr = vgic_get_lr(vcpu, lr); + level_pending |= vgic_sync_hwirq(vcpu, lr, vlr); BUG_ON(vlr.irq >= dist->nr_irqs); - vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY; } /* Check if we still have something up our sleeve... */ + elrsr = vgic_get_elrsr(vcpu); + elrsr_ptr = u64_to_bitmask(&elrsr); pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr); if (level_pending || pending < vgic->nr_lr) set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu); @@ -1574,6 +1533,8 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, int enabled; bool ret = true, can_inject = true; + trace_vgic_update_irq_pending(cpuid, irq_num, level); + if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020)) return -EINVAL; @@ -1607,8 +1568,12 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid, } else { if (level_triggered) { vgic_dist_irq_clear_level(vcpu, irq_num); - if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) + if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) { vgic_dist_irq_clear_pending(vcpu, irq_num); + vgic_cpu_irq_clear(vcpu, irq_num); + if (!compute_pending_for_cpu(vcpu)) + clear_bit(cpuid, dist->irq_pending_on_cpu); + } } ret = false; @@ -1848,30 +1813,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu) kfree(entry); } -/** - * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ - * - * Return the logical active state of a mapped interrupt. This doesn't - * necessarily reflects the current HW state. - */ -bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map) -{ - BUG_ON(!map); - return map->active; -} - -/** - * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ - * - * Set the logical active state of a mapped interrupt. This doesn't - * immediately affects the HW state. - */ -void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active) -{ - BUG_ON(!map); - map->active = active; -} - /** * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping * @vcpu: The VCPU pointer @@ -1927,12 +1868,10 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->pending_shared); kfree(vgic_cpu->active_shared); kfree(vgic_cpu->pend_act_shared); - kfree(vgic_cpu->vgic_irq_lr_map); vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list); vgic_cpu->pending_shared = NULL; vgic_cpu->active_shared = NULL; vgic_cpu->pend_act_shared = NULL; - vgic_cpu->vgic_irq_lr_map = NULL; } static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) @@ -1943,18 +1882,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs) vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL); vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL); - vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL); if (!vgic_cpu->pending_shared || !vgic_cpu->active_shared - || !vgic_cpu->pend_act_shared - || !vgic_cpu->vgic_irq_lr_map) { + || !vgic_cpu->pend_act_shared) { kvm_vgic_vcpu_destroy(vcpu); return -ENOMEM; } - memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs); - /* * Store the number of LRs per vcpu, so we don't have to go * all the way to the distributor structure to find out. Only @@ -2096,14 +2031,24 @@ int vgic_init(struct kvm *kvm) break; } - for (i = 0; i < dist->nr_irqs; i++) { - if (i < VGIC_NR_PPIS) + /* + * Enable and configure all SGIs to be edge-triggere and + * configure all PPIs as level-triggered. + */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + if (i < VGIC_NR_SGIS) { + /* SGIs */ vgic_bitmap_set_irq_val(&dist->irq_enabled, vcpu->vcpu_id, i, 1); - if (i < VGIC_NR_PRIVATE_IRQS) vgic_bitmap_set_irq_val(&dist->irq_cfg, vcpu->vcpu_id, i, VGIC_CFG_EDGE); + } else if (i < VGIC_NR_PRIVATE_IRQS) { + /* PPIs */ + vgic_bitmap_set_irq_val(&dist->irq_cfg, + vcpu->vcpu_id, i, + VGIC_CFG_LEVEL); + } } vgic_enable(vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a75502c93c3e..484079efea5b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2021,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + kvm_arch_vcpu_blocking(vcpu); + for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); @@ -2034,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); cur = ktime_get(); + kvm_arch_vcpu_unblocking(vcpu); out: block_ns = ktime_to_ns(cur) - ktime_to_ns(start);