From 91269b8f94eedce1767b2f208d656e5a5683326a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 25 Jul 2010 14:51:16 +0300 Subject: KVM: x86 emulator: fix handling for unemulated instructions If an instruction is present in the decode tables but not in the execution switch, it will be emulated as a NOP. An example is IRET (0xcf). Fix by adding default: labels to the execution switches. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 66ca98aafdd6..70e47d3593d8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3028,6 +3028,8 @@ special_insn: if (c->modrm_reg == 5) goto jump_far; goto grp45; + default: + goto cannot_emulate; } writeback: @@ -3353,6 +3355,8 @@ twobyte_insn: if (rc != X86EMUL_CONTINUE) goto done; break; + default: + goto cannot_emulate; } goto writeback; -- cgit v1.2.3 From 83babbca4617ab086621fe65a71a2168420f1d88 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:39 +0300 Subject: KVM: x86 emulator: add macros for repetitive instructions Some instructions are repetitive in the opcode space, add macros for consolidating them. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 70e47d3593d8..c5c42e041e48 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -94,6 +94,15 @@ #define Src2One (3<<29) #define Src2Mask (7<<29) +#define X2(x) (x), (x) +#define X3(x) X2(x), (x) +#define X4(x) X2(x), X2(x) +#define X5(x) X4(x), (x) +#define X6(x) X4(x), X2(x) +#define X7(x) X4(x), X3(x) +#define X8(x) X4(x), X4(x) +#define X16(x) X8(x), X8(x) + enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, -- cgit v1.2.3 From 749358a6b4691bfd2abfa9e4be2142af4697de3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:40 +0300 Subject: KVM: x86 emulator: consolidate inc/dec reg decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c5c42e041e48..65d896015456 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -147,10 +147,8 @@ static u32 opcode_table[256] = { ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x40 - 0x4F */ + X16(DstReg), /* 0x50 - 0x57 */ SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, -- cgit v1.2.3 From 3849186c381e2e6291828579c382662520b44696 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:41 +0300 Subject: KVM: x86 emulator: consolidate push/pop reg decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo 
Tosatti --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 65d896015456..68e5b73d22ab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -150,11 +150,9 @@ static u32 opcode_table[256] = { /* 0x40 - 0x4F */ X16(DstReg), /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + X8(SrcReg | Stack), /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + X8(DstReg | Stack), /* 0x60 - 0x67 */ ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , -- cgit v1.2.3 From b3ab3405fe3d40ae9c5350ee014c7c086fcf3d97 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:42 +0300 Subject: KVM: x86 emulator: consolidate Jcc rel8 decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 68e5b73d22ab..78708211f18b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -161,12 +161,8 @@ static u32 opcode_table[256] = { SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x78 - 0x7F */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x70 - 0x7F */ + X16(SrcImmByte), /* 0x80 - 0x87 */ Group | Group1_80, Group | Group1_81, Group | Group1_82, 
Group | Group1_83, -- cgit v1.2.3 From b6e6153885d6463896d9b465e59b361eac60efa0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:43 +0300 Subject: KVM: x86 emulator: consolidate MOV reg, imm decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 78708211f18b..a6ce7f1cf8ff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -188,15 +188,9 @@ static u32 opcode_table[256] = { ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + X8(ByteOp | DstReg | SrcImm | Mov), /* 0xB8 - 0xBF */ - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + X8(DstReg | SrcImm | Mov), /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, -- cgit v1.2.3 From be8eacddbd8ee60506a6f940b3efb93cb61d7861 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:44 +0300 Subject: KVM: x86 emulator: consolidate CMOVcc decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a6ce7f1cf8ff..0526be187191 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -238,16 +238,8 @@ static u32 twobyte_table[256] = { ImplicitOps | Priv, 0, 
ImplicitOps | Priv, 0, ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x40 - 0x4F */ + X16(DstReg | SrcMem | ModRM | Mov), /* 0x50 - 0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 - 0x6F */ -- cgit v1.2.3 From 880a1883785d37287e13e4faf3fe92b294404de0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:45 +0300 Subject: KVM: x86 emulator: consolidate Jcc rel32 decoding Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0526be187191..fd4073546cac 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -247,8 +247,7 @@ static u32 twobyte_table[256] = { /* 0x70 - 0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */ - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + X16(SrcImm), /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ -- cgit v1.2.3 From 2ce495365f6cdd5792c4db0ddb8ac8544950b671 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:46 +0300 Subject: KVM: x86 emulator: Make group storage bits separate from operand bits Currently group bits are stored in bits 0:7, where operand bits are stored. 
Make group bits be 0:3, and move the existing bits 0:3 to 16:19, so we can mix group and operand bits. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fd4073546cac..61139e20b899 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -46,15 +46,15 @@ */ /* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ +#define ByteOp (1<<16) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstDI (5<<1) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<1) /* 64bit memory operand */ -#define DstMask (7<<1) +#define ImplicitOps (1<<17) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<17) /* Register operand. */ +#define DstMem (3<<17) /* Memory operand. */ +#define DstAcc (4<<17) /* Destination Accumulator */ +#define DstDI (5<<17) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<17) /* 64bit memory operand */ +#define DstMask (7<<17) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. 
*/ @@ -82,7 +82,7 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0xff /* Group number stored in bits 0:7 */ +#define GroupMask 0x0f /* Group number stored in bits 0:3 */ /* Misc flags */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ -- cgit v1.2.3 From 047a4818094217a1323d8f31f9318ea2e142f745 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:47 +0300 Subject: KVM: x86 emulator: add Undefined decode flag Add a decode flag to indicate the instruction is invalid. Will come in useful later, when we mix decode bits from the opcode and group table. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 61139e20b899..b1e3e8c2aff5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -84,6 +84,7 @@ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0x0f /* Group number stored in bits 0:3 */ /* Misc flags */ +#define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) @@ -1065,7 +1066,7 @@ done_prefixes: } /* Unrecognised? 
*/ - if (c->d == 0) { + if (c->d == 0 || (c->d & Undefined)) { DPRINTF("Cannot emulate %02x\n", c->b); return -1; } -- cgit v1.2.3 From 52811d7de565b2db988257591fbf2a6be31c1459 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:48 +0300 Subject: KVM: x86 emulator: mix decode bits from opcode and group decode tables Allow bits that are common to all members of a group to be specified in the opcode table instead of the group table. This allows some simplification of the decode tables. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b1e3e8c2aff5..ef2b5af33a37 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -955,7 +955,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group; + int def_op_bytes, def_ad_bytes, group, dual; /* we cannot decode insn before we complete previous rep insn */ @@ -1055,14 +1055,16 @@ done_prefixes: if (c->d & Group) { group = c->d & GroupMask; + dual = c->d & GroupDual; c->modrm = insn_fetch(u8, 1, c->eip); --c->eip; group = (group << 3) + ((c->modrm >> 3) & 7); - if ((c->d & GroupDual) && (c->modrm >> 6) == 3) - c->d = group2_table[group]; + c->d &= ~(Group | GroupDual | GroupMask); + if (dual && (c->modrm >> 6) == 3) + c->d |= group2_table[group]; else - c->d = group_table[group]; + c->d |= group_table[group]; } /* Unrecognised? */ -- cgit v1.2.3 From 4968ec4e26007770d8759fbface4d4712a27b5d4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:49 +0300 Subject: KVM: x86 emulator: simplify Group 1 decoding Move operand decoding to the opcode table, keep lock decoding in the group table. 
This allows us to consolidate the four variants of Group 1 into one group. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 47 +++++++---------------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ef2b5af33a37..1ce9c6de0aea 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,8 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - Group1_80, Group1_81, Group1_82, Group1_83, - Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group1, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, Group8, Group9, }; @@ -165,8 +164,10 @@ static u32 opcode_table[256] = { /* 0x70 - 0x7F */ X16(SrcImmByte), /* 0x80 - 0x87 */ - Group | Group1_80, Group | Group1_81, - Group | Group1_82, Group | Group1_83, + ByteOp | DstMem | SrcImm | ModRM | Group | Group1, + DstMem | SrcImm | ModRM | Group | Group1, + ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1, + DstMem | SrcImmByte | ModRM | Group | Group1, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ @@ -285,42 +286,8 @@ static u32 twobyte_table[256] = { }; static u32 group_table[] = { - [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM, - [Group1_81*8] = - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM, - [Group1_82*8] = - ByteOp | DstMem | SrcImm
| ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64, - [Group1_83*8] = - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM, + [Group1*8] = + X7(Lock), 0, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = -- cgit v1.2.3 From dfe11481d8f1b6a7354c34cb252ff1a8af233cfe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:50 +0300 Subject: KVM: x86 emulator: Allow LOCK prefix for NEG and NOT Opcodes F6/2, F6/3, F7/2, F7/3. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1ce9c6de0aea..bbe2d097c4ae 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -292,11 +292,11 @@ static u32 group_table[] = { DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, [Group4*8] = ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, -- cgit v1.2.3 From e071edd5ba8dd7a493eef229d495cf6232b09534 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 14:37:51 +0300 Subject: KVM: x86 emulator: unify the two Group 3 variants Use just one group table for byte (F6) and word (F7) opcodes. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bbe2d097c4ae..7f615c57cbad 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,8 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - Group1, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, - Group8, Group9, + Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; static u32 opcode_table[256] = { @@ -217,7 +216,7 @@ static u32 opcode_table[256] = { SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, ByteOp | Group | Group3, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -290,14 +289,10 @@ static u32 group_table[] = { X7(Lock), 0, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, - [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, - ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, + X4(Undefined), [Group4*8] = ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, -- cgit v1.2.3 From d359192feaf02861327339a9dda6b2b2d765c2bc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 Jul 2010 18:32:39 +0300 Subject: KVM: VMX: Use host_gdt variable wherever we need the host gdt Now that we have the host gdt conveniently stored in a variable, make use of it instead of querying the cpu. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bddfab12013..751a2d29f4ce 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -706,11 +706,10 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. */ - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *descs; - native_store_gdt(&gdt); - descs = (void *)gdt.address; + descs = (void *)gdt->address; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -753,7 +752,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -761,8 +760,7 @@ static unsigned long segment_base(u16 selector) if (!(selector & ~3)) return 0; - native_store_gdt(&gdt); - table_base = gdt.address; + table_base = gdt->address; if (selector & 4) { /* from ldt */ u16 ldt_selector = kvm_read_ldt(); @@ -897,7 +895,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vcpu->cpu != cpu) { - struct desc_ptr dt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; kvm_migrate_timers(vcpu); @@ -913,8 +911,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. 
*/ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - native_store_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ + vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ -- cgit v1.2.3 From 19ada5c4b6170bbc7ac4f2f38dba0068fdc7755a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 27 Jul 2010 11:21:18 +0800 Subject: KVM: MMU: remove valueless output message After commit 53383eaad08d, '*spte' has already been updated before rmap_remove() is called (in most cases to 'shadow_trap_nonpresent_pte'), so remove this information from the error messages. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 311f6dad8951..82f7622c17d3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -645,18 +645,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); BUG(); } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + rmap_printk("rmap_remove: %p 1->0\n", spte); if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); + printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); BUG(); } *rmapp = 0; } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + rmap_printk("rmap_remove: %p many->many\n", spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { @@ -670,7 +669,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } - pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); +
pr_err("rmap_remove: %p many->many\n", spte); BUG(); } } -- cgit v1.2.3 From 3f6a9d1693deaeef28d98109bc92c98dd94a8523 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 18:14:20 +0200 Subject: KVM: SVM: Sync efer back into nested vmcb This patch fixes a bug in a nested hypervisor that heavily switches between real-mode and long-mode. The problem is fixed by syncing back efer into the guest vmcb on emulated vmexit. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a3f9f64f86f..09704a0501d7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1896,6 +1896,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.efer = svm->vcpu.arch.efer; nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; -- cgit v1.2.3 From 7a190667bb316653cbb782fff95cfdfcf51ded45 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 27 Jul 2010 18:14:21 +0200 Subject: KVM: SVM: Emulate next_rip svm feature This patch implements the emulations of the svm next_rip feature in the nested svm implementation in kvm. 
Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09704a0501d7..116e0341bf4c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1918,6 +1918,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -3360,7 +3361,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Do not support any additional features */ + entry->edx = 0; /* Per default do not support any + additional features */ + + /* Support next_rip if host supports it */ + if (svm_has(SVM_FEATURE_NRIP)) + entry->edx |= SVM_FEATURE_NRIP; break; } -- cgit v1.2.3 From 62bd430e6d41ac84ff2fb719f5783c3692718f47 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 28 Jul 2010 12:38:40 +0300 Subject: KVM: x86 emulator: Add IRET instruction This patch adds IRET instruction (opcode 0xcf). Currently, only IRET in real mode is emulated. Protected mode support is to be added later if needed.
Signed-off-by: Mohammed Gamal Reviewed-by: Avi Kivity Reviewed-by: Paolo Bonzini Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7f615c57cbad..b0f45bc63e1c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -341,6 +341,9 @@ static u32 group2_table[] = { #define EFLG_PF (1<<2) #define EFLG_CF (1<<0) +#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a +#define EFLG_RESERVED_ONE_MASK 2 + /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -1729,6 +1732,78 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; + + /* TODO: Add stack limit check */ + + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (temp_eip & ~0xffff) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = temp_eip; + + + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes 
== 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; + } + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; +} + +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -2857,6 +2932,12 @@ special_insn: break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xcf: /* iret */ + rc = emulate_iret(ctxt, ops); + if (rc != X86EMUL_CONTINUE) goto done; break; -- cgit v1.2.3 From ea9ef04e19c7c441b1ce9fe28ff6d9522c848baa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:34 +0300 Subject: KVM: x86 emulator: drop parentheses in repeat macros The parentheses make it impossible to use the macros with initializers that require braces.
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b0f45bc63e1c..3bfba9480975 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -95,10 +95,10 @@ #define Src2One (3<<29) #define Src2Mask (7<<29) -#define X2(x) (x), (x) -#define X3(x) X2(x), (x) +#define X2(x) x, x +#define X3(x) X2(x), x #define X4(x) X2(x), X2(x) -#define X5(x) X4(x), (x) +#define X5(x) X4(x), x #define X6(x) X4(x), X2(x) #define X7(x) X4(x), X3(x) #define X8(x) X4(x), X4(x) -- cgit v1.2.3 From d65b1dee408243daa45110ee494d204508d31657 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:35 +0300 Subject: KVM: x86 emulator: introduce 'struct opcode' This will hold all the information known about the opcode. Currently, this is just the decode flags. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3bfba9480975..da7df34036ca 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -108,7 +108,11 @@ enum { Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; -static u32 opcode_table[256] = { +struct opcode { + u32 flags; +}; + +static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -222,7 +226,7 @@ static u32 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; -static u32 twobyte_table[256] = { +static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps | Priv, 0, @@ -284,7 +288,7 @@ static u32 twobyte_table[256] = { 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static u32 group_table[] = { +static struct opcode group_table[] = { [Group1*8] = X7(Lock), 0, [Group1A*8] = @@ -313,7 +317,7 @@ static u32 group_table[] = { 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; -static u32 group2_table[] = { +static struct opcode group2_table[] = { [Group7*8] = SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, SrcNone | ModRM | DstMem | Mov, 0, @@ -1008,13 +1012,13 @@ done_prefixes: c->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ - c->d = opcode_table[c->b]; + c->d = opcode_table[c->b].flags; if (c->d == 0) { /* Two-byte opcode? */ if (c->b == 0x0f) { c->twobyte = 1; c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; + c->d = twobyte_table[c->b].flags; } } @@ -1027,9 +1031,9 @@ done_prefixes: group = (group << 3) + ((c->modrm >> 3) & 7); c->d &= ~(Group | GroupDual | GroupMask); if (dual && (c->modrm >> 6) == 3) - c->d |= group2_table[group]; + c->d |= group2_table[group].flags; else - c->d |= group_table[group]; + c->d |= group_table[group].flags; } /* Unrecognised? */ -- cgit v1.2.3 From fd853310a1ebaef257956208165873494bb805dc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:36 +0300 Subject: KVM: x86 emulator: Add wrappers for easily defining opcodes Once 'struct opcode' grows, its initializer will become more complicated. Wrap the simple initializers in a D() macro, and replace the empty initializers with an even simpler N macro. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 294 +++++++++++++++++++++++++------------------------ 1 file changed, 150 insertions(+), 144 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index da7df34036ca..7059b1611970 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -112,220 +112,226 @@ struct opcode { u32 flags; }; +#define D(_y) { .flags = (_y) } +#define N D(0) + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), 
D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - 0, 0, + 
D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + N, N, /* 0x40 - 0x4F */ - X16(DstReg), + X16(D(DstReg)), /* 0x50 - 0x57 */ - X8(SrcReg | Stack), + X8(D(SrcReg | Stack)), /* 0x58 - 0x5F */ - X8(DstReg | Stack), + X8(D(DstReg | Stack)), /* 0x60 - 0x67 */ - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, /* 0x68 - 0x6F */ - SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ - SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ + D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, + D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ + D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ - X16(SrcImmByte), + X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM | Group | Group1, - DstMem | SrcImm | ModRM | Group | Group1, - ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1, - DstMem | SrcImmByte | ModRM | Group | Group1, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + D(ByteOp | DstMem | SrcImm | ModRM | Group | Group1), + D(DstMem | SrcImm | ModRM | Group | Group1), + D(ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1), + D(DstMem | SrcImmByte | ModRM | Group | Group1), + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | 
Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem16 | ModRM, Group | Group1A, + D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), + D(ImplicitOps | SrcMem16 | ModRM), D(Group | Group1A), /* 0x90 - 0x97 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), /* 0x98 - 0x9F */ - 0, 0, SrcImmFAddr | No64, 0, - ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + N, N, D(SrcImmFAddr | No64), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ - ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, - ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, - ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, + D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), + D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), + D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), + D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ - DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, - ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, - ByteOp | DstDI | String, DstDI | String, + D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), + D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), + D(ByteOp | DstDI | String), D(DstDI | String), /* 0xB0 - 0xB7 */ - X8(ByteOp | DstReg | SrcImm | Mov), + X8(D(ByteOp | 
DstReg | SrcImm | Mov)), /* 0xB8 - 0xBF */ - X8(DstReg | SrcImm | Mov), + X8(D(DstReg | SrcImm | Mov)), /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), + N, D(ImplicitOps | Stack), N, N, + D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + N, N, N, N, /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, + N, N, N, N, + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), /* 0xE8 - 0xEF */ - SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmFAddr | No64, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps | Priv, ImplicitOps, ByteOp | Group | Group3, Group | Group3, + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), D(ByteOp | Group | Group3), D(Group | Group3), 
/* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, + D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5), }; static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, - 0, ImplicitOps, ImplicitOps | Priv, 0, - ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, - 0, ImplicitOps | ModRM, 0, 0, + N, D(Group | GroupDual | Group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - ModRM | ImplicitOps | Priv, ModRM | Priv, - ModRM | ImplicitOps | Priv, ModRM | Priv, - 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + N, N, N, N, + N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, - ImplicitOps, ImplicitOps | Priv, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ - X16(DstReg | SrcMem | ModRM | Mov), + X16(D(DstReg | SrcMem | ModRM | Mov)), /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x80 - 0x8F */ - X16(SrcImm), + X16(D(SrcImm)), /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xA0 - 0xA7 */ - 
ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, 0, 0, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, - ModRM, 0, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), N, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ - 0, 0, - Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, + N, N, + D(Group | Group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, - 0, 0, 0, Group | GroupDual | Group9, - 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, D(Group | GroupDual | Group9), + N, N, N, N, N, N, N, N, /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xF0 - 0xFF */ - 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; static struct opcode group_table[] = { [Group1*8] = - X7(Lock), 0, + X7(D(Lock)), N, [Group1A*8] = - DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, [Group3*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - X4(Undefined), + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(Undefined)), [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, 0, 0, + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, [Group5*8] = - DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, - SrcMem | ModRM | Stack, 0, + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, [Group7*8] = - 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), [Group8*8] = - 0, 0, 0, 0, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), [Group9*8] = - 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, + 
N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, }; static struct opcode group2_table[] = { [Group7*8] = - SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, 0, + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, [Group9*8] = - 0, 0, 0, 0, 0, 0, 0, 0, + N, N, N, N, N, N, N, N, }; +#undef D +#undef N + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) -- cgit v1.2.3 From 42a1c5209570ead6d89abecd99ab12947a41d20a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:37 +0300 Subject: KVM: x86 emulator: move group tables to top No code changes. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 76 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7059b1611970..edf093861105 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -115,6 +115,44 @@ struct opcode { #define D(_y) { .flags = (_y) } #define N D(0) +static struct opcode group_table[] = { + [Group1*8] = + X7(D(Lock)), N, + [Group1A*8] = + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, + [Group3*8] = + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(Undefined)), + [Group4*8] = + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, + [Group5*8] = + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, + [Group7*8] = + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + 
D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), + [Group8*8] = + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), + [Group9*8] = + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}; + +static struct opcode group2_table[] = { + [Group7*8] = + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, + [Group9*8] = + N, N, N, N, N, N, N, N, +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), @@ -291,44 +329,6 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N }; -static struct opcode group_table[] = { - [Group1*8] = - X7(D(Lock)), N, - [Group1A*8] = - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, - [Group3*8] = - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(Undefined)), - [Group4*8] = - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), - N, N, N, N, N, N, - [Group5*8] = - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), N, - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, - [Group7*8] = - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), - [Group8*8] = - N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), - [Group9*8] = - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, -}; - -static struct opcode group2_table[] = { - [Group7*8] = - D(SrcNone 
| ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, - [Group9*8] = - N, N, N, N, N, N, N, N, -}; - #undef D #undef N -- cgit v1.2.3 From 793d5a8d6baad9062b0a03e034944b31e50dfe5c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:38 +0300 Subject: KVM: x86 emulator: reserve group code 0 We'll be using that to distinguish between new-style and old-style groups. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index edf093861105..5e496127a01c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; struct opcode { -- cgit v1.2.3 From 120df8902dbe91cc1b3b7886481e350fae7334fe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:39 +0300 Subject: KVM: x86 emulator: allow specifying group directly in opcode Instead of having a group number, store the group table pointer directly in the opcode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e496127a01c..f3b984427d10 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -110,10 +110,21 @@ enum { struct opcode { u32 flags; + union { + struct opcode *group; + struct group_dual *gdual; + } u; +}; + +struct group_dual { + struct opcode mod012[8]; + struct opcode mod3[8]; }; #define D(_y) { .flags = (_y) } #define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } static struct opcode group_table[] = { [Group1*8] = @@ -331,6 +342,8 @@ static struct opcode twobyte_table[256] = { #undef D #undef N +#undef G +#undef GD /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) @@ -930,8 +943,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group, dual; - + int def_op_bytes, def_ad_bytes, group, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); @@ -1018,15 +1031,16 @@ done_prefixes: c->op_bytes = 8; /* REX.W */ /* Opcode byte(s). */ - c->d = opcode_table[c->b].flags; - if (c->d == 0) { + opcode = opcode_table[c->b]; + if (opcode.flags == 0) { /* Two-byte opcode? 
*/ if (c->b == 0x0f) { c->twobyte = 1; c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b].flags; + opcode = twobyte_table[c->b]; } } + c->d = opcode.flags; if (c->d & Group) { group = c->d & GroupMask; @@ -1034,12 +1048,27 @@ done_prefixes: c->modrm = insn_fetch(u8, 1, c->eip); --c->eip; - group = (group << 3) + ((c->modrm >> 3) & 7); + if (group) { + g_mod012 = g_mod3 = &group_table[group * 8]; + if (c->d & GroupDual) + g_mod3 = &group2_table[group * 8]; + } else { + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + } + c->d &= ~(Group | GroupDual | GroupMask); - if (dual && (c->modrm >> 6) == 3) - c->d |= group2_table[group].flags; + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; else - c->d |= group_table[group].flags; + opcode = g_mod012[goffset]; + c->d |= opcode.flags; } /* Unrecognised? */ -- cgit v1.2.3 From 5b92b5faff8ec66c75f3716ae7c4bf1e2b99d7e6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:40 +0300 Subject: KVM: x86 emulator: convert group 1 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f3b984427d10..6cc4af1b59ff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group1, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, }; struct opcode { @@ -126,9 +126,11 @@ struct group_dual { #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +static struct opcode group1[] = { + X7(D(Lock)), N +}; + static struct opcode group_table[] = { - [Group1*8] = - 
X7(D(Lock)), N, [Group1A*8] = D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, [Group3*8] = @@ -219,10 +221,10 @@ static struct opcode opcode_table[256] = { /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ - D(ByteOp | DstMem | SrcImm | ModRM | Group | Group1), - D(DstMem | SrcImm | ModRM | Group | Group1), - D(ByteOp | DstMem | SrcImm | ModRM | No64 | Group | Group1), - D(DstMem | SrcImmByte | ModRM | Group | Group1), + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ -- cgit v1.2.3 From 99880c5cd54b28a26fd6ed949f545cc0075e4393 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:41 +0300 Subject: KVM: x86 emulator: convert group 1A to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6cc4af1b59ff..618fdc8c8d06 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group1A, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group3, Group4, Group5, Group7, Group8, Group9, }; struct opcode { @@ -130,9 +130,11 @@ static struct opcode group1[] = { X7(D(Lock)), N }; -static struct opcode group_table[] = { - [Group1A*8] = +static struct opcode group1A[] = { D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group_table[] = { [Group3*8] = D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), @@ -231,7 +233,7 @@ static struct 
opcode opcode_table[256] = { D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), - D(ImplicitOps | SrcMem16 | ModRM), D(Group | Group1A), + D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), /* 0x98 - 0x9F */ -- cgit v1.2.3 From ee70ea30ee81dda2cf5fbc2e143ce3cb303187ce Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:42 +0300 Subject: KVM: x86 emulator: convert group 3 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 618fdc8c8d06..a0606a408add 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group3, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group4, Group5, Group7, Group8, Group9, }; struct opcode { @@ -134,11 +134,13 @@ static struct opcode group1A[] = { D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, }; -static struct opcode group_table[] = { - [Group3*8] = +static struct opcode group3[] = { D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), X4(D(Undefined)), +}; + +static struct opcode group_table[] = { [Group4*8] = D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), N, N, N, N, N, N, @@ -276,7 +278,7 @@ static struct opcode opcode_table[256] = { D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), /* 0xF0 - 0xF7 */ N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), D(ByteOp | Group | Group3), D(Group | Group3), + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), 
G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5), -- cgit v1.2.3 From 591c9d20a37db54c7234742bff925cb2e6fdca4b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:43 +0300 Subject: KVM: x86 emulator: convert group 4 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a0606a408add..8bb74ea2b278 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group4, Group5, Group7, Group8, Group9, + NoGrp, Group5, Group7, Group8, Group9, }; struct opcode { @@ -140,10 +140,12 @@ static struct opcode group3[] = { X4(D(Undefined)), }; -static struct opcode group_table[] = { - [Group4*8] = +static struct opcode group4[] = { D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), N, N, N, N, N, N, +}; + +static struct opcode group_table[] = { [Group5*8] = D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), D(SrcMem | ModRM | Stack), N, @@ -281,7 +283,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), D(Group | Group4), D(Group | Group5), + D(ImplicitOps), D(ImplicitOps), G(0, group4), D(Group | Group5), }; static struct opcode twobyte_table[256] = { -- cgit v1.2.3 From b67f9f0741e288c97f73cdc9e39e2c4943004332 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:44 +0300 Subject: KVM: x86 emulator: convert group 5 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8bb74ea2b278..9674d973b99d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group5, Group7, Group8, Group9, + NoGrp, Group7, Group8, Group9, }; struct opcode { @@ -145,12 +145,14 @@ static struct opcode group4[] = { N, N, N, N, N, N, }; -static struct opcode group_table[] = { - [Group5*8] = +static struct opcode group5[] = { D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), D(SrcMem | ModRM | Stack), N, D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), D(SrcMem | ModRM | Stack), N, +}; + +static struct opcode group_table[] = { [Group7*8] = N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), D(SrcNone | ModRM | DstMem | Mov), N, @@ -283,7 +285,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), G(0, group4), D(Group | Group5), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; static struct opcode twobyte_table[256] = { -- cgit v1.2.3 From 2f3a9bc9ebd42e00929f370e1a56e40028a8d651 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:45 +0300 Subject: KVM: x86 emulator: convert group 7 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9674d973b99d..5e7a02df18bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group7, Group8, Group9, + NoGrp, Group8, Group9, }; struct opcode { @@ -152,11 +152,17 @@ static struct opcode group5[] = { D(SrcMem | 
ModRM | Stack), N, }; -static struct opcode group_table[] = { - [Group7*8] = +static struct group_dual group7 = { { N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), D(SrcNone | ModRM | DstMem | Mov), N, D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group_table[] = { [Group8*8] = N, N, N, N, D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), @@ -166,10 +172,6 @@ static struct opcode group_table[] = { }; static struct opcode group2_table[] = { - [Group7*8] = - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, [Group9*8] = N, N, N, N, N, N, N, N, }; @@ -290,7 +292,7 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - N, D(Group | GroupDual | Group7), N, N, + N, GD(0, &group7), N, N, N, D(ImplicitOps), D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, N, D(ImplicitOps | ModRM), N, N, -- cgit v1.2.3 From 2cb20bc8af313b400e5c2c94886e0d87e2ec4e4d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:46 +0300 Subject: KVM: x86 emulator: convert group 8 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5e7a02df18bb..b5599b5cac97 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group8, Group9, + NoGrp, Group9, }; struct opcode { @@ -162,11 +162,13 @@ static struct group_dual group7 = { { D(SrcMem16 | ModRM | Mov | Priv), N, } }; -static struct opcode group_table[] = { - [Group8*8] = 
+static struct opcode group8[] = { N, N, N, N, D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct opcode group_table[] = { [Group9*8] = N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, }; @@ -337,7 +339,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - D(Group | Group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ -- cgit v1.2.3 From 9f5d3220e3047536f702ed67309f6a581c0bed8b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:47 +0300 Subject: KVM: x86 emulator: convert group 9 to new style Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b5599b5cac97..2fe731c82299 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -105,7 +105,7 @@ #define X16(x) X8(x), X8(x) enum { - NoGrp, Group9, + NoGrp, }; struct opcode { @@ -168,14 +168,16 @@ static struct opcode group8[] = { D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), }; -static struct opcode group_table[] = { - [Group9*8] = +static struct group_dual group9 = { { N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode group_table[] = { }; static struct opcode group2_table[] = { - [Group9*8] = - N, N, N, N, N, N, N, N, }; static struct opcode opcode_table[256] = { @@ -344,7 +346,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ N, N, N, D(DstMem | SrcReg | ModRM | Mov), - N, N, N, D(Group | GroupDual | Group9), + N, N, N, GD(0, 
&group9), N, N, N, N, N, N, N, N, /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, -- cgit v1.2.3 From 3885d530b0eb26c82b6f085c181442b0aa6f8fed Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:48 +0300 Subject: KVM: x86 emulator: drop support for old-style groups Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2fe731c82299..20a7a167df1d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -82,7 +82,6 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0x0f /* Group number stored in bits 0:3 */ /* Misc flags */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ @@ -104,10 +103,6 @@ #define X8(x) X4(x), X4(x) #define X16(x) X8(x), X8(x) -enum { - NoGrp, -}; - struct opcode { u32 flags; union { @@ -174,12 +169,6 @@ static struct group_dual group9 = { { N, N, N, N, N, N, N, N, } }; -static struct opcode group_table[] = { -}; - -static struct opcode group2_table[] = { -}; - static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), @@ -959,7 +948,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group, dual, goffset; + int def_op_bytes, def_ad_bytes, dual, goffset; struct opcode opcode, *g_mod012, *g_mod3; /* we cannot decode insn before we complete previous rep insn */ @@ -1059,24 +1048,17 @@ done_prefixes: c->d = opcode.flags; if (c->d & Group) { - 
group = c->d & GroupMask; dual = c->d & GroupDual; c->modrm = insn_fetch(u8, 1, c->eip); --c->eip; - if (group) { - g_mod012 = g_mod3 = &group_table[group * 8]; - if (c->d & GroupDual) - g_mod3 = &group2_table[group * 8]; - } else { - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - } + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; - c->d &= ~(Group | GroupDual | GroupMask); + c->d &= ~(Group | GroupDual); goffset = (c->modrm >> 3) & 7; -- cgit v1.2.3 From ab85b12b1a7fd125588f9447653a71ec8e1b5024 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:49 +0300 Subject: KVM: x86 emulator: move ByteOp and Dst back to bits 0:3 Now that the group index no longer exists, the space is free. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 20a7a167df1d..d7e3ea4797f1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -46,15 +46,15 @@ */ /* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<16) /* 8-bit operands. */ +#define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<17) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<17) /* Register operand. */ -#define DstMem (3<<17) /* Memory operand. */ -#define DstAcc (4<<17) /* Destination Accumulator */ -#define DstDI (5<<17) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<17) /* 64bit memory operand */ -#define DstMask (7<<17) +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. 
*/ +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstDI (5<<1) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ #define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ -- cgit v1.2.3 From 9aabc88fc8687ba3a520e2ec459821d05f72474e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:50 +0300 Subject: KVM: x86 emulator: store x86_emulate_ops in emulation context It doesn't ever change, so we don't need to pass it around everywhere. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 9 ++++----- arch/x86/kvm/emulate.c | 8 +++++--- arch/x86/kvm/x86.c | 7 ++++--- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1f99ecfc48e1..9ddfa5ed2289 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -208,6 +208,8 @@ struct decode_cache { }; struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + /* Register state before/after emulation. 
*/ struct kvm_vcpu *vcpu; @@ -249,12 +251,9 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d7e3ea4797f1..3689f34a303a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -943,8 +943,9 @@ done: } int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_decode_insn(struct x86_emulate_ctxt *ctxt) { + struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; @@ -2586,10 +2587,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct x86_emulate_ops *ops = ctxt->ops; struct decode_cache *c = &ctxt->decode; int rc; @@ -2619,8 +2620,9 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, } int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { + struct x86_emulate_ops *ops = ctxt->ops; u64 msr_data; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d526..33deb75f16ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3998,7 +3998,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception 
= -1; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_decode_insn(&vcpu->arch.emulate_ctxt); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD @@ -4048,7 +4048,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); if (r) { /* emulation failed */ if (reexecute_instruction(vcpu, cr2)) @@ -5067,7 +5067,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, error_code); @@ -5424,6 +5424,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; vcpu->arch.mmu.root_hpa = INVALID_PAGE; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -- cgit v1.2.3 From ef65c88912cafe56de2737c440aefc764fd8f202 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:51 +0300 Subject: KVM: x86 emulator: allow storing emulator execution function in decode tables Instead of looking up the opcode twice (once for decode flags, once for the big execution switch) look up both flags and function in the decode tables. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 9ddfa5ed2289..0f901c16cf1c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -190,6 +190,7 @@ struct decode_cache { bool has_seg_override; u8 seg_override; unsigned int d; + int (*execute)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3689f34a303a..799e895fb08e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -106,6 +106,7 @@ struct opcode { u32 flags; union { + int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; struct group_dual *gdual; } u; @@ -120,6 +121,7 @@ struct group_dual { #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } static struct opcode group1[] = { X7(D(Lock)), N @@ -349,6 +351,7 @@ static struct opcode twobyte_table[256] = { #undef N #undef G #undef GD +#undef I /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) @@ -1070,6 +1073,8 @@ done_prefixes: c->d |= opcode.flags; } + c->execute = opcode.u.execute; + /* Unrecognised? 
*/ if (c->d == 0 || (c->d & Undefined)) { DPRINTF("Cannot emulate %02x\n", c->b); @@ -2705,6 +2710,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: + if (c->execute) { + rc = c->execute(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } + if (c->twobyte) goto twobyte_insn; -- cgit v1.2.3 From dde7e6d12a9ef9f727d05ce824f4fe75ca2a5b3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:52 +0300 Subject: KVM: x86 emulator: move x86_decode_insn() downwards No code changes. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 1602 ++++++++++++++++++++++++------------------------ 1 file changed, 801 insertions(+), 801 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 799e895fb08e..c6f435917538 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -945,917 +945,545 @@ done: return rc; } -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt) +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) { - struct x86_emulate_ops *ops = ctxt->ops; - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; - struct opcode opcode, *g_mod012, *g_mod3; + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + u32 err; - /* we cannot decode insn before we complete previous rep insn */ - WARN_ON(ctxt->restart); + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; - c->eip = ctxt->eip; - c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; - switch (mode) { - case 
X86EMUL_MODE_REAL: - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; } + return X86EMUL_CONTINUE; +} - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. 
*/ +static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned int size, unsigned short port, + void *dest) +{ + struct read_cache *rc = &ctxt->decode.io_read; - c->rex_prefix = 0; + if (rc->pos == rc->end) { /* refill pio read ahead */ + struct decode_cache *c = &ctxt->decode; + unsigned int in_page, n; + unsigned int count = c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; + in_page = (ctxt->eflags & EFLG_DF) ? + offset_in_page(c->regs[VCPU_REGS_RDI]) : + PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); + n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, + count); + if (n == 0) + n = 1; + rc->pos = rc->end = 0; + if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + return 0; + rc->end = n * size; } -done_prefixes: + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + return 1; +} - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); - /* Opcode byte(s). */ - opcode = opcode_table[c->b]; - if (opcode.flags == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - opcode = twobyte_table[c->b]; - } - } - c->d = opcode.flags; + return desc->g ? (limit << 12) | 0xfff : limit; +} - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; +static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_ptr *dt) +{ + if (selector & 1 << 2) { + struct desc_struct desc; + memset (dt, 0, sizeof *dt); + if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + return; - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; + dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? 
*/ + dt->address = get_desc_base(&desc); + } else + ops->get_gdt(dt, ctxt->vcpu); +} - c->d &= ~(Group | GroupDual); +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + int ret; + u32 err; + ulong addr; - goffset = (c->modrm >> 3) & 7; + get_descriptor_table_ptr(ctxt, ops, selector, &dt); - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; - c->d |= opcode.flags; + if (dt.size < index * 8 + 7) { + emulate_gp(ctxt, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; } + addr = dt.address + index * 8; + ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); - c->execute = opcode.u.execute; + return ret; +} - /* Unrecognised? */ - if (c->d == 0 || (c->d & Undefined)) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } +/* allowed just for 8 bytes segments */ +static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + u32 err; + ulong addr; + int ret; - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; + get_descriptor_table_ptr(ctxt, ops, selector, &dt); - /* ModRM and SIB bytes. 
*/ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; + if (dt.size < index * 8 + 7) { + emulate_gp(ctxt, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); + addr = dt.address + index * 8; + ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, ops, c); + return ret; +} - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, int seg) +{ + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (c->rip_relative) - c->modrm_ea += c->eip; + memset(&seg_desc, 0, sizeof seg_desc); - /* - * Decode and fetch the source operand: register, memory - * or immediate. 
- */ - switch (c->d & SrcMask) { - case SrcNone: + if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) + || ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor */ + set_desc_base(&seg_desc, selector << 4); + set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + goto load; + } + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + if (null_selector) /* for NULL selector skip all following checks */ + goto load; + + ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !seg_desc.s) + goto exception; + + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = seg_desc.dpl; + cpl = ops->cpl(ctxt->vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) + goto exception; break; - case SrcReg: - decode_register_operand(&c->src, c, 0); + case VCPU_SREG_CS: + if (!(seg_desc.type & 8)) + goto exception; + + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 
1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: + case VCPU_SREG_TR: + if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (seg_desc.s || seg_desc.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.val = 0; + if ((seg_desc.type & 0xa) == 0x8 || + (((seg_desc.type & 0xc) != 0xc) && + (rpl > dpl && cpl > dpl))) + goto exception; break; - case SrcImm: - case SrcImmU: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { + } + + if (seg_desc.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } +load: + ops->set_segment_selector(selector, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + return X86EMUL_CONTINUE; +exception: + emulate_exception(ctxt, err_vec, err_code, true); + return X86EMUL_PROPAGATE_FAULT; +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + u32 err; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. 
+ */ + switch (c->dst.bytes) { case 1: - c->src.val = insn_fetch(s8, 1, c->eip); + *(u8 *)c->dst.ptr = (u8)c->dst.val; break; case 2: - c->src.val = insn_fetch(s16, 2, c->eip); + *(u16 *)c->dst.ptr = (u16)c->dst.val; break; case 4: - c->src.val = insn_fetch(s32, 4, c->eip); + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; break; } - if ((c->d & SrcMask) == SrcImmU) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } break; - case SrcImmByte: - case SrcImmUByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); else - c->src.val = insn_fetch(u8, 1, c->eip); - break; - case SrcAcc: - c->src.type = OP_REG; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->src.bytes) { - case 1: - c->src.val = *(u8 *)c->src.ptr; - break; - case 2: - c->src.val = *(u16 *)c->src.ptr; - break; - case 4: - c->src.val = *(u32 *)c->src.ptr; - break; - case 8: - c->src.val = *(u64 *)c->src.ptr; - break; - } - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - case SrcSI: - c->src.type = OP_MEM; - c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, ops, c), - c->regs[VCPU_REGS_RSI]); - c->src.val = 0; + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, + (unsigned long)c->dst.ptr, err); + if (rc != X86EMUL_CONTINUE) + return rc; break; - case SrcImmFAddr: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + case OP_NONE: + /* no writeback */ break; - case SrcMemFAddr: - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.bytes = c->op_bytes + 2; + default: break; } + return X86EMUL_CONTINUE; +} - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - } +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - case DstMem64: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - c->dst.ptr = (unsigned long *)c->modrm_ea; - if ((c->d & DstMask) == DstMem64) - c->dst.bytes = 8; - else - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]); +} - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->dst.bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.ptr; - break; - case 2: - c->dst.val = *(u16 *)c->dst.ptr; - break; - case 4: - c->dst.val = *(u32 *)c->dst.ptr; - break; - case 8: - c->dst.val = *(u64 *)c->dst.ptr; - break; - } - c->dst.orig_val = c->dst.val; - break; - case DstDI: - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt, ops), - c->regs[VCPU_REGS_RDI]); - c->dst.val = 0; - break; - } +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + struct decode_cache *c = &ctxt->decode; + int rc; -done: - return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]), + dest, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); + return rc; } -static int read_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long addr, void *dest, unsigned size) +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) { int rc; - struct read_cache *mc = &ctxt->decode.mem_read; - u32 err; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = ops->cpl(ctxt->vcpu); - while (size) { - int n = min(size, 8u); - size -= n; - if (mc->pos < mc->end) - goto read_cached; + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; - rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - mc->end += n; + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; - read_cached: - memcpy(dest, mc->data + mc->pos, n); - mc->pos += n; - dest += n; - addr += n; + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; } - return X86EMUL_CONTINUE; + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; } -static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned int size, unsigned short port, - 
void *dest) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { - struct read_cache *rc = &ctxt->decode.io_read; + struct decode_cache *c = &ctxt->decode; - if (rc->pos == rc->end) { /* refill pio read ahead */ - struct decode_cache *c = &ctxt->decode; - unsigned int in_page, n; - unsigned int count = c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; - in_page = (ctxt->eflags & EFLG_DF) ? - offset_in_page(c->regs[VCPU_REGS_RDI]) : - PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); - if (n == 0) - n = 1; - rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) - return 0; - rc->end = n * size; - } + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - memcpy(dest, rc->data + rc->pos, size); - rc->pos += size; - return 1; + emulate_push(ctxt, ops); } -static u32 desc_limit_scaled(struct desc_struct *desc) +static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { - u32 limit = get_desc_limit(desc); + struct decode_cache *c = &ctxt->decode; + unsigned long selector; + int rc; - return desc->g ? 
(limit << 12) | 0xfff : limit; + rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); + return rc; } -static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct desc_ptr *dt) +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - if (selector & 1 << 2) { - struct desc_struct desc; - memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) - return; + struct decode_cache *c = &ctxt->decode; + unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; + int reg = VCPU_REGS_RAX; - dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ - dt->address = get_desc_base(&desc); - } else - ops->get_gdt(dt, ctxt->vcpu); -} + while (reg <= VCPU_REGS_RDI) { + (reg == VCPU_REGS_RSP) ? + (c->src.val = old_esp) : (c->src.val = c->regs[reg]); -/* allowed just for 8 bytes segments */ -static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct desc_struct *desc) -{ - struct desc_ptr dt; - u16 index = selector >> 3; - int ret; - u32 err; - ulong addr; + emulate_push(ctxt, ops); - get_descriptor_table_ptr(ctxt, ops, selector, &dt); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; - if (dt.size < index * 8 + 7) { - emulate_gp(ctxt, selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; + ++reg; } - addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); - if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); - return ret; + /* Disable writeback. 
*/ + c->dst.type = OP_NONE; + + return rc; } -/* allowed just for 8 bytes segments */ -static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, struct desc_struct *desc) +static int emulate_popa(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - struct desc_ptr dt; - u16 index = selector >> 3; - u32 err; - ulong addr; - int ret; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int reg = VCPU_REGS_RDI; - get_descriptor_table_ptr(ctxt, ops, selector, &dt); + while (reg >= VCPU_REGS_RAX) { + if (reg == VCPU_REGS_RSP) { + register_address_increment(c, &c->regs[VCPU_REGS_RSP], + c->op_bytes); + --reg; + } - if (dt.size < index * 8 + 7) { - emulate_gp(ctxt, selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; + rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + if (rc != X86EMUL_CONTINUE) + break; + --reg; } - - addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); - if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); - - return ret; + return rc; } -static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - u16 selector, int seg) +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { - struct desc_struct seg_desc; - u8 dpl, rpl, cpl; - unsigned err_vec = GP_VECTOR; - u32 err_code = 0; - bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ - int ret; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; - memset(&seg_desc, 0, sizeof seg_desc); + /* 
TODO: Add stack limit check */ - if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) - || ctxt->mode == X86EMUL_MODE_REAL) { - /* set real mode segment descriptor */ - set_desc_base(&seg_desc, selector << 4); - set_desc_limit(&seg_desc, 0xffff); - seg_desc.type = 3; - seg_desc.p = 1; - seg_desc.s = 1; - goto load; - } + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) - && null_selector) - goto exception; + if (rc != X86EMUL_CONTINUE) + return rc; - /* TR should be in GDT only */ - if (seg == VCPU_SREG_TR && (selector & (1 << 2))) - goto exception; + if (temp_eip & ~0xffff) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } - if (null_selector) /* for NULL selector skip all following checks */ - goto load; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (rc != X86EMUL_CONTINUE) + return rc; - err_code = selector & 0xfffc; - err_vec = GP_VECTOR; + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); - /* can't load system descriptor into segment selecor */ - if (seg <= VCPU_SREG_GS && !seg_desc.s) - goto exception; + if (rc != X86EMUL_CONTINUE) + return rc; - if (!seg_desc.p) { - err_vec = (seg == VCPU_SREG_SS) ? 
SS_VECTOR : NP_VECTOR; - goto exception; - } + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); - rpl = selector & 3; - dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + return rc; - switch (seg) { - case VCPU_SREG_SS: - /* - * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL - */ - if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) - goto exception; - break; - case VCPU_SREG_CS: - if (!(seg_desc.type & 8)) - goto exception; + c->eip = temp_eip; - if (seg_desc.type & 4) { - /* conforming */ - if (dpl > cpl) - goto exception; - } else { - /* nonconforming */ - if (rpl > cpl || dpl != cpl) - goto exception; - } - /* CS(RPL) <- CPL */ - selector = (selector & 0xfffc) | cpl; - break; - case VCPU_SREG_TR: - if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) - goto exception; - break; - case VCPU_SREG_LDTR: - if (seg_desc.s || seg_desc.type != 2) - goto exception; - break; - default: /* DS, ES, FS, or GS */ - /* - * segment is not a data or readable code segment or - * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) - */ - if ((seg_desc.type & 0xa) == 0x8 || - (((seg_desc.type & 0xc) != 0xc) && - (rpl > dpl && cpl > dpl))) - goto exception; - break; - } - if (seg_desc.s) { - /* mark segment as accessed */ - seg_desc.type |= 1; - ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); - if (ret != X86EMUL_CONTINUE) - return ret; + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes == 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; } -load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); - return X86EMUL_CONTINUE; -exception: - emulate_exception(ctxt, err_vec, err_code, true); - return X86EMUL_PROPAGATE_FAULT; + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* 
Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) { - int rc; - struct decode_cache *c = &ctxt->decode; - u32 err; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - &err, - ctxt->vcpu); - if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: default: - break; + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; } - return X86EMUL_CONTINUE; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]); + return emulate_pop(ctxt, 
ops, &c->dst.val, c->dst.bytes); } -static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]), - dest, len); - if (rc != X86EMUL_CONTINUE) - return rc; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); - return rc; -} - -static int emulate_popf(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) -{ - int rc; - unsigned long val, change_mask; - int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); - - rc = emulate_pop(ctxt, ops, &val, len); - if (rc != X86EMUL_CONTINUE) - return rc; - - change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; - - switch(ctxt->mode) { - case X86EMUL_MODE_PROT64: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT16: - if (cpl == 0) - change_mask |= EFLG_IOPL; - if (cpl <= iopl) - change_mask |= EFLG_IF; - break; - case X86EMUL_MODE_VM86: - if (iopl < 3) { - emulate_gp(ctxt, 0); - return X86EMUL_PROPAGATE_FAULT; - } - change_mask |= EFLG_IF; - break; - default: /* real mode */ - change_mask |= (EFLG_IOPL | EFLG_IF); - break; - } - - *(unsigned long *)dest = - (ctxt->eflags & ~change_mask) | (val & change_mask); - - return rc; -} - -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) -{ - struct decode_cache *c = &ctxt->decode; - - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - - emulate_push(ctxt, ops); -} - -static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) -{ - struct decode_cache *c = &ctxt->decode; - unsigned long selector; - int rc; - - rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = 
load_segment_descriptor(ctxt, ops, (u16)selector, seg); - return rc; -} - -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - unsigned long old_esp = c->regs[VCPU_REGS_RSP]; - int rc = X86EMUL_CONTINUE; - int reg = VCPU_REGS_RAX; - - while (reg <= VCPU_REGS_RDI) { - (reg == VCPU_REGS_RSP) ? - (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - ++reg; - } - - /* Disable writeback. */ - c->dst.type = OP_NONE; - - return rc; -} - -static int emulate_popa(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int reg = VCPU_REGS_RDI; - - while (reg >= VCPU_REGS_RAX) { - if (reg == VCPU_REGS_RSP) { - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - c->op_bytes); - --reg; - } - - rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); - if (rc != X86EMUL_CONTINUE) - break; - --reg; - } - return rc; -} - -static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - unsigned long temp_eip = 0; - unsigned long temp_eflags = 0; - unsigned long cs = 0; - unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | - EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | - EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ - unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; - - /* TODO: Add stack limit check */ - - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - if (temp_eip & ~0xffff) { - emulate_gp(ctxt, 0); - return X86EMUL_PROPAGATE_FAULT; - } - - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = emulate_pop(ctxt, ops, &temp_eflags, 
c->op_bytes); - - if (rc != X86EMUL_CONTINUE) - return rc; - - rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); - - if (rc != X86EMUL_CONTINUE) - return rc; - - c->eip = temp_eip; - - - if (c->op_bytes == 4) - ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); - else if (c->op_bytes == 2) { - ctxt->eflags &= ~0xffff; - ctxt->eflags |= temp_eflags; - } - - ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ - ctxt->eflags |= EFLG_RESERVED_ONE_MASK; - - return rc; -} - -static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops* ops) -{ - switch(ctxt->mode) { - case X86EMUL_MODE_REAL: - return emulate_iret_real(ctxt, ops); - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - case X86EMUL_MODE_PROT32: - case X86EMUL_MODE_PROT64: - default: - /* iret from protected mode unimplemented yet */ - return X86EMUL_UNHANDLEABLE; - } -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; switch (c->modrm_reg) { @@ -2624,6 +2252,378 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt) +{ + struct x86_emulate_ops *ops = ctxt->ops; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; + + /* we cannot decode insn before we complete previous rep insn */ + WARN_ON(ctxt->restart); + + c->eip = ctxt->eip; + c->fetch.start = c->fetch.end = c->eip; + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + + 
switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + opcode = opcode_table[c->b]; + if (opcode.flags == 0) { + /* Two-byte opcode? 
*/ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; + } + } + c->d = opcode.flags; + + if (c->d & Group) { + dual = c->d & GroupDual; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + + c->d &= ~(Group | GroupDual); + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; + else + opcode = g_mod012[goffset]; + c->d |= opcode.flags; + } + + c->execute = opcode.u.execute; + + /* Unrecognised? */ + if (c->d == 0 || (c->d & Undefined)) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, ops, c); + + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + + if (c->rip_relative) + c->modrm_ea += c->eip; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. 
+ */ + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + c->src.val = c->modrm_val; + c->src.ptr = c->modrm_ptr; + break; + } + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.val = 0; + break; + case SrcImm: + case SrcImmU: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } + break; + case SrcImmByte: + case SrcImmUByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); + break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->src.bytes) { + case 1: + c->src.val = *(u8 *)c->src.ptr; + break; + case 2: + c->src.val = *(u16 *)c->src.ptr; + break; + case 4: + c->src.val = *(u32 *)c->src.ptr; + break; + case 8: + c->src.val = *(u64 *)c->src.ptr; + break; + } + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; + c->src.ptr = (unsigned long *) + register_address(c, seg_override_base(ctxt, ops, c), + c->regs[VCPU_REGS_RSI]); + c->src.val = 0; + break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.bytes = c->op_bytes + 2; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstMem: + case DstMem64: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.type = OP_REG; + c->dst.val = c->dst.orig_val = c->modrm_val; + c->dst.ptr = c->modrm_ptr; + break; + } + c->dst.type = OP_MEM; + c->dst.ptr = (unsigned long *)c->modrm_ea; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->dst.bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + case 8: + c->dst.val = *(u64 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *) + register_address(c, es_base(ctxt, ops), + c->regs[VCPU_REGS_RDI]); + c->dst.val = 0; + break; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { -- cgit v1.2.3 From 73fba5f4fe3e08bd7acb18a65b53643445c8f028 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:53 +0300 Subject: KVM: x86 emulator: move decode tables downwards So they can reference execution functions. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 472 ++++++++++++++++++++++++------------------------- 1 file changed, 236 insertions(+), 236 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c6f435917538..70a7cb49ff88 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -117,242 +117,6 @@ struct group_dual { struct opcode mod3[8]; }; -#define D(_y) { .flags = (_y) } -#define N D(0) -#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } -#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } - -static struct opcode group1[] = { - X7(D(Lock)), N -}; - -static struct opcode group1A[] = { - D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, -}; - -static struct opcode group3[] = { - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(Undefined)), -}; - -static struct opcode 
group4[] = { - D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), - N, N, N, N, N, N, -}; - -static struct opcode group5[] = { - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), N, - D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), - D(SrcMem | ModRM | Stack), N, -}; - -static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), -}, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, -} }; - -static struct opcode group8[] = { - N, N, N, N, - D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), - D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), -}; - -static struct group_dual group9 = { { - N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, -}, { - N, N, N, N, N, N, N, N, -} }; - -static struct opcode opcode_table[256] = { - /* 0x00 - 0x07 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x08 - 0x0F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), N, - /* 0x10 - 0x17 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x18 - 0x1F */ - D(ByteOp | DstMem | SrcReg | 
ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - /* 0x20 - 0x27 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, - /* 0x28 - 0x2F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, - /* 0x30 - 0x37 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, - /* 0x38 - 0x3F */ - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), - N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg)), - /* 0x50 - 0x57 */ - X8(D(SrcReg | Stack)), - /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), - /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), - N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , - N, N, N, N, - /* 0x68 - 0x6F */ - D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, - D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ - D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ - /* 0x70 - 0x7F */ - X16(D(SrcImmByte)), - /* 0x80 - 0x87 */ - G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), - G(DstMem | SrcImm | ModRM | Group, group1), - G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), - G(DstMem | SrcImmByte | ModRM | Group, group1), - D(ByteOp | DstMem | SrcReg | 
ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - /* 0x88 - 0x8F */ - D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), - D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), - D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), - /* 0x90 - 0x97 */ - D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), - /* 0x98 - 0x9F */ - N, N, D(SrcImmFAddr | No64), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, - /* 0xA0 - 0xA7 */ - D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), - D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), - D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), - D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), - /* 0xA8 - 0xAF */ - D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), - D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), - D(ByteOp | DstDI | String), D(DstDI | String), - /* 0xB0 - 0xB7 */ - X8(D(ByteOp | DstReg | SrcImm | Mov)), - /* 0xB8 - 0xBF */ - X8(D(DstReg | SrcImm | Mov)), - /* 0xC0 - 0xC7 */ - D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), - N, D(ImplicitOps | Stack), N, N, - D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), - /* 0xC8 - 0xCF */ - N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), - /* 0xD0 - 0xD7 */ - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), - N, N, N, N, - /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, - /* 0xE0 - 0xE7 */ - N, N, N, N, - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - D(ByteOp | 
SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - /* 0xE8 - 0xEF */ - D(SrcImm | Stack), D(SrcImm | ImplicitOps), - D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - /* 0xF0 - 0xF7 */ - N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), - /* 0xF8 - 0xFF */ - D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), - D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), -}; - -static struct opcode twobyte_table[256] = { - /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, - N, D(ImplicitOps | ModRM), N, N, - /* 0x10 - 0x1F */ - N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, - /* 0x20 - 0x2F */ - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), - N, N, N, N, - N, N, N, N, N, N, N, N, - /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, - N, N, N, N, N, N, N, N, - /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), - /* 0x50 - 0x5F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x60 - 0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0x80 - 0x8F */ - X16(D(SrcImm)), - /* 0x90 - 0x9F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xA0 - 0xA7 */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, - /* 0xA8 - 0xAF */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), N, - /* 0xB0 - 0xB7 */ - D(ByteOp | DstMem | SrcReg | 
ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xB8 - 0xBF */ - N, N, - G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ - N, N, N, D(DstMem | SrcReg | ModRM | Mov), - N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, - /* 0xD0 - 0xDF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xE0 - 0xEF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, - /* 0xF0 - 0xFF */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N -}; - -#undef D -#undef N -#undef G -#undef GD -#undef I - /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -2252,6 +2016,242 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } +#define D(_y) { .flags = (_y) } +#define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } + +static struct opcode group1[] = { + X7(D(Lock)), N +}; + +static struct opcode group1A[] = { + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group3[] = { + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(Undefined)), +}; + +static struct opcode group4[] = { + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, +}; + +static struct opcode group5[] = { + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), 
N, +}; + +static struct group_dual group7 = { { + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group8[] = { + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct group_dual group9 = { { + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode opcode_table[256] = { + /* 0x00 - 0x07 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x08 - 0x0F */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), N, + /* 0x10 - 0x17 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x18 - 0x1F */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x20 - 0x27 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), 
D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + /* 0x28 - 0x2F */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + /* 0x30 - 0x37 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + /* 0x38 - 0x3F */ + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg)), + /* 0x50 - 0x57 */ + X8(D(SrcReg | Stack)), + /* 0x58 - 0x5F */ + X8(D(DstReg | Stack)), + /* 0x60 - 0x67 */ + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, + /* 0x68 - 0x6F */ + D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, + D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ + D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + /* 0x70 - 0x7F */ + X16(D(SrcImmByte)), + /* 0x80 - 0x87 */ + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), + D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + /* 0x88 - 0x8F */ + D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), + D(ImplicitOps | 
SrcMem16 | ModRM), G(0, group1A), + /* 0x90 - 0x97 */ + D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), + /* 0x98 - 0x9F */ + N, N, D(SrcImmFAddr | No64), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + /* 0xA0 - 0xA7 */ + D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), + D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), + D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), + D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), + /* 0xA8 - 0xAF */ + D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), + D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), + D(ByteOp | DstDI | String), D(DstDI | String), + /* 0xB0 - 0xB7 */ + X8(D(ByteOp | DstReg | SrcImm | Mov)), + /* 0xB8 - 0xBF */ + X8(D(DstReg | SrcImm | Mov)), + /* 0xC0 - 0xC7 */ + D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), + N, D(ImplicitOps | Stack), N, N, + D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), + /* 0xC8 - 0xCF */ + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + /* 0xD0 - 0xD7 */ + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + /* 0xE8 - 0xEF */ + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + /* 0xF0 - 0xF7 */ + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, 
group3), + /* 0xF8 - 0xFF */ + D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), +}; + +static struct opcode twobyte_table[256] = { + /* 0x00 - 0x0F */ + N, GD(0, &group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, + /* 0x10 - 0x1F */ + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, + /* 0x20 - 0x2F */ + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + N, N, N, N, + N, N, N, N, N, N, N, N, + /* 0x30 - 0x3F */ + D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg | SrcMem | ModRM | Mov)), + /* 0x50 - 0x5F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x60 - 0x6F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x70 - 0x7F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x80 - 0x8F */ + X16(D(SrcImm)), + /* 0x90 - 0x9F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xA0 - 0xA7 */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, + /* 0xA8 - 0xAF */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), N, + /* 0xB0 - 0xB7 */ + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xB8 - 0xBF */ + N, N, + G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), + D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xC0 - 
0xCF */ + N, N, N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, GD(0, &group9), + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xDF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xE0 - 0xEF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xF0 - 0xFF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N +}; + +#undef D +#undef N +#undef G +#undef GD +#undef I + int x86_decode_insn(struct x86_emulate_ctxt *ctxt) { -- cgit v1.2.3 From d0e533255d3811382c97b594ff7ab19b9b036814 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:54 +0300 Subject: KVM: x86 emulator: allow repeat macro arguments to contain commas Needed for repeating instructions with execution functions. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 70a7cb49ff88..7e9bcda3937e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -94,14 +94,14 @@ #define Src2One (3<<29) #define Src2Mask (7<<29) -#define X2(x) x, x -#define X3(x) X2(x), x -#define X4(x) X2(x), X2(x) -#define X5(x) X4(x), x -#define X6(x) X4(x), X2(x) -#define X7(x) X4(x), X3(x) -#define X8(x) X4(x), X4(x) -#define X16(x) X8(x), X8(x) +#define X2(x...) x, x +#define X3(x...) X2(x), x +#define X4(x...) X2(x), X2(x) +#define X5(x...) X4(x), x +#define X6(x...) X4(x), X2(x) +#define X7(x...) X4(x), X3(x) +#define X8(x...) X4(x), X4(x) +#define X16(x...) 
X8(x), X8(x) struct opcode { u32 flags; -- cgit v1.2.3 From 63540382ccb83d2857964858c1ac7eb7d37de497 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 Jul 2010 15:11:55 +0300 Subject: KVM: x86 emulator: convert some push instructions to direct decode Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e9bcda3937e..904fc1c99b97 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2016,6 +2016,12 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); } +static int em_push(struct x86_emulate_ctxt *ctxt) +{ + emulate_push(ctxt, ctxt->ops); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2111,7 +2117,7 @@ static struct opcode opcode_table[256] = { /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ - X8(D(SrcReg | Stack)), + X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ X8(D(DstReg | Stack)), /* 0x60 - 0x67 */ @@ -2119,7 +2125,8 @@ static struct opcode opcode_table[256] = { N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ - D(SrcImm | Mov | Stack), N, D(SrcImmByte | Mov | Stack), N, + I(SrcImm | Mov | Stack, em_push), N, + I(SrcImmByte | Mov | Stack, em_push), N, D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ @@ -2786,9 +2793,6 @@ special_insn: case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt, ops); - break; case 0x58 ... 
0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); @@ -2810,10 +2814,6 @@ special_insn: goto cannot_emulate; c->dst.val = (s32) c->src.val; break; - case 0x68: /* push imm */ - case 0x6a: /* push imm8 */ - emulate_push(ctxt, ops); - break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ c->dst.bytes = min(c->dst.bytes, 4u); -- cgit v1.2.3 From e85d28f8e8cef09b8e424448ccedb7244cfbf147 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 29 Jul 2010 15:11:52 +0300 Subject: KVM: x86 emulator: don't update vcpu state if instruction is restarted No need to update vcpu state since instruction is in the middle of the emulation. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33deb75f16ee..3cbe8032394a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4057,32 +4057,27 @@ restart: return handle_emulation_failure(vcpu); } - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + r = EMULATE_DONE; - if (vcpu->arch.emulate_ctxt.exception >= 0) { + if (vcpu->arch.emulate_ctxt.exception >= 0) inject_emulated_exception(vcpu); - return EMULATE_DONE; - } - - if (vcpu->arch.pio.count) { + else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { + r = EMULATE_DO_MMIO; + } else if (vcpu->mmio_needed) { if (vcpu->mmio_is_write) vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->arch.emulate_ctxt.restart) + r = EMULATE_DO_MMIO; + } else if (vcpu->arch.emulate_ctxt.restart) goto restart; - return EMULATE_DONE; + toggle_interruptibility(vcpu, 
vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + return r; } EXPORT_SYMBOL_GPL(emulate_instruction); -- cgit v1.2.3 From 9928ff608b1b6ba10fafde85f57970a83a181331 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 18:35:24 +0300 Subject: KVM: x86 emulator: fix LMSW able to clear cr0.pe LMSW is documented not to be able to clear cr0.pe; make it so. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 904fc1c99b97..4d49514a919e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3211,7 +3211,7 @@ twobyte_insn: c->dst.val = ops->get_cr(0, ctxt->vcpu); break; case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | (c->src.val & 0x0f), ctxt->vcpu); c->dst.type = OP_NONE; break; -- cgit v1.2.3 From 4fc40f076f4fa289dd546990b597351c9cdad985 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 2 Aug 2010 12:47:51 +0300 Subject: KVM: x86 emulator: check io permissions only once for string pio Do not recheck io permission on every iteration. 
Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 6 ++++++ arch/x86/kvm/x86.c | 1 + 3 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0f901c16cf1c..8762411fe9bb 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -224,6 +224,7 @@ struct x86_emulate_ctxt { int interruptibility; bool restart; /* restart string instruction after writeback */ + bool perm_ok; /* do not check permissions if true */ int exception; /* exception that happens during emulation or -1 */ u32 error_code; /* error code for exception */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4d49514a919e..760e2b030e68 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1621,9 +1621,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { + if (ctxt->perm_ok) + return true; + if (emulator_bad_iopl(ctxt, ops)) if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) return false; + + ctxt->perm_ok = true; + return true; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3cbe8032394a..35c0f4e4a621 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3997,6 +3997,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception = -1; + vcpu->arch.emulate_ctxt.perm_ok = false; r = x86_decode_insn(&vcpu->arch.emulate_ctxt); trace_kvm_emulate_insn_start(vcpu); -- cgit v1.2.3 From 251464c464cf7df7d6d548f1065f49a3ecd08118 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 2 Aug 2010 16:12:08 +0800 Subject: KVM: MMU: using kvm_set_pfn_accessed() instead of mark_page_accessed() It's a small cleanup that uses kvm_set_pfn_accessed() instead of mark_page_accessed()
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 82f7622c17d3..e430a383ad15 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -309,7 +309,7 @@ static void update_spte(u64 *sptep, u64 new_spte) else { old_spte = __xchg_spte(sptep, new_spte); if (old_spte & shadow_accessed_mask) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); } } -- cgit v1.2.3 From 8672b7217a234c41d425a63b171af809e1169842 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 2 Aug 2010 16:14:04 +0800 Subject: KVM: MMU: move bits lost judgement into a separate function Introduce spte_has_volatile_bits() function to judge whether spte bits will miss, it's more readable and can help us to cleanup code later Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e430a383ad15..c07b9a200bc8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -299,6 +299,20 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static bool spte_has_volatile_bits(u64 spte) +{ + if (!shadow_accessed_mask) + return false; + + if (!is_shadow_present_pte(spte)) + return false; + + if (spte & shadow_accessed_mask) + return false; + + return true; +} + static void update_spte(u64 *sptep, u64 new_spte) { u64 old_spte; @@ -679,14 +693,14 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) pfn_t pfn; u64 old_spte = *sptep; - if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || - old_spte & shadow_accessed_mask) { + if (!spte_has_volatile_bits(old_spte)) __set_spte(sptep, new_spte); - } else + else old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; + pfn = 
spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); -- cgit v1.2.3 From 4132779b1718f066ec2d06a71c8958039865cd49 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 2 Aug 2010 16:15:08 +0800 Subject: KVM: MMU: mark page dirty only when page is really written Mark page dirty only when this page is really written; it's more exact, and also can fix dirty page marking in speculation path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c07b9a200bc8..ff95d418750d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -307,24 +307,42 @@ static bool spte_has_volatile_bits(u64 spte) if (!is_shadow_present_pte(spte)) return false; - if (spte & shadow_accessed_mask) + if ((spte & shadow_accessed_mask) && + (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) return false; return true; } +static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) && !(new_spte & bit_mask); +} + static void update_spte(u64 *sptep, u64 new_spte) { - u64 old_spte; + u64 mask, old_spte = *sptep; + + WARN_ON(!is_rmap_spte(new_spte)); - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || - !is_rmap_spte(*sptep)) + new_spte |= old_spte & shadow_dirty_mask; + + mask = shadow_accessed_mask; + if (is_writable_pte(old_spte)) + mask |= shadow_dirty_mask; + + if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) __set_spte(sptep, new_spte); - else { + else old_spte = __xchg_spte(sptep, new_spte); - if (old_spte & shadow_accessed_mask) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - } + + if (!shadow_accessed_mask) + return; + + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + if
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -704,7 +722,7 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(old_spte)) + if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) kvm_set_pfn_dirty(pfn); } @@ -759,13 +777,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) { - pfn_t pfn; - - spte = rmap_next(kvm, rmapp, NULL); - pfn = spte_to_pfn(*spte); - kvm_set_pfn_dirty(pfn); - } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; @@ -1938,7 +1949,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * whether the guest actually used the pte (in order to detect * demand paging). */ - spte = shadow_base_present_pte | shadow_dirty_mask; + spte = shadow_base_present_pte; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) @@ -1999,8 +2010,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - if (is_writable_pte(*sptep) && !is_writable_pte(spte)) - kvm_set_pfn_dirty(pfn); update_spte(sptep, spte); done: return ret; -- cgit v1.2.3 From 52c65a30a5c6f31cd66dba57c22d18cafa5e327f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 2 Aug 2010 16:46:44 +0200 Subject: KVM: SVM: Check for nested vmrun intercept before emulating vmrun This patch lets the nested vmrun fail if the L1 hypervisor has not intercepted vmrun. This fixes the "vmrun intercept check" unit test. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 116e0341bf4c..a0e5c7e26104 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2014,6 +2014,14 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool nested_vmcb_checks(struct vmcb *vmcb) +{ + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + return false; + + return true; +} + static bool nested_svm_vmrun(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -2028,6 +2036,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (!nested_vmcb) return false; + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, -- cgit v1.2.3 From dbe7758482a870f30a86bdeefebf4fc260afef11 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 2 Aug 2010 16:46:45 +0200 Subject: KVM: SVM: Check for asid != 0 on nested vmrun This patch lets a nested vmrun fail if the L1 hypervisor left the asid zero. This fixes the asid_zero unit test. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a0e5c7e26104..af5b9ea51965 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2019,6 +2019,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; + if (vmcb->control.asid == 0) + return false; + return true; } -- cgit v1.2.3 From 09ee57cdae3156aa3b74f378a0c57ef657c90f38 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:07:29 +0300 Subject: KVM: x86 emulator: push segment override out of decode_modrm() Let it compute modrm_seg instead, and have the caller apply it. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8762411fe9bb..cbdf76722d7d 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -198,6 +198,7 @@ struct decode_cache { u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; + u8 modrm_seg; u8 use_modrm_ea; bool rip_relative; unsigned long modrm_ea; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 760e2b030e68..471f12ae29cf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -593,6 +593,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_rm |= (c->modrm & 0x07); c->modrm_ea = 0; c->use_modrm_ea = 1; + c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { c->modrm_ptr = decode_register(c->modrm_rm, @@ -649,8 +650,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_SS); + c->modrm_seg = VCPU_SREG_SS; c->modrm_ea 
= (u16)c->modrm_ea; } else { /* 32/64-bit ModR/M decode. */ @@ -2405,9 +2405,11 @@ done_prefixes: c->op_bytes = 8; /* ModRM and SIB bytes. */ - if (c->d & ModRM) + if (c->d & ModRM) { rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) + if (!c->has_seg_override) + set_seg_override(c, c->modrm_seg); + } else if (c->d & MemAbs) rc = decode_abs(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; -- cgit v1.2.3 From 1a6440aef6d63252e6c80aff651147b5f8c737e9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:35:10 +0300 Subject: KVM: x86 emulator: use correct type for memory address in operands Currently we use a void pointer for memory addresses. That's wrong since these are guest virtual addresses which are not directly dereferencable by the host. Use the correct type, unsigned long. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 5 +- arch/x86/kvm/emulate.c | 117 ++++++++++++++++++------------------- 2 files changed, 61 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index cbdf76722d7d..0c835f7eb308 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -156,7 +156,10 @@ struct operand { unsigned long orig_val; u64 orig_val64; }; - unsigned long *ptr; + union { + unsigned long *reg; + unsigned long mem; + } addr; union { unsigned long val; u64 val64; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 471f12ae29cf..5f45f66ed277 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -489,7 +489,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - void *ptr, + ulong addr, u16 *size, unsigned long *address, int op_bytes) { int rc; @@ -497,12 +497,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = 
ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu, NULL); + rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu, NULL); + rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); return rc; } @@ -552,21 +550,21 @@ static void decode_register_operand(struct operand *op, reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; + op->addr.reg = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->addr.reg; op->bytes = 1; } else { - op->ptr = decode_register(reg, c->regs, 0); + op->addr.reg = decode_register(reg, c->regs, 0); op->bytes = c->op_bytes; switch (op->bytes) { case 2: - op->val = *(u16 *)op->ptr; + op->val = *(u16 *)op->addr.reg; break; case 4: - op->val = *(u32 *)op->ptr; + op->val = *(u32 *)op->addr.reg; break; case 8: - op->val = *(u64 *) op->ptr; + op->val = *(u64 *) op->addr.reg; break; } } @@ -976,23 +974,23 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, */ switch (c->dst.bytes) { case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; + *(u8 *)c->dst.addr.reg = (u8)c->dst.val; break; case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; + *(u16 *)c->dst.addr.reg = (u16)c->dst.val; break; case 4: - *c->dst.ptr = (u32)c->dst.val; + *c->dst.addr.reg = (u32)c->dst.val; break; /* 64b: zero-ext */ case 8: - *c->dst.ptr = c->dst.val; + *c->dst.addr.reg = c->dst.val; break; } break; case OP_MEM: if (c->lock_prefix) rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.orig_val, &c->dst.val, c->dst.bytes, @@ -1000,14 +998,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, ctxt->vcpu); else rc = ops->write_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.val, c->dst.bytes, &err, 
ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); + emulate_pf(ctxt, c->dst.addr.mem, err); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1029,8 +1026,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]); + c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -2019,7 +2016,7 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; register_address_increment(c, &c->regs[reg], df * op->bytes); - op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); + op->addr.mem = register_address(c, base, c->regs[reg]); } static int em_push(struct x86_emulate_ctxt *ctxt) @@ -2456,17 +2453,17 @@ done_prefixes: if ((c->d & ModRM) && c->modrm_mod == 3) { c->src.type = OP_REG; c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; + c->src.addr.reg = c->modrm_ptr; break; } c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.addr.mem = c->modrm_ea; c->src.val = 0; break; case SrcImm: case SrcImmU: c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; + c->src.addr.mem = c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; if (c->src.bytes == 8) c->src.bytes = 4; @@ -2499,7 +2496,7 @@ done_prefixes: case SrcImmByte: case SrcImmUByte: c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; + c->src.addr.mem = c->eip; c->src.bytes = 1; if ((c->d & SrcMask) == SrcImmByte) c->src.val = insn_fetch(s8, 1, c->eip); @@ -2509,19 +2506,19 @@ done_prefixes: case SrcAcc: c->src.type = OP_REG; c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->src.ptr = &c->regs[VCPU_REGS_RAX]; + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; switch (c->src.bytes) { case 1: - c->src.val = *(u8 *)c->src.ptr; + c->src.val = *(u8 *)c->src.addr.reg; break; case 2: - c->src.val = *(u16 *)c->src.ptr; + c->src.val = *(u16 *)c->src.addr.reg; break; case 4: - c->src.val = *(u32 *)c->src.ptr; + c->src.val = *(u32 *)c->src.addr.reg; break; case 8: - c->src.val = *(u64 *)c->src.ptr; + c->src.val = *(u64 *)c->src.addr.reg; break; } break; @@ -2532,20 +2529,20 @@ done_prefixes: case SrcSI: c->src.type = OP_MEM; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *) + c->src.addr.mem = register_address(c, seg_override_base(ctxt, ops, c), c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; case SrcImmFAddr: c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; + c->src.addr.mem = c->eip; c->src.bytes = c->op_bytes + 2; insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); break; case SrcMemFAddr: c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.addr.mem = c->modrm_ea; c->src.bytes = c->op_bytes + 2; break; } @@ -2563,7 +2560,7 @@ done_prefixes: break; case Src2ImmByte: c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; + c->src2.addr.mem = c->eip; c->src2.bytes = 1; c->src2.val = insn_fetch(u8, 1, c->eip); break; @@ -2588,11 +2585,11 @@ done_prefixes: c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; c->dst.type = OP_REG; c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; + c->dst.addr.reg = c->modrm_ptr; break; } c->dst.type = OP_MEM; - c->dst.ptr = (unsigned long *)c->modrm_ea; + c->dst.addr.mem = c->modrm_ea; if ((c->d & DstMask) == DstMem64) c->dst.bytes = 8; else @@ -2601,26 +2598,26 @@ done_prefixes: if (c->d & BitOp) { unsigned long mask = ~(c->dst.bytes * 8 - 1); - c->dst.ptr = (void *)c->dst.ptr + + c->dst.addr.mem = c->dst.addr.mem + (c->src.val & mask) / 8; } break; case DstAcc: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; switch (c->dst.bytes) { case 1: - c->dst.val = *(u8 *)c->dst.ptr; + c->dst.val = *(u8 *)c->dst.addr.reg; break; case 2: - c->dst.val = *(u16 *)c->dst.ptr; + c->dst.val = *(u16 *)c->dst.addr.reg; break; case 4: - c->dst.val = *(u32 *)c->dst.ptr; + c->dst.val = *(u32 *)c->dst.addr.reg; break; case 8: - c->dst.val = *(u64 *)c->dst.ptr; + c->dst.val = *(u64 *)c->dst.addr.reg; break; } c->dst.orig_val = c->dst.val; @@ -2628,7 +2625,7 @@ done_prefixes: case DstDI: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->dst.ptr = (unsigned long *) + c->dst.addr.mem = register_address(c, es_base(ctxt, ops), c->regs[VCPU_REGS_RDI]); c->dst.val = 0; @@ -2696,7 +2693,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->src.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2704,7 +2701,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + rc = read_emulated(ctxt, ops, c->src2.addr.mem, &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2716,7 +2713,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + rc = read_emulated(ctxt, ops, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2880,16 +2877,16 @@ special_insn: /* Write back the register source. */ switch (c->dst.bytes) { case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; + *(u8 *) c->src.addr.reg = (u8) c->dst.val; break; case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; + *(u16 *) c->src.addr.reg = (u16) c->dst.val; break; case 4: - *c->src.ptr = (u32) c->dst.val; + *c->src.addr.reg = (u32) c->dst.val; break; /* 64b reg: zero-extend */ case 8: - *c->src.ptr = c->dst.val; + *c->src.addr.reg = c->dst.val; break; } /* @@ -2936,15 +2933,15 @@ special_insn: goto done; break; case 0x90: /* nop / xchg r8,rax */ - if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) { c->dst.type = OP_NONE; /* nop */ break; } case 0x91 ... 
0x97: /* xchg reg,rax */ c->src.type = OP_REG; c->src.bytes = c->op_bytes; - c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.ptr); + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.addr.reg); goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; @@ -2952,7 +2949,7 @@ special_insn: break; case 0x9d: /* popf */ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2963,7 +2960,7 @@ special_insn: goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; @@ -2982,7 +2979,7 @@ special_insn: break; case 0xc3: /* ret */ c->dst.type = OP_REG; - c->dst.ptr = &c->eip; + c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ @@ -3184,7 +3181,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3204,7 +3201,7 @@ twobyte_insn: goto cannot_emulate; } } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3399,7 +3396,7 @@ twobyte_insn: } else { /* Failure: write the value we saw to EAX. 
*/ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; case 0xb3: -- cgit v1.2.3 From 4515453964e78ce556a98c56aeb675ed8d48b8de Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:39:53 +0300 Subject: KVM: x86 emulator: simplify xchg decode tables Use X8() to avoid repetition. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5f45f66ed277..c7176df9ced5 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2147,7 +2147,7 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), D(DstReg), + X8(D(DstReg)), /* 0x98 - 0x9F */ N, N, D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, -- cgit v1.2.3 From 3d9e77dff81c8be21ec0e7950ae06d1bddff8066 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:41:59 +0300 Subject: KVM: x86 emulator: use SrcAcc to simplify xchg decoding Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c7176df9ced5..b7da0e3e0cc0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2147,7 +2147,7 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - X8(D(DstReg)), + X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ N, N, D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, @@ -2932,16 +2932,9 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; 
break; - case 0x90: /* nop / xchg r8,rax */ - if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) { - c->dst.type = OP_NONE; /* nop */ - break; - } - case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = OP_REG; - c->src.bytes = c->op_bytes; - c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.addr.reg); + case 0x90 ... 0x97: /* nop / xchg reg, rax */ + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) + goto done; goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; -- cgit v1.2.3 From 91ff3cb43cb3dd8810d726dfa1f3736dc9aea1df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 12:53:09 +0300 Subject: KVM: x86 emulator: put register operand fetch into a function The code is repeated three times, put it into fetch_register_operand() Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 61 +++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b7da0e3e0cc0..898a55ba3e14 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -539,6 +539,24 @@ static int test_cc(unsigned int condition, unsigned int flags) return (!!rc ^ (condition & 1)); } +static void fetch_register_operand(struct operand *op) +{ + switch (op->bytes) { + case 1: + op->val = *(u8 *)op->addr.reg; + break; + case 2: + op->val = *(u16 *)op->addr.reg; + break; + case 4: + op->val = *(u32 *)op->addr.reg; + break; + case 8: + op->val = *(u64 *)op->addr.reg; + break; + } +} + static void decode_register_operand(struct operand *op, struct decode_cache *c, int inhibit_bytereg) @@ -551,23 +569,12 @@ static void decode_register_operand(struct operand *op, op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { op->addr.reg = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->addr.reg; op->bytes = 1; } else { op->addr.reg = decode_register(reg, c->regs, 0); op->bytes = c->op_bytes; - switch 
(op->bytes) { - case 2: - op->val = *(u16 *)op->addr.reg; - break; - case 4: - op->val = *(u32 *)op->addr.reg; - break; - case 8: - op->val = *(u64 *) op->addr.reg; - break; - } } + fetch_register_operand(op); op->orig_val = op->val; } @@ -2507,20 +2514,7 @@ done_prefixes: c->src.type = OP_REG; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; - switch (c->src.bytes) { - case 1: - c->src.val = *(u8 *)c->src.addr.reg; - break; - case 2: - c->src.val = *(u16 *)c->src.addr.reg; - break; - case 4: - c->src.val = *(u32 *)c->src.addr.reg; - break; - case 8: - c->src.val = *(u64 *)c->src.addr.reg; - break; - } + fetch_register_operand(&c->src); break; case SrcOne: c->src.bytes = 1; @@ -2606,20 +2600,7 @@ done_prefixes: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; - switch (c->dst.bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.addr.reg; - break; - case 2: - c->dst.val = *(u16 *)c->dst.addr.reg; - break; - case 4: - c->dst.val = *(u32 *)c->dst.addr.reg; - break; - case 8: - c->dst.val = *(u64 *)c->dst.addr.reg; - break; - } + fetch_register_operand(&c->dst); c->dst.orig_val = c->dst.val; break; case DstDI: -- cgit v1.2.3 From d4709c78eeff2b272e0b9727748b72371b0e71ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 13:53:19 +0300 Subject: KVM: x86 emulator: drop use_modrm_ea Unused (and has never been). 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/kvm/emulate.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0c835f7eb308..e425444658e8 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -202,7 +202,6 @@ struct decode_cache { u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; - u8 use_modrm_ea; bool rip_relative; unsigned long modrm_ea; void *modrm_ptr; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 898a55ba3e14..7d2c715f1a2a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -597,7 +597,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); c->modrm_ea = 0; - c->use_modrm_ea = 1; c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { -- cgit v1.2.3 From 1e87e3efe764285133866a14ddc71cf211f022c2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:42:51 +0300 Subject: KVM: x86 emulator: simplify REX.W check (x && (x & y)) == (x & y) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7d2c715f1a2a..a832019138f3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2358,9 +2358,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) done_prefixes: /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ /* Opcode byte(s). 
*/ opcode = opcode_table[c->b]; -- cgit v1.2.3 From 7f9b4b75be866de938a3094413a60554f7e66e4d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:46:54 +0300 Subject: KVM: x86 emulator: introduce Op3264 for mov cr and mov dr instructions The operands for these instructions are 32 bits or 64 bits, depending on long mode, and ignoring REX prefixes, or the operand size prefix. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a832019138f3..b7adfcc2f74f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -83,6 +83,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ @@ -2406,6 +2407,13 @@ done_prefixes: if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) c->op_bytes = 8; + if (c->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + c->op_bytes = 8; + else + c->op_bytes = 4; + } + /* ModRM and SIB bytes. 
*/ if (c->d & ModRM) { rc = decode_modrm(ctxt, ops); -- cgit v1.2.3 From cecc9e39161898eb767a6b797e27a1660b3eb27e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:48:44 +0300 Subject: KVM: x86 emulator: mark mov cr and mov dr as 64-bit instructions in long mode Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b7adfcc2f74f..20752dc84f10 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = { /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), - D(ModRM | ImplicitOps | Priv), D(ModRM | Priv), + D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ -- cgit v1.2.3 From 1a0c7d44e4553ffb4902ec15549a9b855cd05a59 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:25:22 +0300 Subject: KVM: x86 emulator: use struct operand for mov reg,cr and mov cr,reg for reg op This is an ordinary modrm source or destination; use the standard structure representing it. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 20752dc84f10..562e0343e2a3 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = { /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), - D(ModRM | ImplicitOps | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | DstMem | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -3240,8 +3240,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); break; case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && @@ -3253,7 +3252,7 @@ twobyte_insn: c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { emulate_gp(ctxt, 0); goto done; } -- cgit v1.2.3 From b27f38563d956135a5e80aca749b399ac5f3158a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 14:25:22 +0300 Subject: KVM: x86 emulator: use struct operand for mov reg,dr and mov dr,reg for reg op This is an ordinary modrm source or destination; use the standard structure representing it. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 562e0343e2a3..628fb5de6a42 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2210,8 +2210,8 @@ static struct opcode twobyte_table[256] = { /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | Priv | Op3264), + D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ @@ -3248,8 +3248,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); break; case 0x22: /* mov reg, cr */ if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { @@ -3265,7 +3264,7 @@ twobyte_insn: goto done; } - if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + if (ops->set_dr(c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ -- cgit v1.2.3 From 5a506b125f1c97c846654ebacc913a136284e42b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:10:29 +0300 Subject: KVM: x86 emulator: add NoAccess flag for memory instructions that skip access Use for INVLPG, which accesses the tlb, not memory. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 628fb5de6a42..80efe76c1ab8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -83,6 +83,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ @@ -2067,7 +2068,8 @@ static struct opcode group5[] = { static struct group_dual group7 = { { N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv), + D(SrcMem16 | ModRM | Mov | Priv), + D(SrcMem | ModRM | ByteOp | Priv | NoAccess), }, { D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), D(SrcNone | ModRM | DstMem | Mov), N, @@ -2456,7 +2458,7 @@ done_prefixes: c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + if (c->d & NoAccess) break; srcmem_common: /* -- cgit v1.2.3 From 342fc63095e2d676f209b202d41a3f670dd9bf08 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:13:22 +0300 Subject: KVM: x86 emulator: switch LEA to use SrcMem decoding The NoAccess flag will prevent memory from being accessed. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 80efe76c1ab8..b8aa667b52bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2153,7 +2153,7 @@ static struct opcode opcode_table[256] = { /* 0x88 - 0x8F */ D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), - D(DstMem | SrcNone | ModRM | Mov), D(ModRM | DstReg), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ X8(D(SrcAcc | DstReg)), @@ -2895,7 +2895,7 @@ special_insn: c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_ea; + c->dst.val = c->src.addr.mem; break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; -- cgit v1.2.3 From 1f6f05800e2fdd815ac63e3264071d26d429f491 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:19:22 +0300 Subject: KVM: x86 emulator: change invlpg emulation to use src.mem.addr Instead of using modrm_ea, which will soon be gone. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b8aa667b52bd..eda69411d050 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3206,7 +3206,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, c->modrm_ea); + emulate_invlpg(ctxt->vcpu, c->src.addr.mem); /* Disable writeback. 
*/ c->dst.type = OP_NONE; break; -- cgit v1.2.3 From 2dbd0dd711e6c0ca6a2be9e6d93bbeb339386638 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 1 Aug 2010 15:40:19 +0300 Subject: KVM: x86 emulator: Decode memory operands directly into a 'struct operand' Since modrm operand can be either register or memory, decoding it into a 'struct operand', which can represent both, is simpler. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 3 - arch/x86/kvm/emulate.c | 125 +++++++++++++++++-------------------- 2 files changed, 57 insertions(+), 71 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e425444658e8..1e4a72ce301a 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -203,9 +203,6 @@ struct decode_cache { u8 modrm_rm; u8 modrm_seg; bool rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index eda69411d050..955d48074648 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -581,12 +581,14 @@ static void decode_register_operand(struct operand *op, } static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; int rc = X86EMUL_CONTINUE; + ulong modrm_ea = 0; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -598,16 +600,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_mod |= (c->modrm & 0xc0) >> 6; c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { - c->modrm_ptr = decode_register(c->modrm_rm, + op->type = OP_REG; + op->bytes = (c->d & ByteOp) 
? 1 : c->op_bytes; + op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - c->modrm_val = *(unsigned long *)c->modrm_ptr; + fetch_register_operand(op); return rc; } + op->type = OP_MEM; + if (c->ad_bytes == 2) { unsigned bx = c->regs[VCPU_REGS_RBX]; unsigned bp = c->regs[VCPU_REGS_RBP]; @@ -618,46 +623,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; } switch (c->modrm_rm) { case 0: - c->modrm_ea += bx + si; + modrm_ea += bx + si; break; case 1: - c->modrm_ea += bx + di; + modrm_ea += bx + di; break; case 2: - c->modrm_ea += bp + si; + modrm_ea += bp + si; break; case 3: - c->modrm_ea += bp + di; + modrm_ea += bp + di; break; case 4: - c->modrm_ea += si; + modrm_ea += si; break; case 5: - c->modrm_ea += di; + modrm_ea += di; break; case 6: if (c->modrm_mod != 0) - c->modrm_ea += bp; + modrm_ea += bp; break; case 7: - c->modrm_ea += bx; + modrm_ea += bx; break; } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) c->modrm_seg = VCPU_SREG_SS; - c->modrm_ea = (u16)c->modrm_ea; + modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. 
*/ if ((c->modrm_rm & 7) == 4) { @@ -667,48 +672,51 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, scale = sib >> 6; if ((base_reg & 7) == 5 && c->modrm_mod == 0) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); else - c->modrm_ea += c->regs[base_reg]; + modrm_ea += c->regs[base_reg]; if (index_reg != 4) - c->modrm_ea += c->regs[index_reg] << scale; + modrm_ea += c->regs[index_reg] << scale; } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) c->rip_relative = 1; } else - c->modrm_ea += c->regs[c->modrm_rm]; + modrm_ea += c->regs[c->modrm_rm]; switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; } } + op->addr.mem = modrm_ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; + op->type = OP_MEM; switch (c->ad_bytes) { case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); + op->addr.mem = insn_fetch(u16, 2, c->eip); break; case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); + op->addr.mem = insn_fetch(u32, 4, c->eip); break; case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); + op->addr.mem = insn_fetch(u64, 8, c->eip); break; } done: @@ -2280,6 +2288,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, dual, goffset; struct opcode opcode, *g_mod012, *g_mod3; + struct operand memop = { .type = OP_NONE }; /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); @@ -2418,25 +2427,25 @@ done_prefixes: /* ModRM and SIB bytes. 
*/ if (c->d & ModRM) { - rc = decode_modrm(ctxt, ops); + rc = decode_modrm(ctxt, ops, &memop); if (!c->has_seg_override) set_seg_override(c, c->modrm_seg); } else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); + rc = decode_abs(ctxt, ops, &memop); if (rc != X86EMUL_CONTINUE) goto done; if (!c->has_seg_override) set_seg_override(c, VCPU_SREG_DS); - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, ops, c); + if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) + memop.addr.mem += seg_override_base(ctxt, ops, c); - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; + if (memop.type == OP_MEM && c->ad_bytes != 8) + memop.addr.mem = (u32)memop.addr.mem; - if (c->rip_relative) - c->modrm_ea += c->eip; + if (memop.type == OP_MEM && c->rip_relative) + memop.addr.mem += c->eip; /* * Decode and fetch the source operand: register, memory @@ -2449,31 +2458,16 @@ done_prefixes: decode_register_operand(&c->src, c, 0); break; case SrcMem16: - c->src.bytes = 2; + memop.bytes = 2; goto srcmem_common; case SrcMem32: - c->src.bytes = 4; + memop.bytes = 4; goto srcmem_common; case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : + memop.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->d & NoAccess) - break; srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. 
- */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.addr.reg = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - c->src.addr.mem = c->modrm_ea; - c->src.val = 0; + c->src = memop; break; case SrcImm: case SrcImmU: @@ -2543,9 +2537,8 @@ done_prefixes: insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); break; case SrcMemFAddr: - c->src.type = OP_MEM; - c->src.addr.mem = c->modrm_ea; - c->src.bytes = c->op_bytes + 2; + memop.bytes = c->op_bytes + 2; + goto srcmem_common; break; } @@ -2583,26 +2576,18 @@ done_prefixes: break; case DstMem: case DstMem64: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.addr.reg = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - c->dst.addr.mem = c->modrm_ea; + c->dst = memop; if ((c->d & DstMask) == DstMem64) c->dst.bytes = 8; else c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { + if (c->dst.type == OP_MEM && (c->d & BitOp)) { unsigned long mask = ~(c->dst.bytes * 8 - 1); c->dst.addr.mem = c->dst.addr.mem + (c->src.val & mask) / 8; } + c->dst.orig_val = c->dst.val; break; case DstAcc: c->dst.type = OP_REG; @@ -2682,11 +2667,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->src.type == OP_MEM) { + if (c->d & NoAccess) + goto no_fetch; rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; + no_fetch: + ; } if (c->src2.type == OP_MEM) { -- cgit v1.2.3 From 34698d8c61bd3fc86b2e99c3d1ad9ef140b3eb0d Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 4 Aug 2010 14:41:04 +0300 Subject: KVM: x86 emulator: Fix nop emulation If a nop instruction is encountered, we jump directly to the done label. This skip updating rip. 
Break from the switch case instead Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 955d48074648..ddbad15c9486 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2912,7 +2912,7 @@ special_insn: break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) - goto done; + break; goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; -- cgit v1.2.3 From ba492962363a02c45836be205f339be48093e1be Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 29 Jul 2010 14:47:56 +0200 Subject: KVM: Move kvm_guest_init out of generic code Currently x86 is the only architecture that uses kvm_guest_init(). With PowerPC we're getting a second user, but the signature is different there and we don't need to export it, as it uses the normal kernel init framework. So let's move the x86 specific definition of that function over to the x86 specific header file.
Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 6 ++++++ include/linux/kvm_para.h | 5 ----- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 05eba5e9a8e8..7b562b6184bc 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void) return cpuid_eax(KVM_CPUID_FEATURES); } +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) #endif +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index ac2015a25012..47a070b0520e 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h @@ -26,11 +26,6 @@ #include #ifdef __KERNEL__ -#ifdef CONFIG_KVM_GUEST -void __init kvm_guest_init(void); -#else -#define kvm_guest_init() do { } while (0) -#endif static inline int kvm_para_has_feature(unsigned int feature) { -- cgit v1.2.3 From d3ad6243293d92c82530a50c77d71bb0a0a42fdc Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 5 Aug 2010 16:34:39 +0800 Subject: KVM: x86 emulator: simplify two-byte opcode check Two-byte opcode always start with 0x0F and the decode flags of opcode 0xF0 is always 0, so remove dup check. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ddbad15c9486..a9a4a0b78a7d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2375,13 +2375,11 @@ done_prefixes: /* Opcode byte(s). */ opcode = opcode_table[c->b]; - if (opcode.flags == 0) { - /* Two-byte opcode? 
*/ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - opcode = twobyte_table[c->b]; - } + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; } c->d = opcode.flags; -- cgit v1.2.3 From 160ce1f1a8fe64b3e2686ae73fbf051ccfe7c7ef Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 4 Aug 2010 05:44:24 +0300 Subject: KVM: x86 emulator: Allow accessing IDT via emulator ops The patch adds a new member get_idt() to x86_emulate_ops. It also adds a function to get the idt in order to be used by the emulator. This is needed for real mode interrupt injection and the emulation of int instructions. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1e4a72ce301a..1bbf2b6f2a7e 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,7 @@ struct x86_emulate_ops { void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); + void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35c0f4e4a621..768197a34d3e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3790,6 +3790,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_idt(vcpu, dt); +} + static unsigned long emulator_get_cached_segment_base(int 
seg, struct kvm_vcpu *vcpu) { @@ -3883,6 +3888,7 @@ static struct x86_emulate_ops emulate_ops = { .set_segment_selector = emulator_set_segment_selector, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, + .get_idt = emulator_get_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, -- cgit v1.2.3 From 6e154e56b4d7a6a28c54f0984e13d3f8defc4755 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 4 Aug 2010 14:38:06 +0300 Subject: KVM: x86 emulator: Add into, int, and int3 instructions (opcodes 0xcc-0xce) This adds support for int instructions to the emulator. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a9a4a0b78a7d..5205d6890828 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1180,6 +1180,67 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + struct desc_ptr dt; + gva_t cs_addr; + gva_t eip_addr; + u16 cs, eip; + u32 err; + + /* TODO: Add limit checks */ + c->src.val = ctxt->eflags; + emulate_push(ctxt, ops); + + ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); + + c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + emulate_push(ctxt, ops); + + c->src.val = c->eip; + emulate_push(ctxt, ops); + + ops->get_idt(&dt, ctxt->vcpu); + + eip_addr = dt.address + (irq << 2); + cs_addr = dt.address + (irq << 2) + 2; + + rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, cs, 
VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = eip; + + return rc; +} + +static int emulate_int(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_int_real(ctxt, ops, irq); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* Protected mode interrupts unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -2616,6 +2677,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; + int irq; /* Used for int 3, int, and into */ ctxt->decode.mem_read.pos = 0; @@ -2960,6 +3022,22 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; + case 0xcc: /* int3 */ + irq = 3; + goto do_interrupt; + case 0xcd: /* int n */ + irq = c->src.val; + do_interrupt: + rc = emulate_int(ctxt, ops, irq); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xce: /* into */ + if (ctxt->eflags & EFLG_OF) { + irq = 4; + goto do_interrupt; + } + break; case 0xcf: /* iret */ rc = emulate_iret(ctxt, ops); -- cgit v1.2.3 From 06cb704611caf40e531a3835809283f14f5307d5 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Aug 2010 15:36:53 +0800 Subject: KVM: x86 emulator: use SrcAcc to simplify stos decoding Use SrcAcc to simplify stos decoding. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5205d6890828..6c1e4d6c12cd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2235,7 +2235,8 @@ static struct opcode opcode_table[256] = { D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ - D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | DstDI | Mov | String), D(DstDI | Mov | String), + D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), + D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String), D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), D(ByteOp | DstDI | String), D(DstDI | String), /* 0xB0 - 0xB7 */ @@ -2996,8 +2997,6 @@ special_insn: case 0xa8 ... 0xa9: /* test ax, imm */ goto test; case 0xaa ... 0xab: /* stos */ - c->dst.val = c->regs[VCPU_REGS_RAX]; - break; case 0xac ... 0xad: /* lods */ goto mov; case 0xae ... 0xaf: /* scas */ -- cgit v1.2.3 From 36089fed70337f4d96a5c3aa7fadc4095b707f73 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Aug 2010 15:38:18 +0800 Subject: KVM: x86 emulator: disable writeback when decode dest operand This patch change to disable writeback when decode dest operand if the dest type is ImplicitOps or not specified. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6c1e4d6c12cd..e0216eb8b574 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2627,9 +2627,6 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. 
*/ switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; case DstReg: decode_register_operand(&c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); @@ -2664,6 +2661,11 @@ done_prefixes: c->regs[VCPU_REGS_RDI]); c->dst.val = 0; break; + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + default: + c->dst.type = OP_NONE; /* Disable writeback. */ + return 0; } done: @@ -3115,7 +3117,6 @@ special_insn: case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ if (!emulate_grp3(ctxt, ops)) @@ -3123,16 +3124,13 @@ special_insn: break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); goto done; - } else { + } else ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - } break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) { @@ -3141,16 +3139,13 @@ special_insn: } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. 
*/ break; case 0xfe: /* Grp4 */ grp45: @@ -3287,16 +3282,13 @@ twobyte_insn: break; case 0x06: emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x09: /* wbinvd */ kvm_emulate_wbinvd(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ switch (c->modrm_reg) { @@ -3349,7 +3341,6 @@ twobyte_insn: goto done; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ @@ -3361,7 +3352,6 @@ twobyte_insn: c->regs[VCPU_REGS_RDX] = msr_data >> 32; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ rc = emulate_sysenter(ctxt, ops); @@ -3385,7 +3375,6 @@ twobyte_insn: case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); -- cgit v1.2.3 From c034da8b927dc682fe7944895d67f99f07e3740f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 4 Aug 2010 15:38:59 +0800 Subject: KVM: x86 emulator: using SrcOne for instruction d0/d1 decoding Using SrcOne for instruction d0/d1 decoding. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e0216eb8b574..d711d6aa8165 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2251,7 +2251,7 @@ static struct opcode opcode_table[256] = { N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM), D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), N, N, N, N, /* 0xD8 - 0xDF */ @@ -3046,7 +3046,6 @@ special_insn: goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; emulate_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ -- cgit v1.2.3 From 8744aa9aad56be756a58126b429f176898631c3f Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Thu, 5 Aug 2010 15:42:49 +0300 Subject: KVM: x86 emulator: Add stc instruction (opcode 0xf9) Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d711d6aa8165..175b41690d6f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2269,7 +2269,7 @@ static struct opcode opcode_table[256] = { N, N, N, N, D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ - D(ImplicitOps), N, D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), }; @@ -3124,6 +3124,9 @@ special_insn: case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; break; + case 0xf9: /* stc */ + ctxt->eflags |= EFLG_CF; + break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) { 
emulate_gp(ctxt, 0); -- cgit v1.2.3 From 35c843c4857e2a818d1d951d87c40ee2cf5c1be8 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Aug 2010 11:34:56 +0800 Subject: KVM: x86 emulator: fix negative bit offset BitOp instruction emulation If bit offset operands is a negative number, BitOp instruction will return wrong value. This patch fix it. Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 175b41690d6f..5fc441c064ba 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -723,6 +723,22 @@ done: return rc; } +static void fetch_bit_operand(struct decode_cache *c) +{ + long sv, mask; + + if (c->dst.type == OP_MEM) { + mask = ~(c->dst.bytes * 8 - 1); + + if (c->src.bytes == 2) + sv = (s16)c->src.val & (s16)mask; + else if (c->src.bytes == 4) + sv = (s32)c->src.val & (s32)mask; + + c->dst.addr.mem += (sv >> 3); + } +} + static int read_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long addr, void *dest, unsigned size) @@ -2638,12 +2654,8 @@ done_prefixes: c->dst.bytes = 8; else c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->dst.type == OP_MEM && (c->d & BitOp)) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.addr.mem = c->dst.addr.mem + - (c->src.val & mask) / 8; - } + if (c->d & BitOp) + fetch_bit_operand(c); c->dst.orig_val = c->dst.val; break; case DstAcc: -- cgit v1.2.3 From 3885f18fe3034a10b3e3923885d70d31ba522844 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Aug 2010 11:37:37 +0800 Subject: KVM: x86 emulator: do not adjust the address for immediate source adjust the dst address for a register source but not adjust the address for an immediate source. 
Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5fc441c064ba..9b81cde8ffa2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -727,7 +727,7 @@ static void fetch_bit_operand(struct decode_cache *c) { long sv, mask; - if (c->dst.type == OP_MEM) { + if (c->dst.type == OP_MEM && c->src.type == OP_REG) { mask = ~(c->dst.bytes * 8 - 1); if (c->src.bytes == 2) -- cgit v1.2.3 From ba7ff2b76dcf05c4681c2648019b8301ada6f3df Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 9 Aug 2010 11:39:14 +0800 Subject: KVM: x86 emulator: mask group 8 instruction as BitOp Mask group 8 instruction as BitOp, so we can share the code for adjust the source operand. Signed-off-by: Wei Yongjun Reviewed-by: Paolo Bonzini Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9b81cde8ffa2..a9b2b9e6a3f0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -737,6 +737,9 @@ static void fetch_bit_operand(struct decode_cache *c) c->dst.addr.mem += (sv >> 3); } + + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, @@ -2336,7 +2339,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, - G(0, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ @@ -3419,8 +3422,6 @@ twobyte_insn: break; case 0xab: bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, 
ctxt->eflags); break; case 0xac: /* shrd imm8, r, r/m */ @@ -3448,8 +3449,6 @@ twobyte_insn: break; case 0xb3: btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; case 0xb6 ... 0xb7: /* movzx */ @@ -3471,8 +3470,6 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; case 0xbe ... 0xbf: /* movsx */ -- cgit v1.2.3 From 3f9f53b0d599aabb03db35208fb31768568ca83f Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 8 Aug 2010 21:11:37 +0300 Subject: KVM: x86 emulator: Add unary mul, imul, div, and idiv instructions This adds unary mul, imul, div, and idiv instructions (group 3 r/m 4-7). Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a9b2b9e6a3f0..f0415eab6591 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -315,6 +315,31 @@ struct group_dual { } \ } while (0) +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "1") \ + _op _suffix " %5; " \ + _POST_EFLAGS("0", "4", "1") \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +/* instruction has only one source operand, destination is implicit (e.g. 
mul, div, imul, idiv) */ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ + case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ + case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ + } \ + } while (0) + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ @@ -1373,6 +1398,8 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + unsigned long *rax = &c->regs[VCPU_REGS_RAX]; + unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1384,6 +1411,18 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, case 3: /* neg */ emulate_1op("neg", c->dst, ctxt->eflags); break; + case 4: /* mul */ + emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 5: /* imul */ + emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 6: /* div */ + emulate_1op_rax_rdx("div", c->src, *rax, *rdx, ctxt->eflags); + break; + case 7: /* idiv */ + emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags); + break; default: return 0; } @@ -2138,7 +2177,7 @@ static struct opcode group1A[] = { static struct opcode group3[] = { D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(Undefined)), + X4(D(SrcMem | ModRM)), }; static struct opcode group4[] = { -- cgit v1.2.3 From 8c5eee30a942cb3154f14f12407755ed7da74bbc Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 8 Aug 2010 21:11:38 +0300 Subject: KVM: x86 emulator: Fix emulate_grp3 return values This patch lets emulate_grp3() return X86EMUL_* return 
codes instead of hardcoded ones. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f0415eab6591..8617c344405d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1424,9 +1424,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags); break; default: - return 0; + return X86EMUL_UNHANDLEABLE; } - return 1; + return X86EMUL_CONTINUE; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -3172,7 +3172,7 @@ special_insn: ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - if (!emulate_grp3(ctxt, ops)) + if (emulate_grp3(ctxt, ops) != X86EMUL_CONTINUE) goto cannot_emulate; break; case 0xf8: /* clc */ -- cgit v1.2.3 From d9574a25afc3cd7ccd6a0bc05252bb84189e4021 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 10 Aug 2010 13:48:22 +0800 Subject: KVM: x86 emulator: add bsf/bsr instruction emulation Add bsf/bsr instruction emulation (opcode 0x0f 0xbc~0xbd) Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8617c344405d..f6b124fcc3fd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2379,8 +2379,8 @@ static struct opcode twobyte_table[256] = { /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ N, N, N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), @@ -3511,6 +3511,30 @@ 
twobyte_insn: btc: /* btc */ emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; + case 0xbc: { /* bsf */ + u8 zf; + __asm__ ("bsf %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } + case 0xbd: { /* bsr */ + u8 zf; + __asm__ ("bsr %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } case 0xbe ... 0xbf: /* movsx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : -- cgit v1.2.3 From 8ec4722dd2aab9b69befb919549ea0a5bfc9e670 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 16 Aug 2010 00:47:01 +0300 Subject: KVM: Separate emulation context initialization in a separate function The code for initializing the emulation context is duplicated at two locations (emulate_instruction() and kvm_task_switch()). Separate it in a separate function and call it from there. 
Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 54 +++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 768197a34d3e..c0004eb354d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3931,6 +3931,28 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception); } +static void init_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int cs_db, cs_l; + + cache_all_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +} + static int handle_emulation_failure(struct kvm_vcpu *vcpu) { ++vcpu->stat.insn_emulation_fail; @@ -3987,20 +4009,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception = -1; vcpu->arch.emulate_ctxt.perm_ok = false; @@ -5052,22 +5061,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int cs_db, cs_l, ret; - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + int ret; - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, -- cgit v1.2.3 From 31be40b3985f09c0c89b9e28a8206df32adba842 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Aug 2010 09:17:30 +0800 Subject: KVM: x86 emulator: put register operand write back to a function Introduce function write_register_operand() to write back the register operand. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 55 +++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f6b124fcc3fd..003713041ce6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1020,6 +1020,25 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static void write_register_operand(struct operand *op) +{ + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (op->bytes) { + case 1: + *(u8 *)op->addr.reg = (u8)op->val; + break; + case 2: + *(u16 *)op->addr.reg = (u16)op->val; + break; + case 4: + *op->addr.reg = (u32)op->val; + break; /* 64b: zero-extend */ + case 8: + *op->addr.reg = op->val; + break; + } +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1029,23 +1048,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, switch (c->dst.type) { case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.addr.reg = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.addr.reg = (u16)c->dst.val; - break; - case 4: - *c->dst.addr.reg = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.addr.reg = c->dst.val; - break; - } + write_register_operand(&c->dst); break; case OP_MEM: if (c->lock_prefix) @@ -2970,25 +2973,13 @@ special_insn: case 0x86 ... 0x87: /* xchg */ xchg: /* Write back the register source. 
*/ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.addr.reg = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.addr.reg = (u16) c->dst.val; - break; - case 4: - *c->src.addr.reg = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.addr.reg = c->dst.val; - break; - } + c->src.val = c->dst.val; + write_register_operand(&c->src); /* * Write back the memory destination with implicit LOCK * prefix. */ - c->dst.val = c->src.val; + c->dst.val = c->src.orig_val; c->lock_prefix = 1; break; case 0x88 ... 0x8b: /* mov */ -- cgit v1.2.3 From 92f738a52b53dc13b5dd5753634bdb8c59ac9815 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Aug 2010 09:19:34 +0800 Subject: KVM: x86 emulator: add XADD instruction emulation Add XADD instruction emulation (opcode 0x0f 0xc0~0xc1) Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 003713041ce6..0c08bffe6cb4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2385,7 +2385,8 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ - N, N, N, D(DstMem | SrcReg | ModRM | Mov), + D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), N, N, N, N, N, N, N, N, /* 0xD0 - 0xDF */ @@ -3531,6 +3532,12 @@ twobyte_insn: c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : (s16) c->src.val; break; + case 0xc0 ... 0xc1: /* xadd */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + /* Write back the register source. */ + c->src.val = c->dst.orig_val; + write_register_operand(&c->src); + break; case 0xc3: /* movnti */ c->dst.bytes = c->op_bytes; c->dst.val = (c->op_bytes == 4) ? 
(u32) c->src.val : -- cgit v1.2.3 From ee45b58efebc826ea2ade310f6e311702d4a5ab9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 17:10:07 +0800 Subject: KVM: x86 emulator: add setcc instruction emulation Add setcc instruction emulation (opcode 0x0f 0x90~0x9f) Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0c08bffe6cb4..df349f376da8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2362,7 +2362,7 @@ static struct opcode twobyte_table[256] = { /* 0x80 - 0x8F */ X16(D(SrcImm)), /* 0x90 - 0x9F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, D(DstMem | SrcReg | ModRM | BitOp), @@ -3424,6 +3424,9 @@ twobyte_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; + case 0x90 ... 
0x9f: /* setcc r/m8 */ + c->dst.val = test_cc(c->b, ctxt->eflags); + break; case 0xa0: /* push fs */ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; -- cgit v1.2.3 From c483c02ad35256206d6c45d7170fef1e33a43e9c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 15:36:36 +0800 Subject: KVM: x86 emulator: remove useless label from x86_emulate_insn() Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index df349f376da8..78541e8fd149 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2787,16 +2787,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) c->eip = ctxt->eip; } - if (c->src.type == OP_MEM) { - if (c->d & NoAccess) - goto no_fetch; + if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; - no_fetch: - ; } if (c->src2.type == OP_MEM) { -- cgit v1.2.3 From 943858e27544cd10e6095093a40be911a31892b1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 11:36:51 +0800 Subject: KVM: x86 emulator: introduce DstImmUByte for dst operand decode Introduce DstImmUByte for dst operand decode, which will be used for out instruction. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 78541e8fd149..dc074a0c60ca 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -54,6 +54,7 @@ #define DstAcc (4<<1) /* Destination Accumulator */ #define DstDI (5<<1) /* Destination is in ES:(E)DI */ #define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ @@ -2693,6 +2694,12 @@ done_prefixes: decode_register_operand(&c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; + case DstImmUByte: + c->dst.type = OP_IMM; + c->dst.addr.mem = c->eip; + c->dst.bytes = 1; + c->dst.val = insn_fetch(u8, 1, c->eip); + break; case DstMem: case DstMem64: c->dst = memop; -- cgit v1.2.3 From 41167be544603e077b866a2922737556dc2294e8 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 11:45:12 +0800 Subject: KVM: x86 emulator: change OUT instruction to use dst instead of src Change OUT instruction to use dst instead of src, so we can reuse those code for all out instructions. 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index dc074a0c60ca..8e12e1b11ff8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2321,12 +2321,12 @@ static struct opcode opcode_table[256] = { /* 0xE0 - 0xE7 */ N, N, N, N, D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), + D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), + D(ByteOp | SrcAcc | ImplicitOps), D(SrcAcc | ImplicitOps), /* 0xF0 - 0xF7 */ N, N, N, N, D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), @@ -3148,15 +3148,16 @@ special_insn: break; case 0xee: /* out dx,al */ case 0xef: /* out dx,(e/r)ax */ - c->src.val = c->regs[VCPU_REGS_RDX]; + c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->dst.val, + c->src.bytes)) { emulate_gp(ctxt, 0); goto done; } - ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, - ctxt->vcpu); + ops->pio_out_emulated(c->src.bytes, c->dst.val, + &c->src.val, 1, ctxt->vcpu); c->dst.type = OP_NONE; /* Disable writeback. 
*/ break; case 0xf4: /* hlt */ -- cgit v1.2.3 From a13a63faa6237001ed80d4f4051fc028dace10d9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 6 Aug 2010 11:46:12 +0800 Subject: KVM: x86 emulator: remove dup code of in/out instruction Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8e12e1b11ff8..cffe7c2819ed 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2923,28 +2923,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->dst.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - if (!pio_in_emulated(ctxt, ops, c->dst.bytes, - c->regs[VCPU_REGS_RDX], &c->dst.val)) - goto done; /* IO is needed, skip writeback */ - break; + c->src.val = c->regs[VCPU_REGS_RDX]; + goto do_io_in; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->src.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], - &c->src.val, 1, ctxt->vcpu); - - c->dst.type = OP_NONE; /* nothing to writeback */ + c->dst.val = c->regs[VCPU_REGS_RDX]; + goto do_io_out; break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) -- cgit v1.2.3 From 5c56e1cf7a758c4772e2470b4346a8219ec7f44e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Aug 2010 11:17:51 +0300 Subject: KVM: x86 emulator: fix INTn emulation not pushing EFLAGS and CS emulate_push() only schedules a push; it doesn't actually push anything. Call writeback() to flush out the write. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cffe7c2819ed..b89a20ec7c9d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1232,7 +1232,7 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, int irq) { struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; + int rc; struct desc_ptr dt; gva_t cs_addr; gva_t eip_addr; @@ -1242,14 +1242,25 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; c->src.val = c->eip; emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; ops->get_idt(&dt, ctxt->vcpu); -- cgit v1.2.3 From f6b33fc5046642b669c3197bf08639172e4cffad Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Aug 2010 11:20:37 +0300 Subject: KVM: x86 emulator: implement SCAS (opcodes AE, AF) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b89a20ec7c9d..09c9210db75d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2311,7 +2311,7 @@ static struct opcode opcode_table[256] = { D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String), D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), - D(ByteOp | DstDI | String), D(DstDI | String), + D(ByteOp | SrcAcc 
| DstDI | String), D(SrcAcc | DstDI | String), /* 0xB0 - 0xB7 */ X8(D(ByteOp | DstReg | SrcImm | Mov)), /* 0xB8 - 0xBF */ @@ -3046,8 +3046,7 @@ special_insn: case 0xac ... 0xad: /* lods */ goto mov; case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; + goto cmp; case 0xb0 ... 0xbf: /* mov r, imm */ goto mov; case 0xc0 ... 0xc1: -- cgit v1.2.3 From 0fa6ccbd281221bc7d46aff82d846e1f4c1985df Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Aug 2010 11:22:17 +0300 Subject: KVM: x86 emulator: fix REPZ/REPNZ termination condition EFLAGS.ZF needs to be checked after each iteration, not before. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 09c9210db75d..aab62d50752e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2781,28 +2781,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->restart = true; /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - string_done: ctxt->restart = false; ctxt->eip = c->eip; goto done; } - /* The second termination condition only applies for REPE - * and REPNE. 
Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - goto string_done; - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) - goto string_done; - } - c->eip = ctxt->eip; } if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { @@ -3229,20 +3211,37 @@ writeback: if (c->rep_prefix && (c->d & String)) { struct read_cache *rc = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + ctxt->restart = false; /* * Re-enter guest when pio read ahead buffer is empty or, * if it is not used, after each 1024 iteration. */ - if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (rc->end != 0 && rc->end == rc->pos)) + else if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || + (rc->end != 0 && rc->end == rc->pos)) { ctxt->restart = false; + c->eip = ctxt->eip; + } } /* * reset read cache here in case string instruction is restared * without decoding */ ctxt->decode.mem_read.end = 0; - ctxt->eip = c->eip; + if (!ctxt->restart) + ctxt->eip = c->eip; done: return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; -- cgit v1.2.3 From e8b6fa70e3545f0afd63434dbd0c5220d47205f6 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 18 Aug 2010 16:43:13 +0800 Subject: KVM: x86 emulator: add CBW/CWDE/CDQE instruction emulation Add CBW/CWDE/CDQE instruction emulation.(opcode 0x98) Used by FreeBSD's boot loader. Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aab62d50752e..312dda57f93b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2300,7 +2300,7 @@ static struct opcode opcode_table[256] = { /* 0x90 - 0x97 */ X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ - N, N, D(SrcImmFAddr | No64), N, + D(DstAcc | SrcNone), N, D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), @@ -3003,6 +3003,13 @@ special_insn: if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) break; goto xchg; + case 0x98: /* cbw/cwde/cdqe */ + switch (c->op_bytes) { + case 2: c->dst.val = (s8)c->dst.val; break; + case 4: c->dst.val = (s16)c->dst.val; break; + case 8: c->dst.val = (s32)c->dst.val; break; + } + break; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; emulate_push(ctxt, ops); -- cgit v1.2.3 From f2f31845341d22e4f20438b05e83d58e71b723b5 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 18 Aug 2010 16:38:21 +0800 Subject: KVM: x86 emulator: add LOOP/LOOPcc instruction emulation Add LOOP/LOOPcc instruction emulation (opcode 0xe0~0xe2). 
Signed-off-by: Wei Yongjun Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 312dda57f93b..2f816edfe31e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2330,7 +2330,7 @@ static struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - N, N, N, N, + X3(D(SrcImmByte)), N, D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ @@ -3084,6 +3084,12 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && + (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) + jmp_rel(c, c->src.val); + break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; -- cgit v1.2.3 From b3b3d25a12986fb08666823db3e9a74649a71925 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Aug 2010 17:49:52 +0300 Subject: KVM: x86 emulator: pass destination type to ____emulate_2op() We'll need it later so we can use a register for the destination. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2f816edfe31e..7818c91deb63 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -194,13 +194,13 @@ struct group_dual { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ + : "=m" (_eflags), "=m" (*(_dsttype*)&(_dst).val),\ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) @@ -213,13 +213,13 @@ struct group_dual { \ switch ((_dst).bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) @@ -229,7 +229,7 @@ struct group_dual { unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ -- cgit v1.2.3 From fb2c264105c64511dbd1a7488b482960895aace4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 16 Aug 2010 17:50:56 +0300 Subject: KVM: x86 emulator: Use a register for ____emulate_2op() destination Most x86 two operand instructions allow the destination to be a memory operand, but IMUL (for example) requires that the destination be a register. 
Change ____emulate_2op() to take a register for both source and destination so we can invoke IMUL. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7818c91deb63..81b0f8848960 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -200,7 +200,7 @@ struct group_dual { _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" (*(_dsttype*)&(_dst).val),\ + : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) -- cgit v1.2.3 From 7af04fc05cc185869271927eb470de3d25064b4a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 14:16:35 +0300 Subject: KVM: x86 emulator: implement DAS (opcode 2F) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 81b0f8848960..83ded7c03d12 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2175,6 +2175,45 @@ static int em_push(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_das(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u8 al, old_al; + bool af, cf, old_cf; + + cf = ctxt->eflags & X86_EFLAGS_CF; + al = c->dst.val; + + old_al = al; + old_cf = cf; + cf = false; + af = ctxt->eflags & X86_EFLAGS_AF; + if ((al & 0x0f) > 9 || af) { + al -= 6; + cf = old_cf | (al >= 250); + af = true; + } else { + af = false; + } + if (old_al > 0x99 || old_cf) { + al -= 0x60; + cf = true; + } + + c->dst.val = al; + /* Set PF, ZF, SF */ + c->src.type = OP_IMM; + c->src.val = 0; + c->src.bytes = 1; + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); + if (cf) + 
ctxt->eflags |= X86_EFLAGS_CF; + if (af) + ctxt->eflags |= X86_EFLAGS_AF; + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2258,7 +2297,8 @@ static struct opcode opcode_table[256] = { /* 0x28 - 0x2F */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), + N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), -- cgit v1.2.3 From 0ef753b8c323f5b8d75d7dc57ceef6b35982afdb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 14:51:45 +0300 Subject: KVM: x86 emulator: implement CALL FAR (FF /3) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 83ded7c03d12..313357793968 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2214,6 +2214,40 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_call_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u16 sel, old_cs; + ulong old_eip; + int rc; + + old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_eip = c->eip; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) + return X86EMUL_CONTINUE; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + + c->src.val = old_cs; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = old_eip; + 
emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2241,7 +2275,8 @@ static struct opcode group4[] = { static struct opcode group5[] = { D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - D(SrcMem | ModRM | Stack), N, + D(SrcMem | ModRM | Stack), + I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), D(SrcMem | ModRM | Stack), N, }; -- cgit v1.2.3 From b250e605895d02cede78922d034f7825af72a8b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 15:11:24 +0300 Subject: KVM: x86 emulator: add SrcImmU16 operand type Used for RET NEAR instructions. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 313357793968..db80e28471da 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -72,6 +72,7 @@ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcAcc (0xd<<4) /* Source Accumulator */ +#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -2678,13 +2679,17 @@ done_prefixes: srcmem_common: c->src = memop; break; + case SrcImmU16: + c->src.bytes = 2; + goto srcimm; case SrcImm: case SrcImmU: - c->src.type = OP_IMM; - c->src.addr.mem = c->eip; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; if (c->src.bytes == 8) c->src.bytes = 4; + srcimm: + c->src.type = OP_IMM; + c->src.addr.mem = c->eip; /* NB. Immediates are sign-extended as necessary. 
*/ switch (c->src.bytes) { case 1: @@ -2697,7 +2702,8 @@ done_prefixes: c->src.val = insn_fetch(s32, 4, c->eip); break; } - if ((c->d & SrcMask) == SrcImmU) { + if ((c->d & SrcMask) == SrcImmU + || (c->d & SrcMask) == SrcImmU16) { switch (c->src.bytes) { case 1: c->src.val &= 0xff; -- cgit v1.2.3 From 40ece7c7297da90e54e147d3bfbb4531f9fbc570 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 15:12:09 +0300 Subject: KVM: x86 emulator: implement RET imm16 (opcode C2) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index db80e28471da..9e58f5054c39 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2249,6 +2249,21 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2394,7 +2409,9 @@ static struct opcode opcode_table[256] = { X8(D(DstReg | SrcImm | Mov)), /* 0xC0 - 0xC7 */ D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), - N, D(ImplicitOps | Stack), N, N, + I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), + D(ImplicitOps | Stack), + N, N, D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), -- cgit v1.2.3 From f3a1b9f49647133e8c6eb6a68399ed8dbd61554a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:25:25 +0300 Subject: 
KVM: x86 emulator: implement IMUL REG, R/M, imm8 (opcode 6B) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9e58f5054c39..618386f80518 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2264,6 +2264,15 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2371,7 +2380,8 @@ static struct opcode opcode_table[256] = { N, N, N, N, /* 0x68 - 0x6F */ I(SrcImm | Mov | Stack, em_push), N, - I(SrcImmByte | Mov | Stack, em_push), N, + I(SrcImmByte | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ -- cgit v1.2.3 From 5c82aa29988c0160d91f75cceebd0a07d8f2406b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:31:43 +0300 Subject: KVM: x86 emulator: implement IMUL REG, R/M (opcode 0F AF) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 618386f80518..a4d2a469b4ab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2264,15 +2264,22 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +static int em_imul(struct x86_emulate_ctxt 
*ctxt) { struct decode_cache *c = &ctxt->decode; - c->dst.val = c->src2.val; emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); return X86EMUL_CONTINUE; } +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + return em_imul(ctxt); +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2488,7 +2495,7 @@ static struct opcode twobyte_table[256] = { N, D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), N, + D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | BitOp | Lock), -- cgit v1.2.3 From 7077aec0bcd2f827aeb84ccc56c6f4367c376436 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:53:43 +0300 Subject: KVM: x86 emulator: remove SrcImplicit Useless. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a4d2a469b4ab..7f7fc646678a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -58,7 +58,6 @@ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ -#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ #define SrcReg (1<<4) /* Register operand. */ #define SrcMem (2<<4) /* Memory operand. */ #define SrcMem16 (3<<4) /* Memory operand (16-bit). 
*/ @@ -2435,7 +2434,7 @@ static struct opcode opcode_table[256] = { D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM), - D(ByteOp | DstMem | SrcImplicit | ModRM), D(DstMem | SrcImplicit | ModRM), + D(ByteOp | DstMem | ModRM), D(DstMem | ModRM), N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, -- cgit v1.2.3 From 48bb5d3c401679e41e7a7f06ca31b3e54a6168f7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 18:54:34 +0300 Subject: KVM: x86 emulator: implement RDTSC (opcode 0F 31) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7f7fc646678a..ed192d220201 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2279,6 +2279,22 @@ static int em_imul_3op(struct x86_emulate_ctxt *ctxt) return em_imul(ctxt); } +static int em_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); + struct decode_cache *c = &ctxt->decode; + u64 tsc = 0; + + if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + c->regs[VCPU_REGS_RAX] = (u32)tsc; + c->regs[VCPU_REGS_RDX] = tsc >> 32; + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2469,7 +2485,8 @@ static struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), + D(ImplicitOps | Priv), N, D(ImplicitOps), D(ImplicitOps | Priv), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ -- cgit v1.2.3 From 39f21ee546cf7d563d813c5fb4473431c1d8fce7 Mon Sep 17 00:00:00 2001 
From: Avi Kivity Date: Wed, 18 Aug 2010 19:20:21 +0300 Subject: KVM: x86 emulator: consolidate immediate decode into a function Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 109 +++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ed192d220201..95543a6beb53 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2541,6 +2541,55 @@ static struct opcode twobyte_table[256] = { #undef GD #undef I +static unsigned imm_size(struct decode_cache *c) +{ + unsigned size; + + size = (c->d & ByteOp) ? 1 : c->op_bytes; + if (size == 8) + size = 4; + return size; +} + +static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned size, bool sign_extension) +{ + struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; + int rc = X86EMUL_CONTINUE; + + op->type = OP_IMM; + op->bytes = size; + op->addr.mem = c->eip; + /* NB. Immediates are sign-extended as necessary. */ + switch (op->bytes) { + case 1: + op->val = insn_fetch(s8, 1, c->eip); + break; + case 2: + op->val = insn_fetch(s16, 2, c->eip); + break; + case 4: + op->val = insn_fetch(s32, 4, c->eip); + break; + } + if (!sign_extension) { + switch (op->bytes) { + case 1: + op->val &= 0xff; + break; + case 2: + op->val &= 0xffff; + break; + case 4: + op->val &= 0xffffffff; + break; + } + } +done: + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt) { @@ -2730,52 +2779,19 @@ done_prefixes: c->src = memop; break; case SrcImmU16: - c->src.bytes = 2; - goto srcimm; + rc = decode_imm(ctxt, &c->src, 2, false); + break; case SrcImm: + rc = decode_imm(ctxt, &c->src, imm_size(c), true); + break; case SrcImmU: - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - srcimm: - c->src.type = OP_IMM; - c->src.addr.mem = c->eip; - /* NB. Immediates are sign-extended as necessary. 
*/ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - if ((c->d & SrcMask) == SrcImmU - || (c->d & SrcMask) == SrcImmU16) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } + rc = decode_imm(ctxt, &c->src, imm_size(c), false); break; case SrcImmByte: + rc = decode_imm(ctxt, &c->src, 1, true); + break; case SrcImmUByte: - c->src.type = OP_IMM; - c->src.addr.mem = c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); - else - c->src.val = insn_fetch(u8, 1, c->eip); + rc = decode_imm(ctxt, &c->src, 1, false); break; case SrcAcc: c->src.type = OP_REG; @@ -2807,6 +2823,9 @@ done_prefixes: break; } + if (rc != X86EMUL_CONTINUE) + goto done; + /* * Decode and fetch the second source operand: register, memory * or immediate. @@ -2819,10 +2838,7 @@ done_prefixes: c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; break; case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.addr.mem = c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); + rc = decode_imm(ctxt, &c->src2, 1, true); break; case Src2One: c->src2.bytes = 1; @@ -2830,6 +2846,9 @@ done_prefixes: break; } + if (rc != X86EMUL_CONTINUE) + goto done; + /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { case DstReg: -- cgit v1.2.3 From 7db41eb76244ae623de842e818e459755968a33b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 19:25:28 +0300 Subject: KVM: x86 emulator: add Src2Imm decoding Needed for 3-operand IMUL. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 95543a6beb53..f456d7e11b3e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -95,6 +95,7 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm (4<<29) #define Src2Mask (7<<29) #define X2(x...) x, x @@ -2844,6 +2845,9 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = 1; break; + case Src2Imm: + rc = decode_imm(ctxt, &c->src2, imm_size(c), true); + break; } if (rc != X86EMUL_CONTINUE) -- cgit v1.2.3 From d46164dbd936bc11c7d2abed62f05b31c7a79ae7 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 18 Aug 2010 19:29:33 +0300 Subject: KVM: x86 emulator: implement IMUL REG, R/M, IMM (opcode 69) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f456d7e11b3e..55849c3d5d8c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2402,7 +2402,8 @@ static struct opcode opcode_table[256] = { N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ - I(SrcImm | Mov | Stack, em_push), N, + I(SrcImm | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ -- cgit v1.2.3 From 61429142802b068609ffd8ef48d891e05eeea0b9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 19 Aug 2010 15:13:00 +0300 Subject: KVM: x86 emulator: implement CWD (opcode 99) Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 
55849c3d5d8c..e257f2286866 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2280,6 +2280,18 @@ static int em_imul_3op(struct x86_emulate_ctxt *ctxt) return em_imul(ctxt); } +static int em_cwd(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.bytes = c->src.bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; + c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); + + return X86EMUL_CONTINUE; +} + static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); @@ -2425,7 +2437,8 @@ static struct opcode opcode_table[256] = { /* 0x90 - 0x97 */ X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ - D(DstAcc | SrcNone), N, D(SrcImmFAddr | No64), N, + D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), + D(SrcImmFAddr | No64), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), -- cgit v1.2.3 From e0df7b9f6cee43c01d6f4a8491bccfd410cb86e1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:05 -0700 Subject: KVM: abstract kvm x86 mmu->n_free_mmu_pages "free" is a poor name for this value. In this context, it means, "the number of mmu pages which this kvm instance should be able to allocate." But "free" implies much more that the objects are there and ready for use. "available" is a much better description, especially when you see how it is calculated. In this patch, we abstract its use into a function. We'll soon replace the function's contents by calculating the value in a different way. All of the reads of n_free_mmu_pages are taken care of in this patch. The modification sites will be handled in a patch later in the series. 
Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 ++++------- arch/x86/kvm/mmu.h | 7 ++++++- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff95d418750d..625b17894661 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) int used_pages; LIST_HEAD(invalid_list); - used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; + used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm); used_pages = max(0, used_pages); /* @@ -2959,18 +2959,15 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - int free_pages; LIST_HEAD(invalid_list); - free_pages = vcpu->kvm->arch.n_free_mmu_pages; - while (free_pages < KVM_REFILL_PAGES && + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3145,7 +3142,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; + kvm_mmu_available_pages(kvm); cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index be66759321a5..c3a689ae7df0 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -50,9 +50,14 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu 
*vcpu, u64 addr, u64 sptes[4]); +static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +{ + return kvm->arch.n_free_mmu_pages; +} + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) __kvm_mmu_free_some_pages(vcpu); } -- cgit v1.2.3 From 39de71ec5397f374aed95e99509372d605e1407c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:14 -0700 Subject: KVM: rename x86 kvm->arch.n_alloc_mmu_pages arch.n_alloc_mmu_pages is a poor choice of name. This value truly means, "the number of pages which _may_ be allocated". But, reading the name, "n_alloc_mmu_pages" implies "the number of allocated mmu pages", which is dead wrong. It's really the high watermark, so let's give it a name to match: n_max_mmu_pages. This change will make the next few patches much more obvious and easy to read. Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 8 ++++---- arch/x86/kvm/x86.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c52e2eb40a1e..02963684cd28 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -369,7 +369,7 @@ struct kvm_vcpu_arch { struct kvm_arch { unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; + unsigned int n_max_mmu_pages; atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 625b17894661..6979e7d1464e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1696,7 +1696,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) int used_pages; LIST_HEAD(invalid_list); - 
used_pages = kvm->arch.n_alloc_mmu_pages - kvm_mmu_available_pages(kvm); + used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm); used_pages = max(0, used_pages); /* @@ -1721,9 +1721,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) } else kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; + - kvm->arch.n_max_mmu_pages; - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -3141,7 +3141,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_alloc_mmu_pages - + npages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm); cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0004eb354d3..4b4d2836240f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,7 +2759,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_alloc_mmu_pages; + return kvm->arch.n_max_mmu_pages; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -- cgit v1.2.3 From 49d5ca26636cb8feb05aff92fc4dba3e494ec683 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:28 -0700 Subject: KVM: replace x86 kvm n_free_mmu_pages with n_used_mmu_pages Doing this makes the code much more readable. That's borne out by the fact that this patch removes code. "used" also happens to be the number that we need to return back to the slab code when our shrinker gets called. Keeping this value as opposed to free makes the next patch simpler. So, 'struct kvm' is kzalloc()'d. 'struct kvm_arch' is a structure member (and not a pointer) of 'struct kvm'. That means they start out zeroed. 
I _think_ they get initialized properly by kvm_mmu_change_mmu_pages(). But, that only happens via kvm ioctls. Another benefit of storing 'used' instead of 'free' is that the values are consistent from the moment the structure is allocated: no negative "used" value. Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 27 +++++++++------------------ arch/x86/kvm/mmu.h | 3 ++- 3 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 02963684cd28..e01b72825564 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -367,7 +367,7 @@ struct kvm_vcpu_arch { }; struct kvm_arch { - unsigned int n_free_mmu_pages; + unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_max_mmu_pages; atomic_t invlpg_counter; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6979e7d1464e..ff39b85d7a4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -980,7 +980,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); - ++kvm->arch.n_free_mmu_pages; + --kvm->arch.n_used_mmu_pages; } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -1003,7 +1003,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; + ++vcpu->kvm->arch.n_used_mmu_pages; return sp; } @@ -1689,41 +1689,32 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, /* * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock + * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void 
kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { - int used_pages; LIST_HEAD(invalid_list); - - used_pages = kvm->arch.n_max_mmu_pages - kvm_mmu_available_pages(kvm); - used_pages = max(0, used_pages); - /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages && + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); - kvm_nr_mmu_pages = used_pages; - kvm->arch.n_free_mmu_pages = 0; + goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_max_mmu_pages; - kvm->arch.n_max_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c3a689ae7df0..f05a03dfba4e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -52,7 +52,8 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { - return kvm->arch.n_free_mmu_pages; + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; } static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 45221ab6684a82a5b60208b76d6f8bfb1bbcb969 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 19 Aug 2010 18:11:37 -0700 Subject: KVM: create aggregate 
kvm_total_used_mmu_pages value Of slab shrinkers, the VM code says: * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is * querying the cache size, so a fastpath for that case is appropriate. and it *means* it. Look at how it calls the shrinkers: nr_before = (*shrinker->shrink)(0, gfp_mask); shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); So, if you do anything stupid in your shrinker, the VM will doubly punish you. The mmu_shrink() function takes the global kvm_lock, then acquires every VM's kvm->mmu_lock in sequence. If we have 100 VMs, then we're going to take 101 locks. We do it twice, so each call takes 202 locks. If we're under memory pressure, we can have each cpu trying to do this. It can get really hairy, and we've seen lock spinning in mmu_shrink() be the dominant entry in profiles. This is guaranteed to optimize at least half of those lock acquisitions away. It removes the need to take any of the locks when simply trying to count objects. A 'percpu_counter' can be a large object, but we only have one of these for the entire system. There are not any better alternatives at the moment, especially ones that handle CPU hotplug. 
Signed-off-by: Dave Hansen Signed-off-by: Tim Pepper Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff39b85d7a4d..33d7af50cf8e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,6 +178,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; +static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; @@ -971,6 +972,18 @@ static int is_empty_shadow_page(u64 *spt) } #endif +/* + * This value is the sum of all of the kvm instances's + * kvm->arch.n_used_mmu_pages values. We need a global, + * aggregate version in order to make the slab shrinker + * faster + */ +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +{ + kvm->arch.n_used_mmu_pages += nr; + percpu_counter_add(&kvm_total_used_mmu_pages, nr); +} + static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); @@ -980,7 +993,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); - --kvm->arch.n_used_mmu_pages; + kvm_mod_used_mmu_pages(kvm, -1); } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -1003,7 +1016,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - ++vcpu->kvm->arch.n_used_mmu_pages; + kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -3122,23 +3135,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, 
gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; - int cache_count = 0; + + if (nr_to_scan == 0) + goto out; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages, idx, freed_pages; + int idx, freed_pages; LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_max_mmu_pages - - kvm_mmu_available_pages(kvm); - cache_count += npages; - if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + if (!kvm_freed && nr_to_scan > 0 && + kvm->arch.n_used_mmu_pages > 0) { freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); - cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; @@ -3152,7 +3164,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) spin_unlock(&kvm_lock); - return cache_count; +out: + return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { @@ -3195,6 +3208,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; + percpu_counter_init(&kvm_total_used_mmu_pages, 0); register_shrinker(&mmu_shrinker); return 0; -- cgit v1.2.3 From 09b5f4d3c4aa2d4928c0a3723a8de26a76b6339e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 23 Aug 2010 14:56:54 +0800 Subject: KVM: x86 emulator: add LDS/LES/LFS/LGS/LSS instruction emulation Add LDS/LES/LFS/LGS/LSS instruction emulation. 
(opcode 0xc4, 0xc5, 0x0f 0xb2, 0x0f 0xb4~0xb5) Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e257f2286866..aece501edce4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1514,6 +1514,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned short sel; + int rc; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ops, sel, seg); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.val = c->src.val; + return rc; +} + static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2458,7 +2475,7 @@ static struct opcode opcode_table[256] = { D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), - N, N, + D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), @@ -2529,9 +2546,9 @@ static struct opcode twobyte_table[256] = { D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), - N, N, D(ByteOp | DstReg | SrcMem | ModRM | Mov), - D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | 
SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), @@ -3214,6 +3231,16 @@ special_insn: c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; + case 0xc4: /* les */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xc5: /* lds */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ mov: c->dst.val = c->src.val; @@ -3659,10 +3686,25 @@ twobyte_insn: c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; + case 0xb2: /* lss */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xb3: btr: /* btr */ emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; + case 0xb4: /* lfs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (u8) c->src.val -- cgit v1.2.3 From e4abac67b756680c63af369f053d11991616aeb4 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 19 Aug 2010 14:25:48 +0800 Subject: KVM: x86 emulator: add JrCXZ instruction emulation Add JrCXZ instruction emulation (opcode 0xe3) Used by FreeBSD boot loader. 
Signed-off-by: Wei Yongjun Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aece501edce4..312e798d5425 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2487,7 +2487,7 @@ static struct opcode opcode_table[256] = { /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ - X3(D(SrcImmByte)), N, + X4(D(SrcImmByte)), D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ @@ -3285,6 +3285,10 @@ special_insn: (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) jmp_rel(c, c->src.val); break; + case 0xe3: /* jcxz/jecxz/jrcxz */ + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(c, c->src.val); + break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; -- cgit v1.2.3 From 80b63faf028fba79e630d3643b0e615bddf4067b Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 24 Aug 2010 10:31:07 +0800 Subject: KVM: MMU: fix regression from rework mmu_shrink() code The latest kvm mmu_shrink code rework makes the kernel change kvm->arch.n_used_mmu_pages/ kvm->arch.n_max_mmu_pages at kvm_mmu_free_page/kvm_mmu_alloc_page, which is called by kvm_mmu_commit_zap_page. So kvm->arch.n_used_mmu_pages or kvm_mmu_available_pages(vcpu->kvm) is unchanged after kvm_mmu_prepare_zap_page(). This caused kvm_mmu_change_mmu_pages/__kvm_mmu_free_some_pages to loop forever. Moving kvm_mmu_commit_zap_page inside the loop makes the while loop behave as normal. 
Reported-by: Avi Kivity Signed-off-by: Xiaotian Feng Tested-by: Avi Kivity Cc: Marcelo Tosatti Cc: Dave Hansen Cc: Tim Pepper Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 33d7af50cf8e..c2ac7004441a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1720,10 +1720,9 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_prepare_zap_page(kvm, page, - &invalid_list); + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -2972,9 +2971,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) -- cgit v1.2.3 From 45bf21a8ce7a2884f067a702a5c7683684846ce1 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 23 Aug 2010 16:13:15 +0800 Subject: KVM: MMU: fix missing percpu counter destroy Commit ad05c88266b4cce1c820928ce8a0fb7690912ba1 (KVM: create aggregate kvm_total_used_mmu_pages value) introduced the percpu counter kvm_total_used_mmu_pages but never destroys it; this may cause an oops on rmmod & modprobe. 
Signed-off-by: Wei Yongjun Acked-by: Tim Pepper Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c2ac7004441a..54a50268cebf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3185,6 +3185,7 @@ static void mmu_destroy_caches(void) void kvm_mmu_module_exit(void) { mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); } @@ -3207,7 +3208,9 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - percpu_counter_init(&kvm_total_used_mmu_pages, 0); + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + goto nomem; + register_shrinker(&mmu_shrinker); return 0; -- cgit v1.2.3 From ae38436b78a8abff767e2ac10e2cd663a7eef476 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:15 -1000 Subject: KVM: x86: Drop vm_init_tsc This is used only by the VMX code, and is not done properly; if the TSC is indeed backwards, it is out of sync, and will need proper handling in the logic at each and every CPU change. For now, drop this test during init as misguided. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/vmx.c | 10 +++------- arch/x86/kvm/x86.c | 2 -- 3 files changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e01b72825564..6056a23dc4cf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -394,7 +394,6 @@ struct kvm_arch { gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - u64 vm_init_tsc; s64 kvmclock_offset; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 751a2d29f4ce..4fbab2469bf9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2512,7 +2512,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this, tsc_base; + u64 host_pat, tsc_this; unsigned long a; struct desc_ptr dt; int i; @@ -2653,12 +2653,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; - - guest_write_tsc(0, tsc_base); + tsc_this = native_read_tsc(); + guest_write_tsc(0, tsc_this); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b4d2836240f..8b0c51a1adaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5495,8 +5495,6 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - rdtscll(kvm->arch.vm_init_tsc); - return kvm; } -- cgit v1.2.3 From f4e1b3c8bd2a044cd0ccf80595bfd088a49fe60b Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:16 -1000 Subject: KVM: x86: Convert TSC 
writes to TSC offset writes Change svm / vmx to be the same internally and write TSC offset instead of bare TSC in helper functions. Isolated as a single patch to contain code movement. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 31 +++++++++++++++++-------------- arch/x86/kvm/vmx.c | 11 +++++------ 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af5b9ea51965..e06f00d1f15c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -701,6 +701,20 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = offset; + } + + svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -901,7 +915,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - svm->vmcb->control.tsc_offset = 0-native_read_tsc(); + svm_write_tsc_offset(&svm->vcpu, 0-native_read_tsc()); err = fx_init(&svm->vcpu); if (err) @@ -2566,20 +2580,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TSC: { - u64 tsc_offset = data - native_read_tsc(); - u64 g_tsc_offset = 0; - - if (is_nested(svm)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = tsc_offset; - } - - svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - + case MSR_IA32_TSC: + svm_write_tsc_offset(vcpu, data - 
native_read_tsc()); break; - } case MSR_STAR: svm->vmcb->save.star = data; break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4fbab2469bf9..d9bec5ee38b8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1149,9 +1149,9 @@ static u64 guest_read_tsc(void) * writes 'guest_tsc' into guest's timestamp counter "register" * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc */ -static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) +static void vmx_write_tsc_offset(u64 offset) { - vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); + vmcs_write64(TSC_OFFSET, offset); } /* @@ -1255,7 +1255,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; case MSR_IA32_TSC: rdtscll(host_tsc); - guest_write_tsc(data, host_tsc); + vmx_write_tsc_offset(data - host_tsc); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2512,7 +2512,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this; + u64 host_pat; unsigned long a; struct desc_ptr dt; int i; @@ -2653,8 +2653,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - tsc_this = native_read_tsc(); - guest_write_tsc(0, tsc_this); + vmx_write_tsc_offset(0-native_read_tsc()); return 0; } -- cgit v1.2.3 From 99e3e30aee1a326a98bf3a5f47b8622219c685f3 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:17 -1000 Subject: KVM: x86: Move TSC offset writes to common code Also, ensure that the storing of the offset and the reading of the TSC are never preempted by taking a spinlock. While the lock is overkill now, it is useful later in this patch series. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 6 ++++-- arch/x86/kvm/vmx.c | 13 ++++++------- arch/x86/kvm/x86.c | 18 ++++++++++++++++++ arch/x86/kvm/x86.h | 2 ++ 5 files changed, 33 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6056a23dc4cf..a215153f1ff6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -395,6 +395,7 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; + spinlock_t tsc_write_lock; struct kvm_xen_hvm_config xen_hvm_config; @@ -521,6 +522,8 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + const struct trace_print_flags *exit_reasons_str; }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e06f00d1f15c..ea41c551fa44 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -915,7 +915,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - svm_write_tsc_offset(&svm->vcpu, 0-native_read_tsc()); + kvm_write_tsc(&svm->vcpu, 0); err = fx_init(&svm->vcpu); if (err) @@ -2581,7 +2581,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) switch (ecx) { case MSR_IA32_TSC: - svm_write_tsc_offset(vcpu, data - native_read_tsc()); + kvm_write_tsc(vcpu, data); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3551,6 +3551,8 @@ static struct kvm_x86_ops svm_x86_ops = { .set_supported_cpuid = svm_set_supported_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, + + .write_tsc_offset = svm_write_tsc_offset, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d9bec5ee38b8..138746d3afe9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1146,10 +1146,9 @@ static u64 
guest_read_tsc(void) } /* - * writes 'guest_tsc' into guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + * writes 'offset' into guest's timestamp counter offset register */ -static void vmx_write_tsc_offset(u64 offset) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { vmcs_write64(TSC_OFFSET, offset); } @@ -1224,7 +1223,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; - u64 host_tsc; int ret = 0; switch (msr_index) { @@ -1254,8 +1252,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - rdtscll(host_tsc); - vmx_write_tsc_offset(data - host_tsc); + kvm_write_tsc(vcpu, data); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2653,7 +2650,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - vmx_write_tsc_offset(0-native_read_tsc()); + kvm_write_tsc(&vmx->vcpu, 0); return 0; } @@ -4348,6 +4345,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_supported_cpuid = vmx_set_supported_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .write_tsc_offset = vmx_write_tsc_offset, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b0c51a1adaa..886132b6ef14 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -895,6 +895,22 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset; + unsigned long flags; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = data - native_read_tsc(); + 
kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + static void kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; @@ -5495,6 +5511,8 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + spin_lock_init(&kvm->arch.tsc_write_lock); + return kvm; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a404722d2b..2d6385e44ccf 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -68,4 +68,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); + #endif -- cgit v1.2.3 From f38e098ff3a315bb74abbb4a35cba11bbea8e2fa Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:20 -1000 Subject: KVM: x86: TSC reset compensation Attempt to synchronize TSCs which are reset to the same value. In the case of a reliable hardware TSC, we can just re-use the same offset, but on non-reliable hardware, we can get closer by adjusting the offset to match the elapsed time. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/x86.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a215153f1ff6..57b4394491ec 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,6 +396,9 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; spinlock_t tsc_write_lock; + u64 last_tsc_nsec; + u64 last_tsc_offset; + u64 last_tsc_write; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 886132b6ef14..e7da14c317e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -898,11 +898,40 @@ static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; - u64 offset; + u64 offset, ns, elapsed; unsigned long flags; + struct timespec ts; spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + ns = timespec_to_ns(&ts); + elapsed = ns - kvm->arch.last_tsc_nsec; + + /* + * Special case: identical write to TSC within 5 seconds of + * another CPU is interpreted as an attempt to synchronize + * (the 5 seconds is to accomodate host load / swapping). + * + * In that case, for a reliable TSC, we can match TSC offsets, + * or make a best guest using kernel_ns value. 
+ */ + if (data == kvm->arch.last_tsc_write && elapsed < 5ULL * NSEC_PER_SEC) { + if (!check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + u64 tsc_delta = elapsed * __get_cpu_var(cpu_tsc_khz); + tsc_delta = tsc_delta / USEC_PER_SEC; + offset += tsc_delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", tsc_delta); + } + ns = kvm->arch.last_tsc_nsec; + } + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3 From 8cfdc0008542b57caadbfe013da163131a8293f4 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:21 -1000 Subject: KVM: x86: Make cpu_tsc_khz updates use local CPU This simplifies much of the init code; we can now simply always call tsc_khz_changed, optionally passing it a new value, or letting it figure out the existing value (while interrupts are disabled, and thus, by inference from the rule, not raceful against CPU hotplug or frequency updates, which will issue IPIs to the local CPU to perform this very same task). 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 157 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e7da14c317e6..699c6b89c1b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -895,6 +895,15 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +static inline int kvm_tsc_changes_freq(void) +{ + int cpu = get_cpu(); + int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + cpufreq_quick_get(cpu) != 0; + put_cpu(); + return ret; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; @@ -940,7 +949,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_write_tsc); -static void kvm_write_guest_time(struct kvm_vcpu *v) +static int kvm_write_guest_time(struct kvm_vcpu *v) { struct timespec ts; unsigned long flags; @@ -949,24 +958,27 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) unsigned long this_tsc_khz; if ((!vcpu->time_page)) - return; - - this_tsc_khz = get_cpu_var(cpu_tsc_khz); - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; - } - put_cpu_var(cpu_tsc_khz); + return 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); ktime_get_ts(&ts); monotonic_to_bootbased(&ts); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); local_irq_restore(flags); - /* With all the info we got, fill in the values */ + if (unlikely(this_tsc_khz == 0)) { + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); + return 1; + } + if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + vcpu->hv_clock_tsc_khz = this_tsc_khz; + } + + /* 
With all the info we got, fill in the values */ vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; @@ -987,6 +999,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + return 0; } static int kvm_request_guest_time_update(struct kvm_vcpu *v) @@ -1853,12 +1866,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } kvm_request_guest_time_update(vcpu); } @@ -4152,9 +4159,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void bounce_off(void *info) +static void tsc_bad(void *info) +{ + __get_cpu_var(cpu_tsc_khz) = 0; +} + +static void tsc_khz_changed(void *data) { - /* nothing */ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + __get_cpu_var(cpu_tsc_khz) = khz; } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4165,11 +4186,51 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. 
+ * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. 
+ * + */ + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { @@ -4179,7 +4240,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va if (!kvm_request_guest_time_update(vcpu)) continue; if (vcpu->cpu != smp_processor_id()) - send_ipi++; + send_ipi = 1; } } spin_unlock(&kvm_lock); @@ -4197,32 +4258,48 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX }; static void kvm_timer_init(void) { int cpu; + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_online_cpu(cpu) { - unsigned long khz = cpufreq_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } - } else { - 
for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -4324,6 +4401,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -4739,8 +4817,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) - kvm_write_guest_time(vcpu); + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) { + r = kvm_write_guest_time(vcpu); + if (unlikely(r)) + goto out; + } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) @@ -5423,17 +5504,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void *garbage) { - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. 
- */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; - } - kvm_shared_msr_cpu_online(); - return kvm_x86_ops->hardware_enable(garbage); } -- cgit v1.2.3 From 6755bae8e69093b2994b6f29cd3eaecdf610374e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:22 -1000 Subject: KVM: x86: Warn about unstable TSC If creating an SMP guest with unstable host TSC, issue a warning Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 699c6b89c1b4..a8dee58e8716 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5457,6 +5457,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + printk_once(KERN_WARNING + "kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); return kvm_x86_ops->vcpu_create(kvm, id); } -- cgit v1.2.3 From e48672fa25e879f7ae21785c7efd187738139593 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:23 -1000 Subject: KVM: x86: Unify TSC logic Move the TSC control logic from the vendor backends into x86.c by adding adjust_tsc_offset to x86 ops. Now all TSC decisions can be done in one place. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/svm.c | 26 ++++++++++---------------- arch/x86/kvm/vmx.c | 22 ++++++++-------------- arch/x86/kvm/x86.c | 17 ++++++++++++++--- 4 files changed, 35 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 57b4394491ec..5ab1c3fb34ef 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -255,7 +255,6 @@ struct kvm_mmu { }; struct kvm_vcpu_arch { - u64 host_tsc; /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -336,9 +335,10 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; + unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + u64 last_host_tsc; bool nmi_pending; bool nmi_injected; @@ -520,6 +520,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ea41c551fa44..ff28f6521065 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -715,6 +715,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; } +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.tsc_offset += adjustment; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += adjustment; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -961,20 +970,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu 
!= vcpu->cpu)) { - u64 delta; - - if (check_tsc_unstable()) { - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; - } - vcpu->cpu = cpu; - kvm_migrate_timers(vcpu); svm->asid_generation = 0; } @@ -990,8 +985,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - vcpu->arch.host_tsc = native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -3553,6 +3546,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .write_tsc_offset = svm_write_tsc_offset, + .adjust_tsc_offset = svm_adjust_tsc_offset, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 138746d3afe9..275a81d571cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -505,7 +505,6 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); list_del(&vmx->local_vcpus_link); vmx->vcpu.cpu = -1; vmx->launched = 0; @@ -881,7 +880,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tsc_this, delta, new_offset; u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); if (!vmm_exclusive) @@ -898,14 +896,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; - kvm_migrate_timers(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); local_irq_enable(); - vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set 
these when switching * processors. @@ -915,16 +911,6 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - /* - * Make sure the time stamp counter is monotonous. - */ - rdtscll(tsc_this); - if (tsc_this < vcpu->arch.host_tsc) { - delta = vcpu->arch.host_tsc - tsc_this; - new_offset = vmcs_read64(TSC_OFFSET) + delta; - vmcs_write64(TSC_OFFSET, new_offset); - } } } @@ -1153,6 +1139,12 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcs_write64(TSC_OFFSET, offset); } +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +{ + u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -4108,6 +4100,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); @@ -4347,6 +4340,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .write_tsc_offset = vmx_write_tsc_offset, + .adjust_tsc_offset = vmx_adjust_tsc_offset, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a8dee58e8716..468fafaed1ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -973,9 +973,9 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) return 1; } - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; + vcpu->hw_tsc_khz = this_tsc_khz; } /* With all the info we got, fill in the values */ @@ -1866,13 +1866,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - 
kvm_request_guest_time_update(vcpu); + if (unlikely(vcpu->cpu != cpu)) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + mark_tsc_unstable("KVM discovered backwards TSC"); + if (check_tsc_unstable()) + kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; + } } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int is_efer_nx(void) -- cgit v1.2.3 From 48434c20e18d59001469699fcaaf9cf30b815a20 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:24 -1000 Subject: KVM: x86: Fix deep C-state TSC desynchronization When CPUs with unstable TSCs enter deep C-state, TSC may stop running. This causes us to require resynchronization. Since we can't tell when this may potentially happen, we assume the worst by forcing re-compensation for it at every point the VCPU task is descheduled. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 468fafaed1ae..9396b3f2c594 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1866,7 +1866,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(vcpu->cpu != cpu)) { + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { /* Make sure TSC doesn't go backwards */ s64 tsc_delta = !vcpu->arch.last_host_tsc ? 
0 : native_read_tsc() - vcpu->arch.last_host_tsc; -- cgit v1.2.3 From 759379dd68c2885d1fafa433083d4487e710a685 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:25 -1000 Subject: KVM: x86: Add helper functions for time computation Add a helper function to compute the kernel time and convert nanoseconds back to CPU specific cycles. Note that these must not be called in preemptible context, as that would mean the kernel could enter software suspend state, which would cause non-atomic operation. Also, convert the KVM_SET_CLOCK / KVM_GET_CLOCK ioctls to use the kernel time helper, these should be bootbased as well. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9396b3f2c594..4bcb120cc76a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -893,6 +893,16 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul); } +static inline u64 get_kernel_ns(void) +{ + struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); +} + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); static inline int kvm_tsc_changes_freq(void) @@ -904,18 +914,24 @@ static inline int kvm_tsc_changes_freq(void) return ret; } +static inline u64 nsec_to_cycles(u64 nsec) +{ + WARN_ON(preemptible()); + if (kvm_tsc_changes_freq()) + printk_once(KERN_WARNING + "kvm: unreliable cycle conversion on adjustable rate TSC\n"); + return (nsec * __get_cpu_var(cpu_tsc_khz)) / USEC_PER_SEC; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - struct timespec ts; spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); - 
ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); - ns = timespec_to_ns(&ts); + ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; /* @@ -931,10 +947,9 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { - u64 tsc_delta = elapsed * __get_cpu_var(cpu_tsc_khz); - tsc_delta = tsc_delta / USEC_PER_SEC; - offset += tsc_delta; - pr_debug("kvm: adjusted tsc offset by %llu\n", tsc_delta); + u64 delta = nsec_to_cycles(elapsed); + offset += delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } ns = kvm->arch.last_tsc_nsec; } @@ -951,11 +966,11 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_write_guest_time(struct kvm_vcpu *v) { - struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns; if ((!vcpu->time_page)) return 0; @@ -963,8 +978,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); + kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); local_irq_restore(flags); @@ -979,9 +993,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) } /* With all the info we got, fill in the values */ - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; - + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->hv_clock.flags = 0; /* @@ -3263,7 +3275,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3277,19 +3288,16 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + now_ns = get_kernel_ns(); delta = user_ns.clock 
- now_ns; kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; user_ns.flags = 0; -- cgit v1.2.3 From 46543ba45fc4b64ca32655efdc8d9c599b4164e2 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:26 -1000 Subject: KVM: x86: Robust TSC compensation Make the match of TSC find TSC writes that are close to each other instead of perfectly identical; this allows the compensator to also work in migration / suspend scenarios. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4bcb120cc76a..4ff0c271f125 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -928,21 +928,27 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; + s64 sdiff; spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; /* - * Special case: identical write to TSC within 5 seconds of + * Special case: close write to TSC within 5 seconds of * another CPU is interpreted as an attempt to synchronize - * (the 5 seconds is to accomodate host load / swapping). + * The 5 seconds is to accomodate host load / swapping as + * well as any reset of TSC during the boot process. * * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using kernel_ns value. + * or make a best guest using elapsed value. 
*/ - if (data == kvm->arch.last_tsc_write && elapsed < 5ULL * NSEC_PER_SEC) { + if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + elapsed < 5ULL * NSEC_PER_SEC) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); -- cgit v1.2.3 From ca84d1a24c376e0841f35db08dab7b829c8c0b1e Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:28 -1000 Subject: KVM: x86: Add clock sync request to hardware enable If there are active VCPUs which are marked as belonging to a particular hardware CPU, request a clock sync for them when enabling hardware; the TSC could be desynchronized on a newly arriving CPU, and we need to recompute the guest's system time relative to boot after a suspend event. This covers both cases. Note that it is acceptable to take the spinlock, as either no other tasks will be running and no locks held (BSP after resume), or other tasks will be guaranteed to drop the lock relatively quickly (AP on CPU_STARTING). Noting that we now get clock synchronization requests for VCPUs which are starting up (or restarting), it is tempting to attempt to remove the arch/x86/kvm/x86.c CPU hot-notifiers at this time. However, it is not correct to do so; they are required for systems with non-constant TSC, as the frequency may not be known immediately after the processor has started, until the cpufreq driver has had a chance to run and query the chipset. Updated: implement better locking semantics for hardware_enable Removed the hack of dropping and retaking the lock by adding the semantic that we always hold kvm_lock when hardware_enable is called. The one place that doesn't need to worry about it is resume, since, when resuming a frozen CPU, the spinlock won't be taken.
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 8 ++++++++ virt/kvm/kvm_main.c | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ff0c271f125..d0764a258047 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5533,7 +5533,15 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void *garbage) { + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + kvm_shared_msr_cpu_online(); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->cpu == smp_processor_id()) + kvm_request_guest_time_update(vcpu); return kvm_x86_ops->hardware_enable(garbage); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5186e728c53e..da117a6b1e2e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1961,7 +1961,9 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, case CPU_STARTING: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); + spin_lock(&kvm_lock); hardware_enable(NULL); + spin_unlock(&kvm_lock); break; } return NOTIFY_OK; @@ -2168,8 +2170,10 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state) static int kvm_resume(struct sys_device *dev) { - if (kvm_usage_count) + if (kvm_usage_count) { + WARN_ON(spin_is_locked(&kvm_lock)); hardware_enable(NULL); + } return 0; } -- cgit v1.2.3 From 347bb4448c2155eb2310923ccaa4be5677649003 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:29 -1000 Subject: x86: pvclock: Move scale_delta into common header The scale_delta function for shift / multiply with 31-bit precision moves to a common header so it can be used by both kernel and kvm module. 
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/pvclock.h | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/pvclock.c | 3 ++- 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index cd02f324aa6b..7f7e577a0e39 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! 
+#endif + + return product; +} + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427ca02af..bab3b9e6f66d 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, + shadow->tsc_shift); } /* -- cgit v1.2.3 From 1d5f066e0b63271b67eac6d3752f8aa96adcbddb Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Thu, 19 Aug 2010 22:07:30 -1000 Subject: KVM: x86: Fix a possible backwards warp of kvmclock Kernel time, which advances in discrete steps, may progress much more slowly than the TSC. As a result, when kvmclock is adjusted to a new base, the apparent time to the guest, which runs at a much higher, nsec-scaled rate based on the current TSC, may have already been observed to have a larger value (kernel_ns + scaled tsc) than the value to which we are setting it (kernel_ns + 0). We must instead compute the clock as potentially observed by the guest for kernel_ns to make sure it does not go backwards.
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5ab1c3fb34ef..789e9462668f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -339,6 +339,8 @@ struct kvm_vcpu_arch { unsigned int time_offset; struct page *time_page; u64 last_host_tsc; + u64 last_guest_tsc; + u64 last_kernel_ns; bool nmi_pending; bool nmi_injected; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0764a258047..d4d33f943d99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -55,6 +55,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -976,14 +977,15 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; - s64 kernel_ns; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; if ((!vcpu->time_page)) return 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); local_irq_restore(flags); @@ -993,13 +995,49 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) return 1; } + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. 
This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. + */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); vcpu->hw_tsc_khz = this_tsc_khz; } + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; vcpu->hv_clock.flags = 0; /* @@ -4931,6 +4969,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); -- cgit v1.2.3 From 957ed9effd80b04482cbdce8c95bdf803a656b94 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: 
Sun, 22 Aug 2010 19:12:48 +0800 Subject: KVM: MMU: prefetch ptes when intercepted guest #PF Support prefetching ptes when intercepting a guest #PF, to avoid a #PF on later access. If we meet any failure in the prefetch path, we exit it and do not try other ptes, to avoid turning it into a heavy path. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 104 ++++++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/paging_tmpl.h | 72 +++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54a50268cebf..b0037a77e56b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644); } #endif +#define PTE_PREFETCH_NUM 8 + #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 @@ -400,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 4); + rmap_desc_cache, 4 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -2089,6 +2091,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } +static struct kvm_memory_slot * +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + if (!slot) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + hva = gfn_to_hva_memslot(slot, gfn); + + return hva_to_pfn_atomic(vcpu->kvm, hva); +} + +static int direct_pte_prefetch_many(struct
kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + struct page *pages[PTE_PREFETCH_NUM]; + unsigned access = sp->role.access; + int i, ret; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + return -1; + + ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + if (ret <= 0) + return -1; + + for (i = 0; i < ret; i++, gfn++, start++) + mmu_set_spte(vcpu, start, ACC_ALL, + access, 0, 0, 1, NULL, + sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); + + return 0; +} + +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *sptep) +{ + u64 *spte, *start = NULL; + int i; + + WARN_ON(!sp->role.direct); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { + if (!start) + continue; + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + break; + start = NULL; + } else if (!start) + start = spte; + } +} + +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + + /* + * Since it's no accessed bit on EPT, it's no way to + * distinguish between actually accessed translations + * and prefetched, so disable pte prefetch if EPT is + * enabled. 
+ */ + if (!shadow_accessed_mask) + return; + + sp = page_header(__pa(sptep)); + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + __direct_pte_prefetch(vcpu, sp, sptep); +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int level, gfn_t gfn, pfn_t pfn) { @@ -2102,6 +2203,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, level, gfn, pfn, false, true); + direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51ef9097960d..872ff265c91e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -310,6 +310,77 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, return r || curr_pte != gw->ptes[level - 1]; } +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + pt_element_t gptep[PTE_PREFETCH_NUM]; + gpa_t first_pte_gpa; + int offset = 0, i; + u64 *spte; + + sp = page_header(__pa(sptep)); + + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + if (sp->role.direct) + return __direct_pte_prefetch(vcpu, sp, sptep); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + first_pte_gpa = gfn_to_gpa(sp->gfn) + + (offset + i) * sizeof(pt_element_t); + + if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep, + sizeof(gptep)) < 0) + return; + + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + pt_element_t gpte; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + bool dirty; + + if (spte == sptep) + continue; + + if (*spte != shadow_trap_nonpresent_pte) + continue; + + gpte = gptep[i]; + + if (!is_present_gpte(gpte) || + is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) { + if (!sp->unsync) + __set_spte(spte, shadow_notrap_nonpresent_pte); + continue; + } + + if (!(gpte & PT_ACCESSED_MASK)) 
+ continue; + + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + gfn = gpte_to_gfn(gpte); + dirty = is_dirty_gpte(gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + (pte_access & ACC_WRITE_MASK) && dirty); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + break; + } + + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, + pfn, true, true); + } +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -391,6 +462,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); + FNAME(pte_prefetch)(vcpu, it.sptep); return it.sptep; -- cgit v1.2.3 From 189be38db3dde12699a8b9dc22d33e8c95efe110 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sun, 22 Aug 2010 19:13:33 +0800 Subject: KVM: MMU: combine guest pte read between fetch and pte prefetch Combine guest pte read between guest pte check in the fetch path and pte prefetch Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/paging_tmpl.h | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 872ff265c91e..a4e8389df2ad 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -67,6 +67,7 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; @@ -302,21 +303,33 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { - int r; pt_element_t curr_pte; - - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level 
- 1], + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; + u64 mask; + int r, index; + + if (level == PT_PAGE_TABLE_LEVEL) { + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; + base_gpa = pte_gpa & ~mask; + index = (pte_gpa - base_gpa) / sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); + curr_pte = gw->prefetch_ptes[index]; + } else + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; } -static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep) +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + u64 *sptep) { struct kvm_mmu_page *sp; - pt_element_t gptep[PTE_PREFETCH_NUM]; - gpa_t first_pte_gpa; - int offset = 0, i; + pt_element_t *gptep = gw->prefetch_ptes; u64 *spte; + int i; sp = page_header(__pa(sptep)); @@ -327,17 +340,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep) return __direct_pte_prefetch(vcpu, sp, sptep); i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); - - if (PTTYPE == 32) - offset = sp->role.quadrant << PT64_LEVEL_BITS; - - first_pte_gpa = gfn_to_gpa(sp->gfn) + - (offset + i) * sizeof(pt_element_t); - - if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep, - sizeof(gptep)) < 0) - return; - spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { @@ -462,7 +464,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); - FNAME(pte_prefetch)(vcpu, it.sptep); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); return it.sptep; -- cgit v1.2.3 From cc4feed57fcd4934b89aaac51d64dbff921e2f2b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 25 Aug 2010 14:10:53 +0800 Subject: KVM: x86 emulator: add CALL FAR instruction emulation (opcode 9a) Signed-off-by: Wei Yongjun Signed-off-by: Marcelo 
Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 312e798d5425..1702ea8a28c6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2455,7 +2455,7 @@ static struct opcode opcode_table[256] = { X8(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - D(SrcImmFAddr | No64), N, + I(SrcImmFAddr | No64, em_call_far), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), -- cgit v1.2.3 From 6e2fb2cadd9a523ff5494d4c4d53c0d3e0024691 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Aug 2010 12:47:41 +0300 Subject: KVM: x86 emulator: Rename variable that shadows another local variable. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1702ea8a28c6..42d42ca2c37b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3421,7 +3421,7 @@ writeback: &c->dst); if (c->rep_prefix && (c->d & String)) { - struct read_cache *rc = &ctxt->decode.io_read; + struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); /* The second termination condition only applies for REPE * and REPNE. Test if the repeat string operation prefix is @@ -3441,8 +3441,8 @@ writeback: * Re-enter guest when pio read ahead buffer is empty or, * if it is not used, after each 1024 iteration. 
*/ - else if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (rc->end != 0 && rc->end == rc->pos)) { + else if ((r->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || + (r->end != 0 && r->end == r->pos)) { ctxt->restart = false; c->eip = ctxt->eip; } -- cgit v1.2.3 From 3e2f65d57a0c1897fcc3287eeb41f117f4d021e5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Aug 2010 12:47:42 +0300 Subject: KVM: x86 emulator: move string instruction completion check into separate function Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 42d42ca2c37b..3dcbc1d0a59d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2933,6 +2933,28 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } +static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + return true; + + return false; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { @@ -3423,19 +3445,8 @@ writeback: if (c->rep_prefix && (c->d & String)) { struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - /* The second termination condition only applies for REPE - * and REPNE. 
Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if (((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) - && (((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - || ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + + if (string_insn_completed(ctxt)) ctxt->restart = false; /* * Re-enter guest when pio read ahead buffer is empty or, -- cgit v1.2.3 From d2ddd1c48364e4161052d6089f06b2cf3c50496b Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 25 Aug 2010 12:47:43 +0300 Subject: KVM: x86 emulator: get rid of "restart" in emulation context. x86_emulate_insn() will return 1 if instruction can be restarted without re-entering a guest. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 4 +++- arch/x86/kvm/emulate.c | 43 +++++++++++++++++--------------------- arch/x86/kvm/x86.c | 16 +++++++------- 3 files changed, 30 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1bbf2b6f2a7e..1bf11400ae99 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -224,7 +224,6 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - bool restart; /* restart string instruction after writeback */ bool perm_ok; /* do not check permissions if true */ int exception; /* exception that happens during emulation or -1 */ @@ -255,6 +254,9 @@ struct x86_emulate_ctxt { #endif int x86_decode_insn(struct x86_emulate_ctxt *ctxt); +#define EMULATION_FAILED -1 +#define EMULATION_OK 0 +#define EMULATION_RESTART 1 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int 
emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3dcbc1d0a59d..ec35a71d8b5d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -437,7 +437,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, ctxt->exception = vec; ctxt->error_code = error; ctxt->error_code_valid = valid; - ctxt->restart = false; } static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) @@ -2633,9 +2632,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt) struct opcode opcode, *g_mod012, *g_mod3; struct operand memop = { .type = OP_NONE }; - /* we cannot decode insn before we complete previous rep insn */ - WARN_ON(ctxt->restart); - c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); @@ -2985,10 +2981,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if (c->rep_prefix && (c->d & String)) { - ctxt->restart = true; /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - ctxt->restart = false; ctxt->eip = c->eip; goto done; } @@ -3446,28 +3440,29 @@ writeback: struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - if (string_insn_completed(ctxt)) - ctxt->restart = false; - /* - * Re-enter guest when pio read ahead buffer is empty or, - * if it is not used, after each 1024 iteration. - */ - else if ((r->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (r->end != 0 && r->end == r->pos)) { - ctxt->restart = false; - c->eip = ctxt->eip; + if (!string_insn_completed(ctxt)) { + /* + * Re-enter guest when pio read ahead buffer is empty + * or, if it is not used, after each 1024 iteration. + */ + if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && + (r->end == 0 || r->end != r->pos)) { + /* + * Reset read cache. 
Usually happens before + * decode, but since instruction is restarted + * we have to do it here. + */ + ctxt->decode.mem_read.end = 0; + return EMULATION_RESTART; + } + goto done; /* skip rip writeback */ } } - /* - * reset read cache here in case string instruction is restared - * without decoding - */ - ctxt->decode.mem_read.end = 0; - if (!ctxt->restart) - ctxt->eip = c->eip; + + ctxt->eip = c->eip; done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d4d33f943d99..bc96ac9ed912 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4181,18 +4181,17 @@ int emulate_instruction(struct kvm_vcpu *vcpu, restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); - if (r) { /* emulation failed */ + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); } - r = EMULATE_DONE; - - if (vcpu->arch.emulate_ctxt.exception >= 0) + if (vcpu->arch.emulate_ctxt.exception >= 0) { inject_emulated_exception(vcpu); - else if (vcpu->arch.pio.count) { + r = EMULATE_DONE; + } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; r = EMULATE_DO_MMIO; @@ -4200,8 +4199,10 @@ restart: if (vcpu->mmio_is_write) vcpu->mmio_needed = 0; r = EMULATE_DO_MMIO; - } else if (vcpu->arch.emulate_ctxt.restart) + } else if (r == EMULATION_RESTART) goto restart; + else + r = EMULATE_DONE; toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); @@ -5100,8 +5101,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.count || vcpu->mmio_needed || - vcpu->arch.emulate_ctxt.restart) { + if (vcpu->arch.pio.count || vcpu->mmio_needed) { if (vcpu->mmio_needed) 
{ memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; -- cgit v1.2.3 From 081bca0e6b87d0c7b9ade7ffee1f44aca336a8fa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:06:15 +0300 Subject: KVM: x86 emulator: refuse SrcMemFAddr (e.g. LDS) with register operand SrcMemFAddr is not defined with the modrm operand designating a register instead of a memory address. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ec35a71d8b5d..2b9b0feabdba 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2974,6 +2974,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { + emulate_ud(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { emulate_gp(ctxt, 0); -- cgit v1.2.3 From 8d8f4e9f66ab36e4fcc75eca1e828af8466309f1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:06 +0300 Subject: KVM: x86 emulator: support byte/word opcode pairs Many x86 instructions come in byte and word variants distinguished with bit 0 of the opcode. Add macros to aid in defining them. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2b9b0feabdba..1a230b5495e0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2330,6 +2330,9 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) #define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define D2bv(_f) D((_f) | ByteOp), D(_f) +#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) + static struct opcode group1[] = { X7(D(Lock)), N }; @@ -2572,6 +2575,9 @@ static struct opcode twobyte_table[256] = { #undef GD #undef I +#undef D2bv +#undef I2bv + static unsigned imm_size(struct decode_cache *c) { unsigned size; -- cgit v1.2.3 From 5315fbb223086c078c979d16734844ccff12f087 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:07 +0300 Subject: KVM: x86 emulator: simplify ALU block (opcodes 00-3F) decode flags Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 1a230b5495e0..277e667a382f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2385,42 +2385,34 @@ static struct group_dual group9 = { { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D(ByteOp | DstMem | SrcReg | ModRM | 
Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, N, /* 0x28 - 0x2F */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImmByte), D(DstAcc | SrcImm), N, N, + D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, N, /* 0x38 - 0x3F */ - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), - D(ByteOp | DstAcc | SrcImm), D(DstAcc | SrcImm), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstReg | SrcMem | ModRM), + D2bv(DstAcc | SrcImm), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), -- cgit v1.2.3 From 48fe67b5f7f71bb954dc97b18096cef12f6618b4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:08 +0300 Subject: KVM: x86 emulator: simplify string instruction decode flags Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 277e667a382f..749322e1d957 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2429,8 +2429,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D(DstDI | ByteOp | Mov | String), D(DstDI | Mov | String), /* insb, insw/insd */ - D(SrcSI | ByteOp | ImplicitOps | String), D(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + D2bv(DstDI | Mov | String), /* insb, insw/insd */ + D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -2454,13 +2454,12 @@ static struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), - D(ByteOp | SrcSI | DstDI | Mov | String), D(SrcSI | DstDI | Mov | String), - D(ByteOp | SrcSI | DstDI | String), D(SrcSI | DstDI | String), + D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), - D(ByteOp | SrcAcc | DstDI | Mov | String), D(SrcAcc | DstDI | Mov | String), - D(ByteOp | SrcSI | DstAcc | Mov | String), D(SrcSI | DstAcc | Mov | String), - D(ByteOp | SrcAcc | DstDI | String), D(SrcAcc | DstDI | String), + D2bv(SrcAcc | DstDI | Mov | String), + D2bv(SrcSI | DstAcc | Mov | String), + D2bv(SrcAcc | DstDI | String), /* 0xB0 - 0xB7 */ X8(D(ByteOp | DstReg | SrcImm | Mov)), /* 0xB8 - 0xBF */ -- cgit v1.2.3 From 76e8e68d4435bb894a1a03be853a55a4a2b45247 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:09 +0300 Subject: KVM: x86 
emulator: simplify instruction decode flags for opcodes 80-8F Use the new byte/word dual opcode decode. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 749322e1d957..661013fdb3b6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2438,11 +2438,10 @@ static struct opcode opcode_table[256] = { G(DstMem | SrcImm | ModRM | Group, group1), G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), G(DstMem | SrcImmByte | ModRM | Group, group1), - D(ByteOp | DstMem | SrcReg | ModRM), D(DstMem | SrcReg | ModRM), - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ - D(ByteOp | DstMem | SrcReg | ModRM | Mov), D(DstMem | SrcReg | ModRM | Mov), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem | ModRM | Mov), + D2bv(DstMem | SrcReg | ModRM | Mov), + D2bv(DstReg | SrcMem | ModRM | Mov), D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ -- cgit v1.2.3 From 50748613d16f55cbf7da14bc6e92b7cb1cd4fa7d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:10 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes A0-AF Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 661013fdb3b6..d59e54bb5890 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2451,11 +2451,11 @@ static struct opcode opcode_table[256] = { I(SrcImmFAddr | No64, em_call_far), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ - D(ByteOp | DstAcc | SrcMem | Mov | MemAbs), D(DstAcc | SrcMem | Mov | MemAbs), - D(ByteOp | DstMem | SrcAcc | Mov | MemAbs), D(DstMem | SrcAcc | Mov | MemAbs), + D2bv(DstAcc | SrcMem | Mov | MemAbs), + D2bv(DstMem | SrcAcc | Mov | MemAbs), D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ - D(DstAcc | SrcImmByte | ByteOp), D(DstAcc | SrcImm), + D2bv(DstAcc | SrcImm), D2bv(SrcAcc | DstDI | Mov | String), D2bv(SrcSI | DstAcc | Mov | String), D2bv(SrcAcc | DstDI | String), -- cgit v1.2.3 From d2c6c7adb181eac5b18dbefdf24c0e6745470939 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:11 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes C0-DF Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d59e54bb5890..02566c1283f6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2464,17 +2464,16 @@ static struct opcode opcode_table[256] = { /* 0xB8 - 0xBF */ X8(D(DstReg | SrcImm | Mov)), /* 0xC0 - 0xC7 */ - D(ByteOp | DstMem | SrcImm | ModRM), D(DstMem | SrcImmByte | ModRM), + D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - D(ByteOp | DstMem | SrcImm | ModRM | Mov), D(DstMem | SrcImm | ModRM | Mov), + D2bv(DstMem | SrcImm | ModRM | Mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), /* 0xD0 - 0xD7 */ - D(ByteOp | DstMem | SrcOne | ModRM), D(DstMem | SrcOne | ModRM), - D(ByteOp | DstMem | ModRM), D(DstMem | ModRM), + D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, /* 0xD8 - 0xDF */ N, N, N, N, N, N, N, N, -- cgit v1.2.3 From d269e3961a65bbf6a76a8dc37b70cb578216e2c0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:12 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes E0-FF Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 02566c1283f6..b43572afce3c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2479,13 +2479,11 @@ static struct opcode opcode_table[256] = { N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ X4(D(SrcImmByte)), - D(ByteOp | SrcImmUByte | DstAcc), D(SrcImmUByte | DstAcc), - D(ByteOp | SrcAcc | DstImmUByte), D(SrcAcc | DstImmUByte), + D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D(SrcNone | ByteOp | DstAcc), D(SrcNone | DstAcc), - D(ByteOp | SrcAcc | ImplicitOps), D(SrcAcc | ImplicitOps), + D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), /* 0xF0 - 0xF7 */ N, N, N, N, D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), -- cgit v1.2.3 From 739ae406068211b235b488f247aab349e486c382 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:56:13 +0300 Subject: KVM: x86 emulator: simplify instruction decode flags for opcodes 0F 00-FF Use the new byte/word dual opcode decode. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b43572afce3c..58e715cb5172 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2534,7 +2534,7 @@ static struct opcode twobyte_table[256] = { D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM | Lock), D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), @@ -2544,7 +2544,7 @@ static struct opcode twobyte_table[256] = { D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ - D(ByteOp | DstMem | SrcReg | ModRM | Lock), D(DstMem | SrcReg | ModRM | Lock), + D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), N, N, N, N, N, N, N, N, -- cgit v1.2.3 From f6b3597bded9ed261b42fdcb5e741489cb5ccbfe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:59:00 +0300 Subject: KVM: x86 emulator: add macros for executing instructions that may trap Like DIV and IDIV. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 58e715cb5172..e96cce170228 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -331,6 +331,27 @@ struct group_dual { "a" (_rax), "d" (_rdx)); \ } while (0) +#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "1") \ + "1: \n\t" \ + _op _suffix " %6; " \ + "2: \n\t" \ + _POST_EFLAGS("0", "5", "1") \ + ".pushsection .fixup,\"ax\" \n\t" \ + "3: movb $1, %4 \n\t" \ + "jmp 2b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ #define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ do { \ @@ -342,6 +363,28 @@ struct group_dual { } \ } while (0) +#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "b", _ex); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "w", _ex); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "l", _ex); \ + break; \ + case 8: ON64( \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ + break; \ + } \ + } while (0) + /* Fetch next part of the instruction being emulated. 
*/ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ -- cgit v1.2.3 From 34d1f4905eb66478a890ea808ec58bc842e6e589 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 11:59:01 +0300 Subject: KVM: x86 emulator: trap and propagate #DE from DIV and IDIV Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e96cce170228..917b9b50fab0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -504,6 +504,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, TS_VECTOR, err, true); } +static int emulate_de(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, DE_VECTOR, 0, false); + return X86EMUL_PROPAGATE_FAULT; +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -1458,6 +1464,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; unsigned long *rax = &c->regs[VCPU_REGS_RAX]; unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; + u8 de = 0; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1476,14 +1483,18 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); break; case 6: /* div */ - emulate_1op_rax_rdx("div", c->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, + ctxt->eflags, de); break; case 7: /* idiv */ - emulate_1op_rax_rdx("idiv", c->src, *rax, *rdx, ctxt->eflags); + emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, + ctxt->eflags, de); break; default: return X86EMUL_UNHANDLEABLE; } + if (de) + return emulate_de(ctxt); return X86EMUL_CONTINUE; } @@ -3413,8 +3424,9 @@ special_insn: ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 
0xf7: /* Grp3 */ - if (emulate_grp3(ctxt, ops) != X86EMUL_CONTINUE) - goto cannot_emulate; + rc = emulate_grp3(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; -- cgit v1.2.3 From 217fc9cfca21a0bc2f4246183ebd8ee9863b019d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 13:38:03 +0300 Subject: KVM: Fix build error due to 64-bit division in nsec_to_cycles() Use do_div() instead. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc96ac9ed912..bdba1d09a97e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -56,6 +56,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -917,11 +918,15 @@ static inline int kvm_tsc_changes_freq(void) static inline u64 nsec_to_cycles(u64 nsec) { + u64 ret; + WARN_ON(preemptible()); if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - return (nsec * __get_cpu_var(cpu_tsc_khz)) / USEC_PER_SEC; + ret = nsec * __get_cpu_var(cpu_tsc_khz); + do_div(ret, USEC_PER_SEC); + return ret; } void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) -- cgit v1.2.3 From 6230f7fc0453c5bc5daa8e053773021e1c4a2f16 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 26 Aug 2010 18:34:55 +0300 Subject: KVM: x86 emulator: simplify ALU opcode block decode further The ALU opcode block is very regular; introduce D6ALU() to define decode flags for 6 instructions at a time. Suggested by Paolo Bonzini. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 917b9b50fab0..8bfa3e3aa71d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2387,6 +2387,11 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ + D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ + D2bv(((_f) & ~Lock) | DstAcc | SrcImm) + + static struct opcode group1[] = { X7(D(Lock)), N }; @@ -2439,35 +2444,25 @@ static struct group_dual group9 = { { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), + D6ALU(Lock), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), N, N, + D6ALU(Lock), N, N, /* 0x28 - 0x2F */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), - N, I(ByteOp | DstAcc | No64, em_das), + D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D2bv(DstMem | SrcReg | ModRM | Lock), D2bv(DstReg | SrcMem 
| ModRM), - D2bv(DstAcc | SrcImm), N, N, + D6ALU(Lock), N, N, /* 0x38 - 0x3F */ - D2bv(DstMem | SrcReg | ModRM), D2bv(DstReg | SrcMem | ModRM), - D2bv(DstAcc | SrcImm), - N, N, + D6ALU(0), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ @@ -2618,6 +2613,7 @@ static struct opcode twobyte_table[256] = { #undef D2bv #undef I2bv +#undef D6ALU static unsigned imm_size(struct decode_cache *c) { -- cgit v1.2.3 From 23e7a7944f3779155e2f6bbc831b544eb925f387 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 27 Aug 2010 17:15:06 +0800 Subject: KVM: pit: Do not check pending pit timer in vcpu thread PIT interrupt injection is done by a workqueue, so there is no need to check for a pending PIT timer in the vcpu thread, which could lead to unnecessary unblocking of the vcpu. Signed-off-by: Jason Wang Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 9 --------- arch/x86/kvm/irq.c | 7 +------ 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ddeb2314b522..2ad40a4ddc34 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -int pit_has_pending_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - - if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) - return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; -} - static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 2095a049835e..f994da40ad94 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -33,12 +33,7 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - int ret; - - ret = pit_has_pending_timer(vcpu); - ret |= apic_has_pending_timer(vcpu); - - return ret; + return apic_has_pending_timer(vcpu); } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); -- cgit v1.2.3 From
9ad17b10011702cb56c5e32e41ecd5fe281c3574 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:19:42 +0800 Subject: KVM: MMU: fix compile warning in audit code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kvm/mmu.c: In function ‘kvm_mmu_unprotect_page’: arch/x86/kvm/mmu.c:1741: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c:1745: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c: In function ‘mmu_unshadow’: arch/x86/kvm/mmu.c:1761: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c: In function ‘set_spte’: arch/x86/kvm/mmu.c:2005: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 3 has type ‘gfn_t’ arch/x86/kvm/mmu.c: In function ‘mmu_set_spte’: arch/x86/kvm/mmu.c:2033: warning: format ‘%lx’ expects type ‘long unsigned int’, but argument 7 has type ‘gfn_t’ Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b0037a77e56b..59bf1d9553a7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1738,11 +1738,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); + pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); @@ -1758,7 +1758,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap 
%lx %x\n", + pgprintk("%s: zap %llx %x\n", __func__, gfn, sp->role.word); kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -2002,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %lx, marking ro\n", + pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; @@ -2031,7 +2031,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", + " user_fault %d gfn %llx\n", __func__, *sptep, pt_access, write_fault, user_fault, gfn); @@ -2050,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %lx new %lx\n", + pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); @@ -2067,7 +2067,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? 
"2MB" : "4kB", *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, *sptep, sptep); @@ -3651,9 +3651,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) return; - printk(KERN_ERR "%s: no memslot for gfn %ld\n", + printk(KERN_ERR "%s: no memslot for gfn %llx\n", audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", + printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", audit_msg, (long int)(sptep - rev_sp->spt), rev_sp->gfn); dump_stack(); @@ -3728,7 +3728,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) while (spte) { if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %lx role %x\n", + "writable mappings: gfn %llx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); spte = rmap_next(vcpu->kvm, rmapp, spte); -- cgit v1.2.3 From 0beb8d660425aab339ff68e6f4d4528739e8fc4f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:20:47 +0800 Subject: KVM: MMU: check rmap for every spte The read-only spte also has reverse mapping, so fix the code to check them, also modify the function name to fit its doing Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 57 +++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 59bf1d9553a7..1c784b96dac3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3644,40 +3644,38 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (is_writable_pte(*sptep)) { - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no memslot for gfn %llx\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", - audit_msg, (long int)(sptep - 
rev_sp->spt), - rev_sp->gfn); - dump_stack(); - return; - } + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); - dump_stack(); - } + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn %llx\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", + audit_msg, (long int)(sptep - rev_sp->spt), + rev_sp->gfn); + dump_stack(); + return; } + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } } -void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) +void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) { mmu_spte_walk(vcpu, inspect_spte_has_rmap); } -static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) +static void check_mappings_rmap(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; int i; @@ -3689,12 +3687,9 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) continue; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!is_writable_pte(ent)) + if (!is_rmap_spte(pt[i])) continue; + inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } } @@ -3703,7 +3698,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) static void audit_rmap(struct kvm_vcpu *vcpu) { - check_writable_mappings_rmap(vcpu); + check_mappings_rmap(vcpu); count_rmaps(vcpu); } @@ -3746,7 +3741,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) audit_write_protection(vcpu); if (strcmp("pre pte write", audit_msg) != 0) audit_mappings(vcpu); - audit_writable_sptes_have_rmaps(vcpu); + audit_sptes_have_rmaps(vcpu); dbg = olddbg; } -- cgit 
v1.2.3 From bc32ce2152406431acf4daf4a81dc1664bb7b91b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:22:46 +0800 Subject: KVM: MMU: fix wrong not write protected sp report The audit code currently reports some sps as not write protected; this is just a bug in audit_write_protection(), since: - an invalid sp does not need to be write protected - it uses an uninitialized local variable ('gfn') - kvm_mmu_audit() is called outside of mmu_lock's protection Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1c784b96dac3..68575dc32ec7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3708,16 +3708,17 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *spte; - gfn_t gfn; list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { if (sp->role.direct) continue; if (sp->unsync) continue; + if (sp->role.invalid) + continue; slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a4e8389df2ad..a0f2febf5692 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -504,7 +504,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); r = mmu_topup_memory_caches(vcpu); if (r) @@ -542,6 +541,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; + + kvm_mmu_audit(vcpu, "pre page fault"); kvm_mmu_free_some_pages(vcpu); sptep =
FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); -- cgit v1.2.3 From 365fb3fdf6769d3553999d8eb6cc2a8c56c747c1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:24:13 +0800 Subject: KVM: MMU: rewrite audit_mappings_page() function There is a bug in this function: we call gfn_to_pfn() and kvm_mmu_gva_to_gpa_read() in atomic context (kvm_mmu_audit() is called under the protection of the mmu_lock spinlock). This patch fixes it by: - introducing gfn_to_pfn_atomic() and using it instead of gfn_to_pfn() - getting the mapping gfn from kvm_mmu_page_get_gfn() It also adds a check for 'notrap' ptes in unsync/direct sps Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 75 ++++++++++++++++++++++++++---------------------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 15 ++++++++-- 3 files changed, 54 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 68575dc32ec7..0d91f60af1a8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3487,15 +3487,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); static const char *audit_msg; -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; -#endif - return gva; -} - - typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, @@ -3550,39 +3541,53 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; + u64 *sptep = pt + i; + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; - if (ent == shadow_trap_nonpresent_pte) - continue; + sp = page_header(__pa(sptep)); - va = canonicalize(va); - if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) - audit_mappings_page(vcpu, ent, va, level - 1); - else { - gpa_t gpa =
kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); - gfn_t gfn = gpa >> PAGE_SHIFT; - pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); - hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", + audit_msg, sp, level); + return; + } - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - continue; + if (*sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", + audit_msg, sp); + return; } + } - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_pfn_clean(pfn); + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", + audit_msg, sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || + !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; } + + hpa = pfn << PAGE_SHIFT; + + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx pfn %llx hpa %llx ent %llxn", + audit_msg, vcpu->arch.mmu.root_level, + va, pfn, hpa, *sptep); } } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b837ec80885d..f2ecdd52032b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -300,6 +300,7 @@ void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); 
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2eb0b7500a2a..c7a57b4feb39 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -999,7 +999,7 @@ pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic) { unsigned long addr; @@ -1009,7 +1009,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, false); + return hva_to_pfn(kvm, addr, atomic); +} + +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_pfn(kvm, gfn, true); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); + +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_pfn(kvm, gfn, false); } EXPORT_SYMBOL_GPL(gfn_to_pfn); -- cgit v1.2.3 From 8e0e8afa82018a3c751ea474eb47dfc65f00f4c3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 28 Aug 2010 19:25:09 +0800 Subject: KVM: MMU: remove count_rmaps() Nothing is checked in count_rmaps(), so remove it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0d91f60af1a8..0bff4d54817e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3606,43 +3606,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu) 2); } -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_memslots *slots; - int nmaps = 0; - int i, j, k, idx; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &slots->memslots[i]; - struct 
kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->sptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - srcu_read_unlock(&kvm->srcu, idx); - return nmaps; -} - void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { unsigned long *rmapp; @@ -3704,7 +3667,6 @@ static void check_mappings_rmap(struct kvm_vcpu *vcpu) static void audit_rmap(struct kvm_vcpu *vcpu) { check_mappings_rmap(vcpu); - count_rmaps(vcpu); } static void audit_write_protection(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From c41a15dd4632499b9c1a00871e160276999767d9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Aug 2010 10:46:56 +0300 Subject: KVM: Fix pio trace direction out = write, in = read, not the other way round. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bdba1d09a97e..d0ba857cd7cf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3743,7 +3743,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, if (vcpu->arch.pio.count) goto data_avail; - trace_kvm_pio(1, port, size, 1); + trace_kvm_pio(0, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 1; @@ -3771,7 +3771,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu) { - trace_kvm_pio(0, port, size, 1); + trace_kvm_pio(1, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 0; -- cgit v1.2.3 From 678041ad9dc82eedc598f709e8a3d620139d4105 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 31 Aug 2010 19:13:13 -0300 Subject: KVM: SVM: reset mmu context in init_vmcb Since commit aad827034e419fa no mmu reinitialization 
is performed via init_vmcb. Zero vcpu->arch.cr0 and pass the reset value as a parameter to kvm_set_cr0. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff28f6521065..60bc1e53d237 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -827,8 +827,8 @@ static void init_vmcb(struct vcpu_svm *svm) * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + svm->vcpu.arch.cr0 = 0; + (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ -- cgit v1.2.3 From eaa48512ba9df32aab8be5fceec10f3f80369379 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 31 Aug 2010 19:13:14 -0300 Subject: KVM: SVM: init_vmcb should reset vcpu->efer Otherwise EFER_LMA bit is retained across a SIPI reset. Fixes guest cpu onlining. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 60bc1e53d237..a1a83b955ed7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -816,7 +816,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = EFER_SVME; + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; -- cgit v1.2.3 From e90aa41e6ca76cd7be021d4d5560e64954cd4585 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 1 Sep 2010 10:23:35 +0300 Subject: KVM: Don't save/restore MSR_IA32_PERF_STATUS It is read/only; restoring it only results in annoying messages. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0ba857cd7cf..1c972382e5d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -739,7 +739,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; -- cgit v1.2.3 From b9eac5f4d146dc6cb88c8e6d891f8abe60493338 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 3 Aug 2010 14:46:56 +0300 Subject: KVM: x86 emulator: use single stage decoding for mov instructions Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8bfa3e3aa71d..c0715ae05a54 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2378,6 +2378,13 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_mov(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + c->dst.val = c->src.val; + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } #define N D(0) #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } @@ -2489,8 +2496,8 @@ static struct opcode opcode_table[256] = { G(DstMem | SrcImmByte | ModRM | Group, group1), D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), /* 0x88 - 0x8F */ - D2bv(DstMem | SrcReg | ModRM | Mov), - D2bv(DstReg | SrcMem | ModRM | Mov), + I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ @@ 
-2500,24 +2507,25 @@ static struct opcode opcode_table[256] = { I(SrcImmFAddr | No64, em_call_far), N, D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, /* 0xA0 - 0xA7 */ - D2bv(DstAcc | SrcMem | Mov | MemAbs), - D2bv(DstMem | SrcAcc | Mov | MemAbs), - D2bv(SrcSI | DstDI | Mov | String), D2bv(SrcSI | DstDI | String), + I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), + I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(SrcSI | DstDI | Mov | String, em_mov), + D2bv(SrcSI | DstDI | String), /* 0xA8 - 0xAF */ D2bv(DstAcc | SrcImm), - D2bv(SrcAcc | DstDI | Mov | String), - D2bv(SrcSI | DstAcc | Mov | String), + I2bv(SrcAcc | DstDI | Mov | String, em_mov), + I2bv(SrcSI | DstAcc | Mov | String, em_mov), D2bv(SrcAcc | DstDI | String), /* 0xB0 - 0xB7 */ - X8(D(ByteOp | DstReg | SrcImm | Mov)), + X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ - X8(D(DstReg | SrcImm | Mov)), + X8(I(DstReg | SrcImm | Mov, em_mov)), /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - D2bv(DstMem | SrcImm | ModRM | Mov), + I2bv(DstMem | SrcImm | ModRM | Mov, em_mov), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), @@ -3212,8 +3220,6 @@ special_insn: c->dst.val = c->src.orig_val; c->lock_prefix = 1; break; - case 0x88 ... 0x8b: /* mov */ - goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { emulate_ud(ctxt); @@ -3271,22 +3277,14 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xa0 ... 0xa3: /* mov */ - case 0xa4 ... 0xa5: /* movs */ - goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xaa ... 0xab: /* stos */ - case 0xac ... 
0xad: /* lods */ - goto mov; case 0xae ... 0xaf: /* scas */ goto cmp; - case 0xb0 ... 0xbf: /* mov r, imm */ - goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; @@ -3305,10 +3303,6 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); if (rc != X86EMUL_CONTINUE) -- cgit v1.2.3 From a4d4a7c1880db98a521bc27c15348185fa30c256 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 3 Aug 2010 15:05:46 +0300 Subject: KVM: x86 emulator: fix group 11 decoding for reg != 0 These are all undefined. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c0715ae05a54..9940d1661544 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2449,6 +2449,10 @@ static struct group_dual group9 = { { N, N, N, N, N, N, N, N, } }; +static struct opcode group11[] = { + I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ D6ALU(Lock), @@ -2525,7 +2529,7 @@ static struct opcode opcode_table[256] = { I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), D(ImplicitOps | Stack), D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), - I2bv(DstMem | SrcImm | ModRM | Mov, em_mov), + G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), -- cgit v1.2.3 From 7d9ddaedd8a9d0442fda5b5a90f22a33becbd235 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Aug 2010 17:12:28 +0300 Subject: KVM: x86 emulator: clean up control flow in x86_emulate_insn() x86_emulate_insn() is full of things like if (rc != X86EMUL_CONTINUE) goto done; break; consolidate all of those at the end of the 
switch statement. Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 64 ++++++-------------------------------------------- 1 file changed, 7 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9940d1661544..27d2c22b114e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3098,8 +3098,6 @@ special_insn: break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x08 ... 0x0d: or: /* or */ @@ -3117,8 +3115,6 @@ special_insn: break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x18 ... 0x1d: sbb: /* sbb */ @@ -3129,8 +3125,6 @@ special_insn: break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x20 ... 0x25: and: /* and */ @@ -3157,18 +3151,12 @@ special_insn: case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x60: /* pusha */ rc = emulate_pusha(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -3255,8 +3243,6 @@ special_insn: } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) @@ -3278,8 +3264,6 @@ special_insn: c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. 
*/ @@ -3299,18 +3283,12 @@ special_insn: goto pop_instruction; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xc5: /* lds */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xcc: /* int3 */ irq = 3; @@ -3319,8 +3297,6 @@ special_insn: irq = c->src.val; do_interrupt: rc = emulate_int(ctxt, ops, irq); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xce: /* into */ if (ctxt->eflags & EFLG_OF) { @@ -3330,9 +3306,6 @@ special_insn: break; case 0xcf: /* iret */ rc = emulate_iret(ctxt, ops); - - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ emulate_grp2(ctxt); @@ -3419,8 +3392,6 @@ special_insn: break; case 0xf6 ... 0xf7: /* Grp3 */ rc = emulate_grp3(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; @@ -3453,8 +3424,6 @@ special_insn: case 0xfe: /* Grp4 */ grp45: rc = emulate_grp45(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xff: /* Grp5 */ if (c->modrm_reg == 5) @@ -3464,6 +3433,9 @@ special_insn: goto cannot_emulate; } + if (rc != X86EMUL_CONTINUE) + goto done; + writeback: rc = writeback(ctxt, ops); if (rc != X86EMUL_CONTINUE) @@ -3545,8 +3517,6 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; break; default: goto cannot_emulate; @@ -3585,10 +3555,6 @@ twobyte_insn: break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x06: emulate_clts(ctxt->vcpu); @@ -3665,17 +3631,9 @@ twobyte_insn: break; case 0x34: /* sysenter */ rc = emulate_sysenter(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x35: 
/* sysexit */ rc = emulate_sysexit(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; @@ -3694,8 +3652,6 @@ twobyte_insn: break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xa3: bt: /* bt */ @@ -3713,8 +3669,6 @@ twobyte_insn: break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xab: bts: /* bts */ @@ -3745,8 +3699,6 @@ twobyte_insn: break; case 0xb2: /* lss */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xb3: btr: /* btr */ @@ -3754,13 +3706,9 @@ twobyte_insn: break; case 0xb4: /* lfs */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xb5: /* lgs */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; @@ -3825,12 +3773,14 @@ twobyte_insn: break; case 0xc7: /* Grp9 (cmpxchg8b) */ rc = emulate_grp9(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; default: goto cannot_emulate; } + + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; cannot_emulate: -- cgit v1.2.3 From 9ed049c3b6230b68985da31f8243d4bec95e0b3a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 Aug 2010 12:18:24 +0300 Subject: KVM: i8259: Make ICW1 conform to spec ICW is not a full reset, instead it resets a limited number of registers in the PIC. Change ICW1 emulation to only reset those registers. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4b7b73ce2098..6e77471951e8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -308,13 +308,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - kvm_pic_reset(s); /* init */ - /* - * deassert a pending interrupt - */ - pic_irq_request(s->pics_state->kvm, 0); - s->init_state = 1; s->init4 = val & 1; + s->last_irr = 0; + s->imr = 0; + s->priority_add = 0; + s->special_mask = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; if (val & 0x02) printk(KERN_ERR "single mode not supported"); if (val & 0x08) -- cgit v1.2.3 From 84e0cefa8ddd5d5018d3b582e1e90585ed551757 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Wed, 1 Sep 2010 11:42:04 +0200 Subject: KVM: Fix guest kernel crash on MSR_K7_CLK_CTL MSR_K7_CLK_CTL is a no longer documented MSR, which is only relevant on said old AMD CPU models. This change returns the expected value, which the Linux kernel is expecting to avoid writing back the MSR, plus it ignores all writes to the MSR. Signed-off-by: Jes Sorensen Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1c972382e5d4..f47db2588a41 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1449,6 +1449,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. 
+ * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1674,6 +1684,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; -- cgit v1.2.3 From 8b1fe17cc7a8b2c62b400dcbfaebd96da6b4f58e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:22:53 +0800 Subject: KVM: MMU: support disable/enable mmu audit dynamicly Add an r/w module parameter named 'mmu_audit' that controls whether the MMU audit is enabled: enable: echo 1 > /sys/module/kvm/parameters/mmu_audit disable: echo 0 > /sys/module/kvm/parameters/mmu_audit This patch does not change the logic Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/Kconfig | 7 ++++ arch/x86/kvm/mmu.c | 91 +++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/mmutrace.h | 19 ++++++++++ arch/x86/kvm/paging_tmpl.h | 4 +- 4 files changed, 101 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 970bbd479516..ddc131ff438f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -64,6 +64,13 @@ config KVM_AMD To compile this as a module, choose M here: the module will 
be called kvm-amd. +config KVM_MMU_AUDIT + bool "Audit KVM MMU" + depends on KVM && TRACEPOINTS + ---help--- + This option adds a R/W kVM module parameter 'mmu_audit', which allows + audit KVM MMU at runtime. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0bff4d54817e..8b750ff6911a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -49,15 +49,21 @@ */ bool tdp_enabled = false; -#undef MMU_DEBUG +enum { + AUDIT_PRE_PAGE_FAULT, + AUDIT_POST_PAGE_FAULT, + AUDIT_PRE_PTE_WRITE, + AUDIT_POST_PTE_WRITE +}; -#undef AUDIT +char *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write" +}; -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif +#undef MMU_DEBUG #ifdef MMU_DEBUG @@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif -#if defined(MMU_DEBUG) || defined(AUDIT) +#ifdef MMU_DEBUG static int dbg = 0; module_param(dbg, bool, 0644); #endif @@ -2964,7 +2970,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { @@ -3037,7 +3043,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - kvm_mmu_audit(vcpu, "post pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); @@ 
-3483,8 +3489,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) } EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); -#ifdef AUDIT - +#ifdef CONFIG_KVM_MMU_AUDIT static const char *audit_msg; typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); @@ -3699,18 +3704,68 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) } } -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { - int olddbg = dbg; - - dbg = 0; - audit_msg = msg; + audit_msg = audit_point_name[audit_point]; audit_rmap(vcpu); audit_write_protection(vcpu); if (strcmp("pre pte write", audit_msg) != 0) audit_mappings(vcpu); audit_sptes_have_rmaps(vcpu); - dbg = olddbg; } +static bool mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); #endif diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3aab0f0930ef..b60b4fdb3eda 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); + +TRACE_EVENT( + kvm_mmu_audit, + 
TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), + TP_ARGS(vcpu, audit_point), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(int, audit_point) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->audit_point = audit_point; + ), + + TP_printk("vcpu:%d %s", __entry->vcpu->cpu, + audit_point_name[__entry->audit_point]) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a0f2febf5692..debe77035366 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; - kvm_mmu_audit(vcpu, "pre page fault"); + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); @@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; -- cgit v1.2.3 From 2f4f337248cd5660040b7e09b7287a7a0a861f3f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:24:10 +0800 Subject: KVM: MMU: move audit to a separate file Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 279 +------------------------------------------- arch/x86/kvm/mmu_audit.c | 297 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 298 insertions(+), 278 deletions(-) create mode 100644 arch/x86/kvm/mmu_audit.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8b750ff6911a..d2dad65a45f8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ 
-3490,282 +3490,5 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); #ifdef CONFIG_KVM_MMU_AUDIT -static const char *audit_msg; - -typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); - -static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, - inspect_spte_fn fn) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = sp->spt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(kvm, child, fn); - } else - fn(kvm, &sp->spt[i]); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - return; - } - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - } - } - return; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 *sptep = pt + i; - struct kvm_mmu_page *sp; - gfn_t gfn; - pfn_t pfn; - hpa_t hpa; - - sp = page_header(__pa(sptep)); - - if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { - printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", - audit_msg, sp, level); - return; - } - - if (*sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", - audit_msg, sp); - return; - } - } - - if 
(sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", - audit_msg, sp); - return; - } - - if (!is_shadow_present_pte(*sptep) || - !is_last_spte(*sptep, level)) - return; - - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - - hpa = pfn << PAGE_SHIFT; - - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx pfn %llx hpa %llx ent %llxn", - audit_msg, vcpu->arch.mmu.root_level, - va, pfn, hpa, *sptep); - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - unsigned long *rmapp; - struct kvm_mmu_page *rev_sp; - gfn_t gfn; - - - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no memslot for gfn %llx\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); - dump_stack(); - return; - } - - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); - dump_stack(); - } -} - -void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, inspect_spte_has_rmap); -} - -static void check_mappings_rmap(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - 
u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(pt[i])) - continue; - - inspect_spte_has_rmap(vcpu->kvm, &pt[i]); - } - } - return; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - check_mappings_rmap(vcpu); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct kvm_memory_slot *slot; - unsigned long *rmapp; - u64 *spte; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.direct) - continue; - if (sp->unsync) - continue; - if (sp->role.invalid) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - - spte = rmap_next(vcpu->kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %llx role %x\n", - __func__, audit_msg, sp->gfn, - sp->role.word); - spte = rmap_next(vcpu->kvm, rmapp, spte); - } - } -} - -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) -{ - audit_msg = audit_point_name[audit_point]; - audit_rmap(vcpu); - audit_write_protection(vcpu); - if (strcmp("pre pte write", audit_msg) != 0) - audit_mappings(vcpu); - audit_sptes_have_rmaps(vcpu); -} - -static bool mmu_audit; - -static void mmu_audit_enable(void) -{ - int ret; - - if (mmu_audit) - return; - - ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - WARN_ON(ret); - - mmu_audit = true; -} - -static void mmu_audit_disable(void) -{ - if (!mmu_audit) - return; - - unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); - tracepoint_synchronize_unregister(); - mmu_audit = false; -} - -static int mmu_audit_set(const char *val, const struct kernel_param *kp) -{ - int ret; - unsigned long enable; - - ret = strict_strtoul(val, 10, &enable); - if (ret < 0) - return -EINVAL; - - switch (enable) { - case 0: - mmu_audit_disable(); - break; - case 1: - 
mmu_audit_enable(); - break; - default: - return -EINVAL; - } - - return 0; -} - -static struct kernel_param_ops audit_param_ops = { - .set = mmu_audit_set, - .get = param_get_bool, -}; - -module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); +#include "mmu_audit.c" #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c new file mode 100644 index 000000000000..fb8a461333c5 --- /dev/null +++ b/arch/x86/kvm/mmu_audit.c @@ -0,0 +1,297 @@ +/* + * mmu_audit.c: + * + * Audit code for KVM MMU + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * Authors: + * Yaniv Kamay + * Avi Kivity + * Marcelo Tosatti + * Xiao Guangrong + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +static const char *audit_msg; + +typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); + +static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, + inspect_spte_fn fn) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + if (!is_last_spte(ent, sp->role.level)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(kvm, child, fn); + } else + fn(kvm, &sp->spt[i]); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu->kvm, sp, fn); + } + } + return; +} + +static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, + 
gva_t va, int level) +{ + u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); + int i; + gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { + u64 *sptep = pt + i; + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", + audit_msg, sp, level); + return; + } + + if (*sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", + audit_msg, sp); + return; + } + } + + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", + audit_msg, sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || + !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } + + hpa = pfn << PAGE_SHIFT; + + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + " gva %lx pfn %llx hpa %llx ent %llxn", + audit_msg, vcpu->arch.mmu.root_level, + va, pfn, hpa, *sptep); + } +} + +static void audit_mappings(struct kvm_vcpu *vcpu) +{ + unsigned i; + + if (vcpu->arch.mmu.root_level == 4) + audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); + else + for (i = 0; i < 4; ++i) + if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) + audit_mappings_page(vcpu, + vcpu->arch.mmu.pae_root[i], + i << 30, + 2); +} + +void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no memslot for gfn 
%llx\n", + audit_msg, gfn); + printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", + audit_msg, (long int)(sptep - rev_sp->spt), + rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + printk(KERN_ERR "%s: no rmap for writable spte %llx\n", + audit_msg, *sptep); + dump_stack(); + } +} + +void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, inspect_spte_has_rmap); +} + +static void check_mappings_rmap(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + int i; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + u64 *pt = sp->spt; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(pt[i])) + continue; + + inspect_spte_has_rmap(vcpu->kvm, &pt[i]); + } + } + return; +} + +static void audit_rmap(struct kvm_vcpu *vcpu) +{ + check_mappings_rmap(vcpu); +} + +static void audit_write_protection(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + struct kvm_memory_slot *slot; + unsigned long *rmapp; + u64 *spte; + + list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { + if (sp->role.direct) + continue; + if (sp->unsync) + continue; + if (sp->role.invalid) + continue; + + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + + spte = rmap_next(vcpu->kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + printk(KERN_ERR "%s: (%s) shadow page has " + "writable mappings: gfn %llx role %x\n", + __func__, audit_msg, sp->gfn, + sp->role.word); + spte = rmap_next(vcpu->kvm, rmapp, spte); + } + } +} + +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) +{ + audit_msg = audit_point_name[audit_point]; + audit_rmap(vcpu); + audit_write_protection(vcpu); + if (strcmp("pre pte write", audit_msg) != 0) + audit_mappings(vcpu); + audit_sptes_have_rmaps(vcpu); +} + +static bool 
mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); -- cgit v1.2.3 From 49edf87806f52a005152beaed9f4731862efc8fe Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:25:03 +0800 Subject: KVM: MMU: improve active sp audit Both audit_rmap() and audit_write_protection() need to walk all active sp, so we can do these checking in a sp walking Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 74 +++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fb8a461333c5..8becb86cd348 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -65,6 +65,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) return; } +typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); + +static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) +{ + struct kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) + fn(kvm, sp); +} + static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 
page_pte, gva_t va, int level) { @@ -175,67 +185,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) mmu_spte_walk(vcpu, inspect_spte_has_rmap); } -static void check_mappings_rmap(struct kvm_vcpu *vcpu) +static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { - struct kvm_mmu_page *sp; int i; - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + return; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(sp->spt[i])) continue; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_rmap_spte(pt[i])) - continue; - - inspect_spte_has_rmap(vcpu->kvm, &pt[i]); - } + inspect_spte_has_rmap(kvm, sp->spt + i); } - return; } -static void audit_rmap(struct kvm_vcpu *vcpu) +void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { - check_mappings_rmap(vcpu); -} - -static void audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; struct kvm_memory_slot *slot; unsigned long *rmapp; u64 *spte; - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.direct) - continue; - if (sp->unsync) - continue; - if (sp->role.invalid) - continue; + if (sp->role.direct || sp->unsync || sp->role.invalid) + return; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + slot = gfn_to_memslot(kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(vcpu->kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %llx role %x\n", __func__, audit_msg, sp->gfn, sp->role.word); - spte = rmap_next(vcpu->kvm, rmapp, spte); - } + spte = rmap_next(kvm, rmapp, spte); } } +static void audit_sp(struct kvm *kvm, 
struct kvm_mmu_page *sp) +{ + check_mappings_rmap(kvm, sp); + audit_write_protection(kvm, sp); +} + +static void audit_all_active_sps(struct kvm *kvm) +{ + walk_all_active_sps(kvm, audit_sp); +} + static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { audit_msg = audit_point_name[audit_point]; - audit_rmap(vcpu); - audit_write_protection(vcpu); + audit_all_active_sps(vcpu->kvm); if (strcmp("pre pte write", audit_msg) != 0) audit_mappings(vcpu); audit_sptes_have_rmaps(vcpu); -- cgit v1.2.3 From eb2591865a234c6fb1162085d9b277236fa890b6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:25:51 +0800 Subject: KVM: MMU: improve spte audit Both audit_mappings() and audit_sptes_have_rmaps() need to walk vcpu's page table, so we can do these checking in a spte walking Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 148 ++++++++++++++++++++++------------------------- 1 file changed, 69 insertions(+), 79 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 8becb86cd348..3bde186409bf 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,23 +19,24 @@ static const char *audit_msg; -typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); +typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); -static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, - inspect_spte_fn fn) +static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + inspect_spte_fn fn, int level) { int i; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = sp->spt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(kvm, child, fn); - } else - fn(kvm, &sp->spt[i]); + u64 *ent = sp->spt; + + fn(vcpu, ent + i, level); + + if (is_shadow_present_pte(ent[i]) && + 
!is_last_spte(ent[i], level)) { + struct kvm_mmu_page *child; + + child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(vcpu, child, fn, level - 1); } } } @@ -47,21 +48,25 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); + __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); return; } + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); + __mmu_spte_walk(vcpu, sp, fn, 2); } } + return; } @@ -75,80 +80,55 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) fn(kvm, sp); } -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) +static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 *sptep = pt + i; - struct kvm_mmu_page *sp; - gfn_t gfn; - pfn_t pfn; - hpa_t hpa; - - sp = page_header(__pa(sptep)); - - if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { - printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", - audit_msg, sp, level); - return; - } - - if (*sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", - audit_msg, sp); - return; - } - } + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; - if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", - audit_msg, sp); + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + 
printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", + audit_msg, sp, level); return; } - if (!is_shadow_present_pte(*sptep) || - !is_last_spte(*sptep, level)) + if (*sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", + audit_msg, sp); return; + } + } - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", + audit_msg, sp); + return; + } - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } + if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) + return; - hpa = pfn << PAGE_SHIFT; + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); - if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx pfn %llx hpa %llx ent %llxn", - audit_msg, vcpu->arch.mmu.root_level, - va, pfn, hpa, *sptep); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; } -} -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); + hpa = pfn << PAGE_SHIFT; + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + printk(KERN_ERR "xx audit error: (%s) levels %d" + "pfn %llx hpa %llx ent %llxn", + audit_msg, vcpu->arch.mmu.root_level, + pfn, hpa, *sptep); } -void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { unsigned long *rmapp; struct kvm_mmu_page *rev_sp; @@ -180,9 +160,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) } } -void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu) +static 
void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - mmu_spte_walk(vcpu, inspect_spte_has_rmap); + if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) + inspect_spte_has_rmap(vcpu->kvm, sptep); } static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -234,13 +215,22 @@ static void audit_all_active_sps(struct kvm *kvm) walk_all_active_sps(kvm, audit_sp); } +static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + audit_sptes_have_rmaps(vcpu, sptep, level); + audit_mappings(vcpu, sptep, level); +} + +static void audit_vcpu_spte(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, audit_spte); +} + static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { audit_msg = audit_point_name[audit_point]; audit_all_active_sps(vcpu->kvm); - if (strcmp("pre pte write", audit_msg) != 0) - audit_mappings(vcpu); - audit_sptes_have_rmaps(vcpu); + audit_vcpu_spte(vcpu); } static bool mmu_audit; -- cgit v1.2.3 From 30644b902c5eef5328d37a2e15f1921aaca2588b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 30 Aug 2010 18:26:33 +0800 Subject: KVM: MMU: lower the audit frequency The audit has very high overhead, so we need to lower its frequency to ensure the guest keeps running.
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 3bde186409bf..bd2b1be7066e 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -17,6 +17,8 @@ * */ +#include <linux/ratelimit.h> + static const char *audit_msg; typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); @@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) { + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!__ratelimit(&ratelimit_state)) + return; + audit_msg = audit_point_name[audit_point]; audit_all_active_sps(vcpu->kvm); audit_vcpu_spte(vcpu); -- cgit v1.2.3 From f87f928882d080eaec8b0d76aecff003d664697d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Sep 2010 17:29:45 +0200 Subject: KVM: MMU: Fix 32 bit legacy paging with NPT This patch fixes 32 bit legacy paging with NPT enabled. The mmu_check_root call on the top-level of the loop causes root_gfn to take values (in the tdp_enabled path) which are outside of guest memory. So the mmu_check_root call fails at some point in the loop iteration causing the guest to triple-fault. This patch changes the mmu_check_root calls to the places where they are really necessary. As a side-effect it introduces a check for the root of a pae page table too.
Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2dad65a45f8..b2136f921d7e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2387,6 +2387,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return 0; } direct = !is_paging(vcpu); + + if (mmu_check_root(vcpu, root_gfn)) + return 1; + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2398,10 +2402,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) continue; } root_gfn = pdptr >> PAGE_SHIFT; + if (mmu_check_root(vcpu, root_gfn)) + return 1; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; - if (mmu_check_root(vcpu, root_gfn)) - return 1; if (tdp_enabled) { direct = 1; root_gfn = i << 30; -- cgit v1.2.3 From cda0008299a06f0d7218c6037c3c02d7a865e954 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 2 Sep 2010 17:29:46 +0200 Subject: KVM: SVM: Restore correct registers after sel_cr0 intercept emulation This patch implements restoring of the correct rip, rsp, and rax after the svm emulation in KVM injected a selective_cr0 write intercept into the guest hypervisor. The problem was that the vmexit is emulated in the instruction emulation which later commits the registers right after the write-cr0 instruction. So the l1 guest will continue to run with the l2 rip, rsp and rax resulting in unpredictable behavior. This patch is not the final word, it is just an easy patch to fix the issue. The real fix will be done when the instruction emulator is made aware of nested virtualization. Until this is done this patch fixes the issue and provides an easy way to fix this in -stable too. 
Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index a1a83b955ed7..07655345f50b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -88,6 +88,14 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; + /* + * If we vmexit during an instruction emulation we need this to restore + * the l1 guest rip after the emulation + */ + unsigned long vmexit_rip; + unsigned long vmexit_rsp; + unsigned long vmexit_rax; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -1213,8 +1221,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (old == new) { /* cr0 write with ts and mp unchanged */ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { + svm->nested.vmexit_rip = kvm_rip_read(vcpu); + svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); return; + } } } @@ -2430,6 +2442,23 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } +static int cr0_write_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int r; + + r = emulate_instruction(&svm->vcpu, 0, 0, 0); + + if (svm->nested.vmexit_rip) { + kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); + kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); + kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); + svm->nested.vmexit_rip = 0; + } + + return r == EMULATE_DONE; +} + static int cr8_write_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; @@ -2692,7 
+2721,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr0_write_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, -- cgit v1.2.3 From b75f4eb34122b60ee4f07ec89973d1589002c68a Mon Sep 17 00:00:00 2001 From: "Roedel, Joerg" Date: Fri, 3 Sep 2010 14:21:40 +0200 Subject: KVM: SVM: Clean up rip handling in vmrun emulation This patch changes the rip handling in the vmrun emulation path from using next_rip to the generic kvm register access functions. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 07655345f50b..fcbc491e1f87 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2069,7 +2069,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) return false; } - trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, @@ -2098,7 +2098,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; - hsave->save.rip = svm->next_rip; + hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; if (npt_enabled) @@ -2270,8 +2270,8 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + /* Save rip after vmrun 
instruction */ + kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); if (!nested_svm_vmrun(svm)) return 1; -- cgit v1.2.3 From b9a52c4b78ec254ee00cce47d75efd89b09f13dd Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 9 Sep 2010 12:06:45 +0200 Subject: x86: Define MSR_EBC_FREQUENCY_ID Signed-off-by: Jes Sorensen Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 986f7790fdb2..83c4bb1d917d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -198,6 +198,7 @@ #define MSR_IA32_TSC 0x00000010 #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define FEATURE_CONTROL_LOCKED (1<<0) -- cgit v1.2.3 From 7b91409822ed37f2a58974e49498bdbe92ddd93c Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Thu, 9 Sep 2010 12:06:46 +0200 Subject: KVM: x86: Emulate MSR_EBC_FREQUENCY_ID Some operating systems store data about the host processor at the time of installation, and when booted on a more uptodate cpu tries to read MSR_EBC_FREQUENCY_ID. This has been found with XP. Signed-off-by: Jes Sorensen Reviewed-by: Juan Quintela Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f47db2588a41..9d434777154d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1651,6 +1651,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case 0xcd: /* fsb frequency */ data = 3; break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. 
Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; -- cgit v1.2.3 From 957446afce22df9a42b9482fcd55985f4037fe66 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:38 +0200 Subject: KVM: MMU: Check for root_level instead of long mode The walk_addr function checks for !is_long_mode in its 64 bit version. But what is meant here is a check for pae paging. Change the condition to really check for pae paging so that it also works with nested nested paging. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index debe77035366..e4ad3dc84df3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -132,7 +132,7 @@ walk: walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 - if (!is_long_mode(vcpu)) { + if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { @@ -205,7 +205,7 @@ walk: (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && is_large_pte(pte) && - is_long_mode(vcpu))) { + vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; walker->gfn = gpte_to_gfn_lvl(pte, lvl); -- cgit v1.2.3 From c5a78f2b649ae75ae788e7622ca5a586af2cb35a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:39 +0200 Subject: KVM: MMU: Make tdp_enabled a mmu-context parameter This patch changes the tdp_enabled flag from its global meaning to the mmu-context and renames it to direct_map 
there. This is necessary for Nested SVM with emulation of Nested Paging where we need an extra MMU context to shadow the Nested Nested Page Table. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 789e9462668f..80ef28bddcc3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -249,6 +249,7 @@ struct kvm_mmu { int root_level; int shadow_root_level; union kvm_mmu_page_role base_role; + bool direct_map; u64 *pae_root; u64 rsvd_bits_mask[2][4]; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b2136f921d7e..5c28e979d730 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1448,7 +1448,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu.direct_map + && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -1973,7 +1974,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_user_mask; if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; - if (tdp_enabled) + if (vcpu->arch.mmu.direct_map) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); @@ -1983,8 +1984,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (!tdp_enabled && write_fault && !is_write_protection(vcpu) - && !user_fault)) { + || (!vcpu->arch.mmu.direct_map && write_fault + && !is_write_protection(vcpu) && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && 
has_wrprotected_page(vcpu->kvm, gfn, level)) { @@ -1995,7 +1996,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_WRITABLE_MASK; - if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) + if (!vcpu->arch.mmu.direct_map + && !(pte_access & ACC_WRITE_MASK)) spte &= ~PT_USER_MASK; /* @@ -2371,7 +2373,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (mmu_check_root(vcpu, root_gfn)) return 1; - if (tdp_enabled) { + if (vcpu->arch.mmu.direct_map) { direct = 1; root_gfn = 0; } @@ -2406,7 +2408,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return 1; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; - if (tdp_enabled) { + if (vcpu->arch.mmu.direct_map) { direct = 1; root_gfn = i << 30; } @@ -2544,6 +2546,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = true; return 0; } @@ -2663,6 +2666,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } @@ -2687,6 +2691,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } @@ -2708,6 +2713,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; + context->direct_map = true; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -3060,7 +3066,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (tdp_enabled) + if (vcpu->arch.mmu.direct_map) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); -- cgit v1.2.3 From 
f43addd46168110d572dcf69100cb215a4e9fd08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:40 +0200 Subject: KVM: MMU: Make set_cr3 a function pointer in kvm_mmu This is necessary to implement Nested Nested Paging. As a side effect this allows some cleanups in the SVM nested paging code. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80ef28bddcc3..53cedede88fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -236,6 +236,7 @@ struct kvm_pio_request { */ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c28e979d730..c8acb9609ca4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2714,6 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; + context->set_cr3 = kvm_x86_ops->set_cr3; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2752,7 +2753,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) r = paging32_init_context(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; return r; } @@ -2796,7 +2798,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ - kvm_x86_ops->set_cr3(vcpu, 
vcpu->arch.mmu.root_hpa); + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); out: return r; } -- cgit v1.2.3 From 1c97f0a04c74196880f22a563134c8f6d0b9d752 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:41 +0200 Subject: KVM: X86: Introduce a tdp_set_cr3 function This patch introduces a special set_tdp_cr3 function pointer in kvm_x86_ops which is only used for tdp enabled mmu contexts. This allows some hacks to be removed from svm code. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 23 ++++++++++++++--------- arch/x86/kvm/vmx.c | 2 ++ 4 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 53cedede88fa..81a51473f745 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -526,6 +526,8 @@ struct kvm_x86_ops { bool (*rdtscp_supported)(void); void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); bool (*has_wbinvd_exit)(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c8acb9609ca4..a55f8d5a7985 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2714,7 +2714,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; - context->set_cr3 = kvm_x86_ops->set_cr3; + context->set_cr3 = kvm_x86_ops->set_tdp_cr3; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fcbc491e1f87..53c9039583fd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3216,9 +3216,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) savesegment(gs, gs_selector); ldt_selector =
kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; - /* required for live migration with NPT */ - if (npt_enabled) - svm->vmcb->save.cr3 = vcpu->arch.cr3; clgi(); @@ -3340,16 +3337,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - if (npt_enabled) { - svm->vmcb->control.nested_cr3 = root; - force_new_asid(vcpu); - return; - } - svm->vmcb->save.cr3 = root; force_new_asid(vcpu); } +static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + + /* Also sync guest cr3 here in case we live migrate */ + svm->vmcb->save.cr3 = vcpu->arch.cr3; + + force_new_asid(vcpu); +} + static int is_disabled(void) { u64 vm_cr; @@ -3576,6 +3579,8 @@ static struct kvm_x86_ops svm_x86_ops = { .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, + + .set_tdp_cr3 = set_tdp_cr3, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 275a81d571cf..ff7a8d48fd24 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4341,6 +4341,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, + + .set_tdp_cr3 = vmx_set_cr3, }; static int __init vmx_init(void) -- cgit v1.2.3 From 5777ed340d89cdc6c76a5c552337a3861b40a806 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:42 +0200 Subject: KVM: MMU: Introduce get_cr3 function pointer This function pointer in the MMU context is required to implement Nested Nested Paging. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 9 ++++++++- arch/x86/kvm/paging_tmpl.h | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 81a51473f745..6c97b8debfa8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -237,6 +237,7 @@ struct kvm_pio_request { struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a55f8d5a7985..e4a7de4c8c77 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2365,7 +2365,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) int direct = 0; u64 pdptr; - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; @@ -2562,6 +2562,11 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static unsigned long get_cr3(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr3; +} + static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) @@ -2715,6 +2720,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_hpa = INVALID_PAGE; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; + context->get_cr3 = get_cr3; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2755,6 +2761,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.set_cr3 = 
kvm_x86_ops->set_cr3; + vcpu->arch.mmu.get_cr3 = get_cr3; return r; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e4ad3dc84df3..13d0c06b1bc8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -130,7 +130,7 @@ walk: present = true; eperm = rsvd_fault = false; walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; + pte = vcpu->arch.mmu.get_cr3(vcpu); #if PTTYPE == 64 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); @@ -143,7 +143,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); + (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; -- cgit v1.2.3 From cb659db8a7d1ed558898f533a957dfc342f9499d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:43 +0200 Subject: KVM: MMU: Introduce inject_page_fault function pointer This patch introduces an inject_page_fault function pointer into struct kvm_mmu which will be used to inject a page fault. This will be used later when Nested Nested Paging is implemented. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/mmu.c | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6c97b8debfa8..009a4a1b370e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -239,6 +239,9 @@ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*inject_page_fault)(struct kvm_vcpu *vcpu, + unsigned long addr, + u32 error_code); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4a7de4c8c77..a751dfc8526d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2571,7 +2571,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, u64 addr, u32 err_code) { - kvm_inject_page_fault(vcpu, addr, err_code); + vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code); } static void paging_free(struct kvm_vcpu *vcpu) @@ -2721,6 +2721,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; + context->inject_page_fault = kvm_inject_page_fault; if (!is_paging(vcpu)) { context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2762,6 +2763,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.mmu.get_cr3 = get_cr3; + vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault; return r; } -- cgit v1.2.3 From 52fde8df7dd13d90f5f8dc43157418bff968d90a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:44 +0200 Subject: KVM: MMU: Introduce kvm_init_shadow_mmu helper function 
Some logic of the init_kvm_softmmu function is required to build the Nested Nested Paging context. So factor the required logic into a separate function and export it. Also make the whole init path suitable for more than one mmu context. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 60 +++++++++++++++++++++++++++++++----------------------- arch/x86/kvm/mmu.h | 1 + 2 files changed, 36 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a751dfc8526d..9e48a774fceb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2532,10 +2532,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static int nonpaging_init_context(struct kvm_vcpu *vcpu) +static int nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2595,9 +2594,10 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; @@ -2656,9 +2656,11 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) } } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +static int paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, context, level); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -2675,17 +2677,17 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) return 0; } -static int
paging64_init_context(struct kvm_vcpu *vcpu) +static int paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu) +static int paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2700,10 +2702,10 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu) +static int paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) @@ -2727,15 +2729,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2743,24 
+2745,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return 0; } -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { int r; - ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu); + r = paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu); + r = paging32_init_context(vcpu, context); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.mmu.get_cr3 = get_cr3; vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index f05a03dfba4e..7086ca85d3e7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,6 +49,7 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { -- cgit v1.2.3 From 3241f22da85d26505b39f525a88f52ebd1235975 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:45 +0200 Subject: KVM: MMU: Let is_rsvd_bits_set take mmu context instead of vcpu This patch changes is_rsvd_bits_set() function prototype to take only a kvm_mmu context instead of a full vcpu. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 +++--- arch/x86/kvm/paging_tmpl.h | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9e48a774fceb..86f7557cf3fb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2578,12 +2578,12 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) { int bit7; bit7 = (gpte >> 7) & 1; - return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } #define PTTYPE 64 @@ -2859,7 +2859,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } - if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) return; ++vcpu->kvm->stat.mmu_pte_updated; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 13d0c06b1bc8..68ee1b7fa89f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -168,7 +168,7 @@ walk: break; } - if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { rsvd_fault = true; break; } @@ -327,6 +327,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *sptep) { struct kvm_mmu_page *sp; + struct kvm_mmu *mmu = &vcpu->arch.mmu; pt_element_t *gptep = gw->prefetch_ptes; u64 *spte; int i; @@ -358,7 +359,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, gpte = gptep[i]; if (!is_present_gpte(gpte) || - is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) { + is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { if (!sp->unsync) __set_spte(spte, shadow_notrap_nonpresent_pte); continue; @@ -713,7 +714,7 @@ static int 
FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) || gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; -- cgit v1.2.3 From 8df25a328a6ca3bd0f048278f4d5ae0a1f6fadc1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:46 +0200 Subject: KVM: MMU: Track page fault data in struct vcpu This patch introduces a struct with two new fields in vcpu_arch for x86: * fault.address * fault.error_code This will be used to correctly propagate page faults back into the guest when we could have either an ordinary page fault or a nested page fault. In the case of a nested page fault the fault-address is different from the original address that should be walked. So we need to keep track of the real fault-address. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 - arch/x86/include/asm/kvm_host.h | 17 ++++++++++++----- arch/x86/kvm/emulate.c | 30 ++++++++++++++---------------- arch/x86/kvm/mmu.c | 6 ++---- arch/x86/kvm/paging_tmpl.h | 6 +++++- arch/x86/kvm/x86.c | 9 +++++---- 6 files changed, 38 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1bf11400ae99..5187dd88019b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -229,7 +229,6 @@ struct x86_emulate_ctxt { int exception; /* exception that happens during emulation or -1 */ u32 error_code; /* error code for exception */ bool error_code_valid; - unsigned long cr2; /* faulted address in case of #PF */ /* decode cache */ struct decode_cache decode; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 009a4a1b370e..3fde5b322534 100644 --- a/arch/x86/include/asm/kvm_host.h +++
b/arch/x86/include/asm/kvm_host.h @@ -239,9 +239,7 @@ struct kvm_mmu { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); - void (*inject_page_fault)(struct kvm_vcpu *vcpu, - unsigned long addr, - u32 error_code); + void (*inject_page_fault)(struct kvm_vcpu *vcpu); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); @@ -288,6 +286,16 @@ struct kvm_vcpu_arch { bool tpr_access_reporting; struct kvm_mmu mmu; + + /* + * This struct is filled with the necessary information to propagate a + * page fault into the guest + */ + struct { + u64 address; + unsigned error_code; + } fault; + /* only needed in kvm_pv_mmu_op() path, but it's hot so * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; @@ -624,8 +632,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 27d2c22b114e..2b08b78b6cab 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -487,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, GP_VECTOR, err, true); } -static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, - int err) +static void emulate_pf(struct x86_emulate_ctxt *ctxt) { - ctxt->cr2 = addr; - emulate_exception(ctxt, PF_VECTOR, err, true); + 
emulate_exception(ctxt, PF_VECTOR, 0, true); } static void emulate_ud(struct x86_emulate_ctxt *ctxt) @@ -834,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -921,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -947,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -1117,7 +1115,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, c->dst.addr.mem, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1939,7 +1937,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -1949,7 +1947,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -1957,7 +1955,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -1970,7 +1968,7 @@ static int task_switch_16(struct 
x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } @@ -2081,7 +2079,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2091,7 +2089,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2099,7 +2097,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2112,7 +2110,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 86f7557cf3fb..99367274b97c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2566,11 +2566,9 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) return vcpu->arch.cr3; } -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) +static void inject_page_fault(struct kvm_vcpu *vcpu) { - vcpu->arch.mmu.inject_page_fault(vcpu, addr, err_code); + vcpu->arch.mmu.inject_page_fault(vcpu); } static void paging_free(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 68ee1b7fa89f..d07f48a06f09 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -258,6 +258,10 @@ error: walker->error_code |= PFERR_FETCH_MASK; if 
(rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + + vcpu->arch.fault.address = addr; + vcpu->arch.fault.error_code = walker->error_code; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } @@ -521,7 +525,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - inject_page_fault(vcpu, addr, walker.error_code); + inject_page_fault(vcpu); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d434777154d..48b74d2fbfb7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -329,11 +329,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu) { + unsigned error_code = vcpu->arch.fault.error_code; + ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = addr; + vcpu->arch.cr2 = vcpu->arch.fault.address; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -4080,7 +4081,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception == PF_VECTOR) - kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + kvm_inject_page_fault(vcpu); else if (ctxt->error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); else -- cgit v1.2.3 From 1e301feb079e8ee6091bb75283e960fc33059a68 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:47 +0200 Subject: KVM: MMU: Introduce generic walk_addr function This is the first patch in the series towards a generic walk_addr implementation which could walk two-dimensional page tables in the end. In this first step the walk_addr function is renamed into walk_addr_generic which takes a mmu context as an additional parameter. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index d07f48a06f09..a704a8130e44 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -114,9 +114,10 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) /* * Fetch a guest pte for a guest virtual address */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) +static int FNAME(walk_addr_generic)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t addr, int write_fault, + int user_fault, int fetch_fault) { pt_element_t pte; gfn_t table_gfn; @@ -129,10 +130,11 @@ static int FNAME(walk_addr)(struct guest_walker *walker, walk: present = true; eperm = rsvd_fault = false; - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.mmu.get_cr3(vcpu); + walker->level = mmu->root_level; + pte = mmu->get_cr3(vcpu); + #if PTTYPE == 64 - if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { + if (walker->level == PT32E_ROOT_LEVEL) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { @@ -143,7 +145,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->arch.mmu.get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; @@ -205,7 +207,7 @@ walk: (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && is_large_pte(pte) && - vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL)) { + mmu->root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; walker->gfn = gpte_to_gfn_lvl(pte, lvl); @@ -266,6 +268,14 @@ error: return 0; } +static int FNAME(walk_addr)(struct guest_walker *walker, 
+ struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, int fetch_fault) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + write_fault, user_fault, fetch_fault); +} + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { -- cgit v1.2.3 From c30a358d33e0e111f06e54a4a4125371e6b6693c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:48 +0200 Subject: KVM: MMU: Add infrastructure for two-level page walker This patch introduces a mmu-callback to translate gpa addresses in the walk_addr code. This is later used to translate l2_gpa addresses into l1_gpa addresses. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3fde5b322534..4915b7c8f2ec 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -243,6 +243,7 @@ struct kvm_mmu { void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 48b74d2fbfb7..2364c2cad891 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3448,6 +3448,11 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; @@ -5659,6 +5664,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.emulate_ctxt.ops = &emulate_ops; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f2ecdd52032b..917e68ff5ed2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -534,6 +534,11 @@ static inline gpa_t gfn_to_gpa(gfn_t gfn) return (gpa_t)gfn << PAGE_SHIFT; } +static inline gfn_t gpa_to_gfn(gpa_t gpa) +{ + return (gfn_t)(gpa >> PAGE_SHIFT); +} + static inline hpa_t pfn_to_hpa(pfn_t pfn) { return (hpa_t)pfn << PAGE_SHIFT; -- cgit v1.2.3 From 14dfe855f978181cd611ec018e5ceba860a98545 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:49 +0200 Subject: KVM: X86: Introduce pointer to mmu context used for gva_to_gpa This patch introduces the walk_mmu pointer which points to the mmu-context currently used for gva_to_gpa translations. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 +++++++++++++ arch/x86/kvm/mmu.c | 10 +++++----- arch/x86/kvm/x86.c | 17 ++++++++++------- 3 files changed, 28 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4915b7c8f2ec..1b3eb8a0a1bc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -286,8 +286,21 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; bool tpr_access_reporting; + /* + * Paging state of the vcpu + * + * If the vcpu runs in guest mode with two level paging this still saves + * the paging mode of the l1 guest. This context is always used to + * handle faults. + */ struct kvm_mmu mmu; + /* + * Pointer to the mmu context currently used for + * gva_to_gpa translations. 
+ */ + struct kvm_mmu *walk_mmu; + /* * This struct is filled with the necessary information to propagate a * page fault into the guest diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 99367274b97c..cb06adac92b1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2708,7 +2708,7 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu, static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.walk_mmu; context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; @@ -2767,11 +2767,11 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { - int r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); - vcpu->arch.mmu.set_cr3 = kvm_x86_ops->set_cr3; - vcpu->arch.mmu.get_cr3 = get_cr3; - vcpu->arch.mmu.inject_page_fault = kvm_inject_page_fault; + vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; + vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2364c2cad891..4196fc719142 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3456,27 +3456,27 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, @@ -3487,7 +3487,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3542,8 +3543,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -5663,6 +5665,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm = vcpu->kvm; vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) -- cgit v1.2.3 From 
6539e738f65a8f1fc7806295d5d701fba4008343 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:50 +0200 Subject: KVM: MMU: Implement nested gva_to_gpa functions This patch adds the functions to do a nested l2_gva to l1_gpa page table walk. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 ++++++++++ arch/x86/kvm/mmu.c | 8 ++++++++ arch/x86/kvm/paging_tmpl.h | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 5 +++++ 4 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1b3eb8a0a1bc..8ec3547c433d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -295,6 +295,16 @@ struct kvm_vcpu_arch { */ struct kvm_mmu mmu; + /* + * Paging state of an L2 guest (used for nested npt) + * + * This context will save all necessary information to walk page tables + * of the an L2 guest. This context is only initialized for page table + * walking and not for faulting since we never handle l2 page faults on + * the host. + */ + struct kvm_mmu nested_mmu; + /* * Pointer to the mmu context currently used for * gva_to_gpa translations. 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb06adac92b1..1e215e8b9377 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2466,6 +2466,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + if (error) + *error = 0; + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a704a8130e44..eefe363156b9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -276,6 +276,16 @@ static int FNAME(walk_addr)(struct guest_walker *walker, write_fault, user_fault, fetch_fault); } +static int FNAME(walk_addr_nested)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + int write_fault, int user_fault, + int fetch_fault) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, + addr, write_fault, user_fault, + fetch_fault); +} + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { @@ -660,6 +670,27 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, return gpa; } +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, + access & PFERR_WRITE_MASK, + access & PFERR_USER_MASK, + access & PFERR_FETCH_MASK); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } else if (error) + *error = walker.error_code; + + return gpa; +} + static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2d6385e44ccf..bf4dc2f40d7f 100644 --- 
a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); -- cgit v1.2.3 From ec92fe44e7ff94d04d8305e49efcffd8773e1cf6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:51 +0200 Subject: KVM: X86: Add kvm_read_guest_page_mmu function This patch adds a function which can read from the guests physical memory or from the guest's guest physical memory. This will be used in the two-dimensional page table walker. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/x86.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8ec3547c433d..08bc383083ff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -657,6 +657,9 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu); +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t gfn, void *data, int offset, int len, + u32 access); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4196fc719142..a2efb70f4cc8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -369,6 +369,29 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) } EXPORT_SYMBOL_GPL(kvm_require_cpl); +/* + * This function will be used to read from the physical memory 
of the currently + * running guest. The difference to kvm_read_guest_page is that this function + * can read from guest physical or from the guest's guest physical memory. + */ +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t ngfn, void *data, int offset, int len, + u32 access) +{ + gfn_t real_gfn; + gpa_t ngpa; + + ngpa = gfn_to_gpa(ngfn); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + if (real_gfn == UNMAPPED_GVA) + return -EFAULT; + + real_gfn = gpa_to_gfn(real_gfn); + + return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); + /* * Load the pae pdptrs. Return true is they are all valid. */ -- cgit v1.2.3 From 2329d46d213d0721dafae18db29f54b196f11468 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:52 +0200 Subject: KVM: MMU: Make walk_addr_generic capable for two-level walking This patch uses kvm_read_guest_page_mmu to make the walk_addr_generic functions suitable for two-level page table walking. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index eefe363156b9..f4e09d341e28 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -124,6 +124,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; bool eperm, present, rsvd_fault; + int offset; + u32 access = 0; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); @@ -153,12 +155,14 @@ walk: index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); + offset = index * sizeof(pt_element_t); + pte_gpa = gfn_to_gpa(table_gfn) + offset; walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, + offset, sizeof(pte), + PFERR_USER_MASK|PFERR_WRITE_MASK)) { present = false; break; } @@ -209,15 +213,27 @@ walk: is_large_pte(pte) && mmu->root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; + gpa_t real_gpa; + gfn_t gfn; - walker->gfn = gpte_to_gfn_lvl(pte, lvl); - walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) - >> PAGE_SHIFT; + gfn = gpte_to_gfn_lvl(pte, lvl); + gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); + gfn += pse36_gfn_delta(pte); + + access |= write_fault ? PFERR_WRITE_MASK : 0; + access |= fetch_fault ? PFERR_FETCH_MASK : 0; + access |= user_fault ? 
PFERR_USER_MASK : 0; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), + access); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; break; } -- cgit v1.2.3 From 3d06b8bfd44ec421c386241f7c5af66c8200cbf4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:53 +0200 Subject: KVM: MMU: Introduce kvm_read_nested_guest_page() This patch introduces the kvm_read_nested_guest_page function which reads from the physical memory of the guest. If the guest is running in guest-mode itself with nested paging enabled it will read from the guest's guest physical memory instead. The patch also changes the code to use this function where it is necessary. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a2efb70f4cc8..46843ed36dc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -392,6 +392,13 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); +int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + void *data, int offset, int len, u32 access) +{ + return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, + data, offset, len, access); +} + /* * Load the pae pdptrs. Return true is they are all valid. 
*/ @@ -403,8 +410,9 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) int ret; u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); + ret = kvm_read_nested_guest_page(vcpu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -433,6 +441,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; bool changed = true; + int offset; + gfn_t gfn; int r; if (is_long_mode(vcpu) || !is_pae(vcpu)) @@ -442,7 +452,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; + offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); + r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), + PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; -- cgit v1.2.3 From 02f59dc9f1f51d2148d87d48f84adb455a4fd697 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:54 +0200 Subject: KVM: MMU: Introduce init_kvm_nested_mmu() This patch introduces the init_kvm_nested_mmu() function which is used to re-initialize the nested mmu when the l2 guest changes its paging mode. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 37 ++++++++++++++++++++++++++++++++++++- arch/x86/kvm/mmu.h | 1 + arch/x86/kvm/x86.c | 17 +++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1e215e8b9377..a26f13bd34e0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2784,11 +2784,46 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) return r; } +static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + + g_context->get_cr3 = get_cr3; + g_context->inject_page_fault = kvm_inject_page_fault; + + /* + * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The + * translation of l2_gpa to l1_gpa addresses is done using the + * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa + * functions between mmu and nested_mmu are swapped. + */ + if (!is_paging(vcpu)) { + g_context->root_level = 0; + g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + } else if (is_long_mode(vcpu)) { + reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); + g_context->root_level = PT64_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else if (is_pae(vcpu)) { + reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); + g_context->root_level = PT32E_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else { + reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); + g_context->root_level = PT32_ROOT_LEVEL; + g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + } + + return 0; +} + static int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = bad_pfn; - if (tdp_enabled) + if (mmu_is_nested(vcpu)) + return init_kvm_nested_mmu(vcpu); + else if (tdp_enabled) return init_kvm_tdp_mmu(vcpu); else return init_kvm_softmmu(vcpu); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 7086ca85d3e7..513abbb5ff46 100644 --- 
a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -47,6 +47,7 @@ #define PFERR_USER_MASK (1U << 2) #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) +#define PFERR_NESTED_MASK (1U << 31) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46843ed36dc1..e4c76bf86081 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3489,6 +3489,22 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) return gpa; } +static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + gpa_t t_gpa; + u32 error; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* NPT walks are always user-walks */ + access |= PFERR_USER_MASK; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); + if (t_gpa == UNMAPPED_GVA) + vcpu->arch.fault.error_code |= PFERR_NESTED_MASK; + + return t_gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; @@ -5704,6 +5720,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else -- cgit v1.2.3 From d4f8cf664e4c1fd579df6b6e6378335c9f79d790 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:55 +0200 Subject: KVM: MMU: Propagate the right fault back to the guest after gva_to_gpa This patch implements logic to make sure that either a page-fault/page-fault-vmexit or a nested-page-fault-vmexit is propagated back to the guest. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08bc383083ff..574db6d1532a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -660,6 +660,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); +void kvm_propagate_fault(struct kvm_vcpu *vcpu); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e4c76bf86081..0281d920e9ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -338,6 +338,22 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_propagate_fault(struct kvm_vcpu *vcpu) +{ + u32 nested, error; + + error = vcpu->arch.fault.error_code; + nested = error & PFERR_NESTED_MASK; + error = error & ~PFERR_NESTED_MASK; + + vcpu->arch.fault.error_code = error; + + if (mmu_is_nested(vcpu) && !nested) + vcpu->arch.nested_mmu.inject_page_fault(vcpu); + else + vcpu->arch.mmu.inject_page_fault(vcpu); +} + void kvm_inject_nmi(struct kvm_vcpu *vcpu) { vcpu->arch.nmi_pending = 1; @@ -4140,7 +4156,7 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception == PF_VECTOR) - kvm_inject_page_fault(vcpu); + kvm_propagate_fault(vcpu); else if (ctxt->error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); else -- cgit v1.2.3 From d47f00a62b2e14b4a811b87bdb9ea1809693a377 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:56 +0200 Subject: KVM: X86: 
Propagate fetch faults KVM currently ignores fetch faults in the instruction emulator. With nested-npt we could have such faults. This patch adds the code to handle these. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 3 +++ arch/x86/kvm/x86.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2b08b78b6cab..aead72e141b4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1198,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, *(unsigned long *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt); + return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0281d920e9ed..3101060033ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4247,6 +4247,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.perm_ok = false; r = x86_decode_insn(&vcpu->arch.emulate_ctxt); + if (r == X86EMUL_PROPAGATE_FAULT) + goto done; + trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD @@ -4305,6 +4308,7 @@ restart: return handle_emulation_failure(vcpu); } +done: if (vcpu->arch.emulate_ctxt.exception >= 0) { inject_emulated_exception(vcpu); r = EMULATE_DONE; -- cgit v1.2.3 From ff03a073e715d49b5cfeeec862649b1df2481ae0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:57 +0200 Subject: KVM: MMU: Add kvm_mmu parameter to load_pdptrs function This function needs to be able to load the pdptrs from any mmu context currently in use. So change this function to take a kvm_mmu parameter to fit these needs. As a side effect this patch also moves the cached pdptrs from vcpu_arch into the kvm_mmu struct. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 5 +++-- arch/x86/kvm/kvm_cache_regs.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 16 ++++++++-------- arch/x86/kvm/x86.c | 26 ++++++++++++++------------ 5 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 574db6d1532a..9e70de376544 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -257,6 +257,8 @@ struct kvm_mmu { u64 *pae_root; u64 rsvd_bits_mask[2][4]; + + u64 pdptrs[4]; /* pae */ }; struct kvm_vcpu_arch { @@ -276,7 +278,6 @@ struct kvm_vcpu_arch { unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; - u64 pdptrs[4]; /* pae */ u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ @@ -592,7 +593,7 @@ void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6491ac8e755b..a37abe2ec39a 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -42,7 +42,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); - return vcpu->arch.pdptrs[index]; + return vcpu->arch.walk_mmu->pdptrs[index]; } static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 53c9039583fd..ca711cb27a19 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1010,7 +1010,7 @@ static void 
svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) switch (reg) { case VCPU_EXREG_PDPTR: BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); break; default: BUG(); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ff7a8d48fd24..1a7691a87178 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1842,20 +1842,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3101060033ae..bbd9f4af4449 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -418,17 +418,17 @@ int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, /* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_nested_guest_page(vcpu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); + ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -442,7 +442,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, @@ -455,7 +455,7 @@ EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; int offset; gfn_t gfn; @@ -474,7 +474,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; + changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; out: return changed; @@ -513,7 +513,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.cr3)) return 1; } @@ -602,7 +603,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, 
vcpu->arch.cr3)) return 1; if (cr4 & X86_CR4_VMXE) @@ -635,7 +636,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) return 1; - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + if (is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; } /* @@ -5422,7 +5424,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); mmu_reset_needed = 1; } -- cgit v1.2.3 From d41d1895eb856b5d1c82f3be106b7a3e75e4216b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:58 +0200 Subject: KVM: MMU: Introduce kvm_pdptr_read_mmu This function is implemented to load the pdptr pointers of the currently running guest (l1 or l2 guest). Therefore it takes care about the current paging mode and can read pdptrs out of l2 guest physical memory. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 7 +++++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index a37abe2ec39a..975bb45329a1 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -45,6 +45,13 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } +static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) +{ + load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); + + return mmu->pdptrs[index]; +} + static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a26f13bd34e0..a25173a0d8b9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2398,7 +2398,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read(vcpu, i); + pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f4e09d341e28..a28f09bb76c6 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -137,7 +137,7 @@ walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { - pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { present = false; -- cgit v1.2.3 From 651dd37a9ce6fdacdcd75da86619c62111efcbc2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:30:59 +0200 Subject: KVM: MMU: Refactor mmu_alloc_roots function This patch factors out the 
direct-mapping paths of the mmu_alloc_roots function into a separate function. This makes it a lot easier to avoid all the unnecessary checks done in the shadow path which may break when running direct. In fact, this patch already fixes a problem when running PAE guests on a PAE shadow page table. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a25173a0d8b9..9cd5a717ede5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2357,42 +2357,77 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + int i; + + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, + 1, ACC_ALL, NULL); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + PT32_ROOT_LEVEL, 1, ACC_ALL, + NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + } + } else + BUG(); + + return 0; +} + +static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { int i; gfn_t root_gfn; struct kvm_mmu_page *sp; - int direct = 0; u64 pdptr; root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; - if 
(vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (mmu_check_root(vcpu, root_gfn)) + return 1; + + /* + * Do we shadow a long mode page table? If so we need to + * write-protect the guests page table root. + */ + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (mmu_check_root(vcpu, root_gfn)) - return 1; - if (vcpu->arch.mmu.direct_map) { - direct = 1; - root_gfn = 0; - } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, direct, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, + 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } - direct = !is_paging(vcpu); - - if (mmu_check_root(vcpu, root_gfn)) - return 1; + /* + * We shadow a 32 bit page table. This may be a legacy 2-level + * or a PAE 3-level page table. + */ for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2406,16 +2441,11 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = pdptr >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - if (vcpu->arch.mmu.direct_map) { - direct = 1; - root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, direct, + PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -2427,6 +2457,14 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return 0; } +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.direct_map) + return mmu_alloc_direct_roots(vcpu); + else + return mmu_alloc_shadow_roots(vcpu); +} + static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; -- cgit v1.2.3 From 81407ca553c0c852b8cd3f38f3ec362d307f829b Mon Sep 17 00:00:00 2001 From: Joerg Roedel 
Date: Fri, 10 Sep 2010 17:31:00 +0200 Subject: KVM: MMU: Allow long mode shadows for legacy page tables Currently the KVM softmmu implementation can not shadow a 32 bit legacy or PAE page table with a long mode page table. This is a required feature for nested paging emulation because the nested page table must always be in host format. So this patch implements the missing pieces to allow long mode page tables for page table types. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e70de376544..bd59b482f1a8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -256,6 +256,7 @@ struct kvm_mmu { bool direct_map; u64 *pae_root; + u64 *lm_root; u64 rsvd_bits_mask[2][4]; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cd5a717ede5..dd76765310ce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1504,6 +1504,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->addr = addr; iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; + + if (iterator->level == PT64_ROOT_LEVEL && + vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && + !vcpu->arch.mmu.direct_map) + --iterator->level; + if (iterator->level == PT32E_ROOT_LEVEL) { iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; @@ -2314,7 +2320,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && + (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || + vcpu->arch.mmu.direct_map)) { hpa_t root = 
vcpu->arch.mmu.root_hpa; sp = page_header(root); @@ -2394,10 +2402,10 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { - int i; - gfn_t root_gfn; struct kvm_mmu_page *sp; - u64 pdptr; + u64 pdptr, pm_mask; + gfn_t root_gfn; + int i; root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; @@ -2426,8 +2434,13 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) /* * We shadow a 32 bit page table. This may be a legacy 2-level - * or a PAE 3-level page table. + * or a PAE 3-level page table. In either case we need to be aware that + * the shadow page table may be a PAE or a long mode page table. */ + pm_mask = PT_PRESENT_MASK; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) + pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2451,9 +2464,35 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu.pae_root[i] = root | pm_mask; + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + + /* + * If we shadow a 32 bit page table with a long mode page + * table we enter this path. + */ + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.lm_root == NULL) { + /* + * The additional page necessary for this is only + * allocated on demand. 
+ */ + + u64 *lm_root; + + lm_root = (void*)get_zeroed_page(GFP_KERNEL); + if (lm_root == NULL) + return 1; + + lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + + vcpu->arch.mmu.lm_root = lm_root; + } + + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + } + return 0; } @@ -2470,9 +2509,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) int i; struct kvm_mmu_page *sp; + if (vcpu->arch.mmu.direct_map) + return; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -3253,6 +3295,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); + if (vcpu->arch.mmu.lm_root != NULL) + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 2d48a985c7bbcd72b4e92e301ea96bf1252ffc61 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:01 +0200 Subject: KVM: MMU: Track NX state in struct kvm_mmu With Nested Paging emulation the NX state between the two MMU contexts may differ. To make sure that always the right fault error code is recorded this patch moves the NX state into struct kvm_mmu so that the code can distinguish between L1 and L2 NX state. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu.c | 16 +++++++++++++++- arch/x86/kvm/paging_tmpl.h | 4 ++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd59b482f1a8..b43686a44877 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -259,6 +259,8 @@ struct kvm_mmu { u64 *lm_root; u64 rsvd_bits_mask[2][4]; + bool nx; + u64 pdptrs[4]; /* pae */ }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd76765310ce..95cbeed74cf9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2634,6 +2634,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = true; + context->nx = false; return 0; } @@ -2687,7 +2688,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; - if (!is_nx(vcpu)) + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (level) { case PT32_ROOT_LEVEL: @@ -2746,6 +2747,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) { + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, level); ASSERT(is_pae(vcpu)); @@ -2772,6 +2775,8 @@ static int paging64_init_context(struct kvm_vcpu *vcpu, static int paging32_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + context->nx = false; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; @@ -2810,19 +2815,24 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; context->inject_page_fault = kvm_inject_page_fault; + context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { + context->nx = false; context->gva_to_gpa = nonpaging_gva_to_gpa; 
context->root_level = 0; } else if (is_long_mode(vcpu)) { + context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { + context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { + context->nx = false; reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; @@ -2878,17 +2888,21 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) * functions between mmu and nested_mmu are swapped. */ if (!is_paging(vcpu)) { + g_context->nx = false; g_context->root_level = 0; g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { + g_context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); g_context->root_level = PT64_ROOT_LEVEL; g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { + g_context->nx = is_nx(vcpu); reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); g_context->root_level = PT32E_ROOT_LEVEL; g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { + g_context->nx = false; reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); g_context->root_level = PT32_ROOT_LEVEL; g_context->gva_to_gpa = paging32_gva_to_gpa_nested; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a28f09bb76c6..2bdd843ad63f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -105,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; #if PTTYPE == 64 - if (is_nx(vcpu)) + if (vcpu->arch.mmu.nx) access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; @@ -272,7 +272,7 @@ error: walker->error_code |= PFERR_WRITE_MASK; if 
(user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault && is_nx(vcpu)) + if (fetch_fault && mmu->nx) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; -- cgit v1.2.3 From 5bd2edc341d11af175e759a546e4335ba3e0584f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:02 +0200 Subject: KVM: SVM: Implement MMU helper functions for Nested Nested Paging This patch adds the helper functions which will be used in the mmu context for handling nested nested page faults. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ca711cb27a19..9a9a4405b571 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -104,6 +104,8 @@ struct nested_state { u32 intercept_exceptions; u64 intercept; + /* Nested Paging related state */ + u64 nested_cr3; }; #define MSRPM_OFFSETS 16 @@ -1600,6 +1602,34 @@ static int vmmcall_interception(struct vcpu_svm *svm) return 1; } +static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.nested_cr3; +} + +static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, + unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); +} + +static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code; + svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; + + nested_svm_vmexit(svm); +} + static int nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) -- cgit v1.2.3 From 4b16184c1ccafa4b0c188c622ea532fb90e6f5b0 Mon Sep 17 00:00:00 2001 
From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:03 +0200 Subject: KVM: SVM: Initialize Nested Nested MMU context on VMRUN This patch adds code to initialize the Nested Nested Paging MMU context when the L1 guest executes a VMRUN instruction and has nested paging enabled in its VMCB. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/svm.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95cbeed74cf9..6e248d80e350 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2962,6 +2962,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a9a4405b571..3184772dedfe 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -294,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) force_new_asid(vcpu); } +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; @@ -1630,6 +1639,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) nested_svm_vmexit(svm); } +static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r; + + r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + + vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + static int 
nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) @@ -1998,6 +2027,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); + svm->nested.nested_cr3 = 0; + /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; svm->vmcb->save.cs = hsave->save.cs; @@ -2024,6 +2055,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_unmap(page); + nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -2071,6 +2103,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if (vmcb->control.asid == 0) return false; + if (vmcb->control.nested_ctl && !npt_enabled) + return false; + return true; } @@ -2143,6 +2178,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + if (nested_vmcb->control.nested_ctl) { + kvm_mmu_unload(&svm->vcpu); + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + nested_svm_init_mmu_context(&svm->vcpu); + } + /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -3415,15 +3456,6 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static int get_npt_level(void) -{ -#ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; -#else - return PT32E_ROOT_LEVEL; -#endif -} - static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; -- cgit v1.2.3 From 55c5e464fcc28ee763d40561abf2b259131dd703 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:04 +0200 Subject: KVM: SVM: Expect two more candidates for exit_int_info This patch adds INTR and NMI intercepts to the list of expected intercepts with an exit_int_info set. While this can't happen on bare metal it is architecturally legal and may happen with KVM's SVM emulation.
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3184772dedfe..de1930ee2abb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2991,7 +2991,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && + exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, -- cgit v1.2.3 From 3d4aeaad8bb8f8084a414819934b73ab49c26c92 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:05 +0200 Subject: KVM: SVM: Report Nested Paging support to userspace This patch implements the reporting of the nested paging feature support to userspace. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index de1930ee2abb..36e6c88913dc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3481,6 +3481,10 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) if (svm_has(SVM_FEATURE_NRIP)) entry->edx |= SVM_FEATURE_NRIP; + /* Support NPT for the guest if enabled */ + if (npt_enabled) + entry->edx |= SVM_FEATURE_NPT; + break; } } -- cgit v1.2.3 From 4c62a2dc92518c5adf434df8e5c2283c6762672a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 10 Sep 2010 17:31:06 +0200 Subject: KVM: X86: Report SVM bit to userspace only when supported This patch fixes a bug in KVM where it _always_ reports the support of the SVM feature to userspace. 
But KVM only supports SVM on AMD hardware and only when it is enabled in the kernel module. This patch fixes the wrong reporting. Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/x86.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 36e6c88913dc..e0f4da07f987 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3469,6 +3469,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x80000001: + if (nested) + entry->ecx |= (1 << 2); /* Set SVM bit */ + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbd9f4af4449..3ff0a8ff275c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2209,7 +2209,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 0 /* SKINIT */ | 0 /* WDT */; -- cgit v1.2.3 From b0bc3ee2b54fcea0df42cc9aa05103b1ccd89db0 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 13 Sep 2010 16:45:28 +0200 Subject: KVM: MMU: Fix regression with ept memory types merged into non-ept page tables Commit "KVM: MMU: Make tdp_enabled a mmu-context parameter" made real-mode set ->direct_map, and changed the code that merges in the memory type depend on direct_map instead of tdp_enabled. 
However, in this case what really matters is tdp, not direct_map, since tdp changes the pte format regardless of whether the mapping is direct or not. As a result, real-mode shadow mappings got corrupted with ept memory types. The result was a huge slowdown, likely due to the cache being disabled. Change it back as the simplest fix for the regression (real fix is to move all that to vmx code, and not use tdp_enabled as a synonym for ept). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6e248d80e350..3ce56bfe056e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1980,7 +1980,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_user_mask; if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; - if (vcpu->arch.mmu.direct_map) + if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); -- cgit v1.2.3 From 3842d135ff246b6543f1df77f5600e12094a6845 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 27 Jul 2010 12:30:24 +0300 Subject: KVM: Check for pending events before attempting injection Instead of blindly attempting to inject an event before each guest entry, check for a possible event first in vcpu->requests. Sites that can trigger event injection are modified to set KVM_REQ_EVENT: - interrupt, nmi window opening - ppr updates - i8259 output changes - local apic irr changes - rflags updates - gif flag set - event set on exit This improves non-injecting entry performance, and sets the stage for non-atomic injection. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 1 + arch/x86/kvm/lapic.c | 13 +++++++++++-- arch/x86/kvm/svm.c | 8 +++++++- arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 41 ++++++++++++++++++++++++++++++++--------- include/linux/kvm_host.h | 1 + 6 files changed, 58 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 6e77471951e8..ab1bb8ff9a8d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; + kvm_make_request(KVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817d..c6f2f159384a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic) { - u32 tpr, isrv, ppr; + u32 tpr, isrv, ppr, old_ppr; int isr; + old_ppr = apic_get_reg(apic, APIC_PROCPRI); tpr = apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? 
isr : 0; @@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - apic_set_reg(apic, APIC_PROCPRI, ppr); + if (old_ppr != ppr) { + apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + } } static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) @@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; @@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_EDGE_TRIG; if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1152,6 +1160,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + kvm_make_request(KVM_REQ_EVENT, vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0f4da07f987..1d2ea65d3537 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2371,6 +2371,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); + 
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -2763,6 +2764,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -3209,8 +3211,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) + if (svm->vcpu.arch.hflags & HF_IRET_MASK) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } svm->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&svm->vcpu); @@ -3219,6 +3223,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a7691a87178..2ce2e0b13edb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3327,6 +3327,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3339,6 +3340,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + kvm_make_request(KVM_REQ_EVENT, vcpu); + ++vcpu->stat.irq_window_exits; /* @@ -3595,6 +3598,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3828,6 +3832,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (!idtv_info_valid) return; + 
kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ff0a8ff275c..e7198036db61 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -284,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!vcpu->arch.exception.pending) { queue: vcpu->arch.exception.pending = true; @@ -356,6 +358,7 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -2418,6 +2421,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -2571,6 +2575,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -4329,6 +4335,7 @@ done: toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); @@ -4998,6 +5005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; + bool req_event; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5045,8 +5053,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); + req_event = kvm_check_request(KVM_REQ_EVENT, vcpu); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests || need_resched() || 
signal_pending(current)) { + if (req_event) + kvm_make_request(KVM_REQ_EVENT, vcpu); atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); @@ -5055,17 +5067,19 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - inject_pending_event(vcpu); + if (req_event || req_int_win) { + inject_pending_event(vcpu); - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } } srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5305,6 +5319,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5368,6 +5384,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu->arch.mp_state = mp_state->mp_state; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -5389,6 +5406,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -5459,6 +5477,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5691,6 +5711,8 @@ int 
kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6001,6 +6023,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 917e68ff5ed2..6022da1490e4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -39,6 +39,7 @@ #define KVM_REQ_KVMCLOCK_UPDATE 8 #define KVM_REQ_KICK 9 #define KVM_REQ_DEACTIVATE_FPU 10 +#define KVM_REQ_EVENT 11 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 -- cgit v1.2.3 From 51aa01d13d4a64422cf8095205fc4a02322aca2c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Jul 2010 14:31:20 +0300 Subject: KVM: VMX: Split up vmx_complete_interrupts() vmx_complete_interrupts() does too much, split it up: - vmx_vcpu_run() gets the "cache important vmcs fields" part - a new vmx_complete_atomic_exit() gets the parts that must be done atomically - a new vmx_recover_nmi_blocking() does what its name says - vmx_complete_interrupts() retains the event injection recovery code This helps in reducing the work done in atomic context. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ce2e0b13edb..927d8404505a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -125,6 +125,7 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u32 exit_intr_info; u32 idt_vectoring_info; struct shared_msr_entry *guest_msrs; int nmsrs; @@ -3775,18 +3776,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; - u32 idt_vectoring_info = vmx->idt_vectoring_info; - bool unblock_nmi; - u8 vector; - int type; - bool idtv_info_valid; - - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + u32 exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -3801,8 +3793,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); } +} - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx->exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; @@ -3824,6 +3824,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +} + +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + u32 idt_vectoring_info = 
vmx->idt_vectoring_info; + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&vmx->vcpu); @@ -4036,6 +4046,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } -- cgit v1.2.3 From 537b37e2674b7e4390a490e03cae53ca9ca99e30 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 22 Jul 2010 12:54:21 +0300 Subject: KVM: VMX: Move real-mode interrupt injection fixup to vmx_complete_interrupts() This allows reuse of vmx_complete_interrupts() for cancelling injections. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 927d8404505a..541f0d2412b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,6 +182,7 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); +static void fixup_rmode_irq(struct vcpu_vmx *vmx); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3828,11 +3829,15 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - u32 idt_vectoring_info = vmx->idt_vectoring_info; + u32 idt_vectoring_info; u8 vector; int type; bool idtv_info_valid; + if (vmx->rmode.irq.pending) + fixup_rmode_irq(vmx); + + idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; @@ -4040,8 +4045,6 @@ static void 
vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; -- cgit v1.2.3 From 83422e17c19d61399cab7dbf9bf40ff9af2a7dd2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Jul 2010 14:43:23 +0300 Subject: KVM: VMX: Parameterize vmx_complete_interrupts() for both exit and entry Currently vmx_complete_interrupts() can decode event information from vmx exit fields into the generic kvm event queues. Make it able to decode the information from the entry fields as well by parametrizing it. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 541f0d2412b4..3237f6cc930d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,7 +182,7 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void fixup_rmode_irq(struct vcpu_vmx *vmx); +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3827,17 +3827,18 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, + u32 idt_vectoring_info, + int instr_len_field, + int error_code_field) { - u32 idt_vectoring_info; u8 vector; int type; bool idtv_info_valid; if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); + fixup_rmode_irq(vmx, &idt_vectoring_info); - idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & 
VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; @@ -3865,18 +3866,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + u32 err = vmcs_read32(error_code_field); kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(&vmx->vcpu, vector, @@ -3887,24 +3888,31 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } } +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + /* * Failure to inject an interrupt should give us the information * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs * when fetching the interrupt redirection bitmap in the real-mode * tss, this doesn't happen. So we do it ourselves. 
*/ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) { vmx->rmode.irq.pending = 0; if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; + if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + *idt_vectoring_info |= INTR_TYPE_EXT_INTR; return; } - vmx->idt_vectoring_info = + *idt_vectoring_info = VECTORING_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | vmx->rmode.irq.vector; -- cgit v1.2.3 From b463a6f744a263fccd7da14db1afdc880371a280 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 20 Jul 2010 15:06:17 +0300 Subject: KVM: Non-atomic interrupt injection Change the interrupt injection code to work from preemptible, interrupts enabled context. This works by adding a ->cancel_injection() operation that undoes an injection in case we were not able to actually enter the guest (this condition could never happen with atomic injection). 
Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 12 ++++++++++++ arch/x86/kvm/vmx.c | 11 +++++++++++ arch/x86/kvm/x86.c | 36 ++++++++++++++++-------------------- 4 files changed, 40 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b43686a44877..80224bf5d4f8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -552,6 +552,7 @@ struct kvm_x86_ops { void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code, bool reinject); + void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1d2ea65d3537..1a85fc507cf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3261,6 +3261,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) } } +static void svm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + control->exit_int_info = control->event_inj; + control->exit_int_info_err = control->event_inj_err; + control->event_inj = 0; + svm_complete_interrupts(svm); +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -3631,6 +3642,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, + .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3237f6cc930d..70af3db372d7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3895,6 +3895,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) IDT_VECTORING_ERROR_CODE); } +static void 
vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(to_vmx(vcpu), + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); +} + /* * Failure to inject an interrupt should give us the information * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs @@ -4348,6 +4358,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e7198036db61..a465bd29f381 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5005,7 +5005,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - bool req_event; if (vcpu->requests) { if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) @@ -5041,6 +5040,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -5053,35 +5067,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); - req_event = kvm_check_request(KVM_REQ_EVENT, vcpu); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests || need_resched() || signal_pending(current)) { - if (req_event) - kvm_make_request(KVM_REQ_EVENT, vcpu); 
atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); preempt_enable(); + kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - if (req_event || req_int_win) { - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } - } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); -- cgit v1.2.3 From 625831a3f40d330c611fe37cf501d80d611921f9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 22 Jul 2010 13:09:54 +0300 Subject: KVM: VMX: Move fixup_rmode_irq() to avoid forward declaration No code changes. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 70af3db372d7..32315935201a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -182,7 +182,6 @@ static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -3827,6 +3826,29 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } +/* + * Failure to inject an interrupt should give us the information + * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs + * when fetching the interrupt redirection bitmap in the real-mode + * tss, this doesn't happen. So we do it ourselves. 
+ */ +static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) +{ + vmx->rmode.irq.pending = 0; + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) + return; + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); + if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { + *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; + *idt_vectoring_info |= INTR_TYPE_EXT_INTR; + return; + } + *idt_vectoring_info = + VECTORING_INFO_VALID_MASK + | INTR_TYPE_EXT_INTR + | vmx->rmode.irq.vector; +} + static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, u32 idt_vectoring_info, int instr_len_field, @@ -3905,29 +3927,6 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. - */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) -{ - vmx->rmode.irq.pending = 0; - if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) - return; - kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - *idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - *idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - #ifdef CONFIG_X86_64 #define R "r" #define Q "q" -- cgit v1.2.3 From 0959ffacf39b1ae7f56072b0c64429ee528100ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 14 Sep 2010 17:46:12 +0200 Subject: KVM: MMU: Don't track nested fault info in error-code This patch moves the detection whether a page-fault was nested or not out of the error code and moves it into a separate variable in the fault struct. 
Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.h | 1 - arch/x86/kvm/x86.c | 14 ++++---------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 80224bf5d4f8..519d6f784984 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -322,6 +322,7 @@ struct kvm_vcpu_arch { struct { u64 address; unsigned error_code; + bool nested; } fault; /* only needed in kvm_pv_mmu_op() path, but it's hot so diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 513abbb5ff46..7086ca85d3e7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -47,7 +47,6 @@ #define PFERR_USER_MASK (1U << 2) #define PFERR_RSVD_MASK (1U << 3) #define PFERR_FETCH_MASK (1U << 4) -#define PFERR_NESTED_MASK (1U << 31) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a465bd29f381..a51635ee85ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -342,18 +342,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu) void kvm_propagate_fault(struct kvm_vcpu *vcpu) { - u32 nested, error; - - error = vcpu->arch.fault.error_code; - nested = error & PFERR_NESTED_MASK; - error = error & ~PFERR_NESTED_MASK; - - vcpu->arch.fault.error_code = error; - - if (mmu_is_nested(vcpu) && !nested) + if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) vcpu->arch.nested_mmu.inject_page_fault(vcpu); else vcpu->arch.mmu.inject_page_fault(vcpu); + + vcpu->arch.fault.nested = false; } void kvm_inject_nmi(struct kvm_vcpu *vcpu) @@ -3524,7 +3518,7 @@ static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) access |= PFERR_USER_MASK; t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); if (t_gpa == UNMAPPED_GVA) - 
vcpu->arch.fault.error_code |= PFERR_NESTED_MASK; + vcpu->arch.fault.nested = true; return t_gpa; } -- cgit v1.2.3 From 28e4639adf0c9f26f6bb56149b7ab547bf33bb95 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:12 -1000 Subject: KVM: x86: Fix kvmclock bug If preempted after kvmclock values are updated, but before hardware virtualization is entered, the last tsc time as read by the guest is never set. It underflows the next time kvmclock is updated if there has not yet been a successful entry / exit into hardware virt. Fix this by simply setting last_tsc to the newly read tsc value so that any computed nsec advance of kvmclock is nulled. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a51635ee85ec..0b021e16f9dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1095,6 +1095,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; vcpu->hv_clock.flags = 0; /* -- cgit v1.2.3 From f4f510508741680e423524c222f615276ca6222c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Sep 2010 18:44:07 +0200 Subject: KVM: Convert PIC lock from raw spinlock to ordinary spinlock The PIC code used to be called from preempt_disable() context, which wasn't very good for PREEMPT_RT. That is no longer the case, so move back from raw_spinlock_t to spinlock_t. 
Signed-off-by: Avi Kivity Acked-by: Thomas Gleixner Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8259.c | 6 +++--- arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/x86.c | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index ab1bb8ff9a8d..dd54c5bb2e5e 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { - raw_spin_lock(&s->lock); + spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) @@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s) s->wakeup_needed = false; - raw_spin_unlock(&s->lock); + spin_unlock(&s->lock); if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { @@ -569,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - raw_spin_lock_init(&s->lock); + spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 63c314502993..ba910d149410 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -60,7 +60,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - raw_spinlock_t lock; + spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0b021e16f9dd..3adf6921e88e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3011,18 +3011,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - 
raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); -- cgit v1.2.3 From a0a07cd2c5fc8703db8a07287cdde3d29a286082 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 20 Sep 2010 10:15:32 +0200 Subject: KVM: SVM: do not generate "external interrupt exit" if other exit is pending Nested SVM checks for external interrupt after injecting nested exception. In case there is external interrupt pending the code generates "external interrupt exit" and overwrites previous exit info. If previously injected exception already generated exit it will be lost. Signed-off-by: Gleb Natapov Acked-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a85fc507cf7..c929d007696d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1707,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) return false; + /* + * if vmexit was already requested (by intercepted exception + * for instance) do not overwrite it with "external interrupt" + * vmexit. + */ + if (svm->nested.exit_required) + return false; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; -- cgit v1.2.3 From cb16a7b3872e9a806f16b1f09b59103fafc7b796 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Sat, 18 Sep 2010 08:41:02 +0800 Subject: KVM: MMU: fix counting of rmap entries in rmap_add() It seems that rmap entries are under counted. 
Signed-off-by: Hillf Danton Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ce56bfe056e..c94c43289f53 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -632,6 +632,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc->sptes[0] = (u64 *)*rmapp; desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; + ++count; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); @@ -644,7 +645,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc = desc->more; } for (i = 0; desc->sptes[i]; ++i) - ; + ++count; desc->sptes[i] = spte; } return count; -- cgit v1.2.3 From 4ab8e02404fcbc16beefac66de24dbb2706fe2f3 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 19 Sep 2010 14:34:05 +0200 Subject: KVM: x86 emulator: Expose emulate_int_real() Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 5187dd88019b..b36c6b3fe144 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -260,5 +260,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, bool has_error_code, u32 error_code); - +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ -- cgit v1.2.3 From 63995653ade16deacaea5b49ceaf6376314593ac Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 19 Sep 2010 14:34:06 +0200 Subject: KVM: Add kvm_inject_realmode_interrupt() wrapper This adds a wrapper function 
kvm_inject_realmode_interrupt() around the emulator function emulate_int_real() to allow real mode interrupt injection. [avi: initialize operand and address sizes before emulating interrupts] [avi: initialize rip for real mode interrupt injection] [avi: clear interrupt pending flag after emulating interrupt injection] Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 29 +++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + 2 files changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3adf6921e88e..7d2880500fa3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4188,6 +4188,35 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); } +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + vcpu->arch.emulate_ctxt.decode.op_bytes = 2; + vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + + if (ret != X86EMUL_CONTINUE) + return EMULATE_FAIL; + + vcpu->arch.emulate_ctxt.eip = c->eip; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (irq == NMI_VECTOR) + vcpu->arch.nmi_pending = false; + else + vcpu->arch.interrupt.pending = false; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); + static int handle_emulation_failure(struct kvm_vcpu *vcpu) { ++vcpu->stat.insn_emulation_fail; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index bf4dc2f40d7f..2cea414489f3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -72,6 +72,7 @@ static inline int is_paging(struct kvm_vcpu *vcpu) void 
kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); -- cgit v1.2.3 From a92601bb707f6f49fd5563ef3d09928e70cc222e Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 19 Sep 2010 14:34:07 +0200 Subject: KVM: VMX: Emulated real mode interrupt injection Replace the inject-as-software-interrupt hack we currently have with emulated injection. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 65 +++++------------------------------------------------- 1 file changed, 6 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32315935201a..9d3f972aa19c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -155,11 +155,6 @@ struct vcpu_vmx { u32 limit; u32 ar; } tr, es, ds, fs, gs; - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; } rmode; int vpid; bool emulation_required; @@ -1028,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = nr; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (kvm_exception_is_soft(nr)) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - intr_info |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2816,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (vcpu->arch.interrupt.soft) - 
vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -2857,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = NMI_VECTOR; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - NMI_VECTOR | INTR_TYPE_SOFT_INTR | - INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -3826,29 +3799,6 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. 
- */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx, u32 *idt_vectoring_info) -{ - vmx->rmode.irq.pending = 0; - if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) - return; - kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (*idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - *idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - *idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - *idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; -} - static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, u32 idt_vectoring_info, int instr_len_field, @@ -3858,9 +3808,6 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, int type; bool idtv_info_valid; - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx, &idt_vectoring_info); - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; -- cgit v1.2.3 From 49e9d557f9b6e9639390b63b645f2def8dde5f1b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 19 Sep 2010 14:34:08 +0200 Subject: KVM: VMX: Respect interrupt window in big real mode If an interrupt is pending, we need to stop emulation so we can inject it. 
Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9d3f972aa19c..28c72da93a1b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3582,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; + u32 cpu_exec_ctrl; + bool intr_window_requested; + + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (!guest_state_valid(vcpu)) { + if (intr_window_requested + && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + return handle_interrupt_window(&vmx->vcpu); + err = emulate_instruction(vcpu, 0, 0, 0); if (err == EMULATE_DO_MMIO) { -- cgit v1.2.3 From 5f4e3f882731c65b5d64a2ff743fda96eaebb9ee Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:13 -1000 Subject: KVM: x86: Make math work for other scales The math in kvm_get_time_scale relies on the fact that NSEC_PER_SEC < 2^32. To use the same function to compute arbitrary time scales, we must extend the first reduction step to shrink the base rate to a 32-bit value, and possibly reduce the scaled rate into a 32-bit as well. Note we must take care to avoid an arithmetic overflow when scaling up the tps32 value (this could not happen with the fixed scaled value of NSEC_PER_SEC, but can happen with scaled rates above 2^31).
Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7d2880500fa3..6666af840190 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -920,31 +920,35 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return quotient; } -static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) { - uint64_t nsecs = 1000000000LL; + uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000UL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000UL) { + if (scaled64 & 0xffffffff00000000UL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; shift++; } - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __func__, tsc_khz, hv_clock->tsc_shift, - hv_clock->tsc_to_system_mul); + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); } static inline u64 get_kernel_ns(void) @@ -1084,7 +1088,9 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) } if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = this_tsc_khz; } -- cgit v1.2.3 From 
34c238a1d1832d7b1f655641f52782e86396b30a Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:14 -1000 Subject: KVM: x86: Rename timer function This just changes some names to better reflect the usage they will be given. Separated out to keep confusion to a minimum. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 ++++++------ include/linux/kvm_host.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6666af840190..ce57cd899a62 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -892,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the + * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. 
*/ @@ -1032,7 +1032,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) } EXPORT_SYMBOL_GPL(kvm_write_tsc); -static int kvm_write_guest_time(struct kvm_vcpu *v) +static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; @@ -1052,7 +1052,7 @@ static int kvm_write_guest_time(struct kvm_vcpu *v) local_irq_restore(flags); if (unlikely(this_tsc_khz == 0)) { - kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } @@ -1128,7 +1128,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) if (!vcpu->time_page) return 0; - kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } @@ -5041,8 +5041,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) { - r = kvm_write_guest_time(vcpu); + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { + r = kvm_guest_time_update(vcpu); if (unlikely(r)) goto out; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6022da1490e4..0b89d008db65 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -36,7 +36,7 @@ #define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_UNHALT 6 #define KVM_REQ_MMU_SYNC 7 -#define KVM_REQ_KVMCLOCK_UPDATE 8 +#define KVM_REQ_CLOCK_UPDATE 8 #define KVM_REQ_KICK 9 #define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_EVENT 11 -- cgit v1.2.3 From c285545f813d7b0ce989fd34e42ad1fe785dc65d Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Sat, 18 Sep 2010 14:38:15 -1000 Subject: KVM: x86: TSC catchup mode Negate the effects of AN TYM spell while kvm thread is preempted by tracking conversion factor to the highest TSC rate and catching the TSC up when it has fallen behind the kernel view of time. Note that once triggered, we don't turn off catchup mode. 
A slightly more clever version of this is possible, which only does catchup when TSC rate drops, and which specifically targets only CPUs with broken TSC, but since these all are considered unstable_tsc(), this patch covers all necessary cases. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 6 +++ arch/x86/kvm/x86.c | 87 +++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 519d6f784984..9e6fe391094e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -384,6 +384,9 @@ struct kvm_vcpu_arch { u64 last_host_tsc; u64 last_guest_tsc; u64 last_kernel_ns; + u64 last_tsc_nsec; + u64 last_tsc_write; + bool tsc_catchup; bool nmi_pending; bool nmi_injected; @@ -444,6 +447,9 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; + u32 virtual_tsc_khz; + u32 virtual_tsc_mult; + s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce57cd899a62..bfcf8fd5e080 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -962,6 +962,7 @@ static inline u64 get_kernel_ns(void) } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; static inline int kvm_tsc_changes_freq(void) { @@ -985,6 +986,24 @@ static inline u64 nsec_to_cycles(u64 nsec) return ret; } +static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &kvm->arch.virtual_tsc_shift, + &kvm->arch.virtual_tsc_mult); + kvm->arch.virtual_tsc_khz = this_tsc_khz; +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->kvm->arch.virtual_tsc_mult, + 
vcpu->kvm->arch.virtual_tsc_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; @@ -1029,6 +1048,8 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; } EXPORT_SYMBOL_GPL(kvm_write_tsc); @@ -1041,21 +1062,41 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) s64 kernel_ns, max_kernel_ns; u64 tsc_timestamp; - if ((!vcpu->time_page)) - return 0; - /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - local_irq_restore(flags); if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; } + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } + } + + local_irq_restore(flags); + + if (!vcpu->time_page) + return 0; + /* * Time as measured by the TSC may go backwards when resetting the base * tsc_timestamp. 
The reason for this is that the TSC resolution is @@ -1122,16 +1163,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; } -static int kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - - if (!vcpu->time_page) - return 0; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - return 1; -} - static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -1455,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } vcpu->arch.time = data; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ if (!(data & 1)) @@ -1470,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } - - kvm_request_guest_time_update(vcpu); break; } case MSR_IA32_MCG_CTL: @@ -2028,9 +2058,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (check_tsc_unstable()) + if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); - kvm_migrate_timers(vcpu); + vcpu->arch.tsc_catchup = 1; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); vcpu->cpu = cpu; } } @@ -4461,8 +4495,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - if (!kvm_request_guest_time_update(vcpu)) - continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != smp_processor_id()) send_ipi = 1; } @@ -4517,11 +4550,20 @@ static void kvm_timer_init(void) { int cpu; + max_tsc_khz = tsc_khz; register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpufreq_get_policy(&policy, 
get_cpu()); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; +#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); for_each_online_cpu(cpu) smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } @@ -5752,7 +5794,7 @@ int kvm_arch_hardware_enable(void *garbage) list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) if (vcpu->cpu == smp_processor_id()) - kvm_request_guest_time_update(vcpu); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return kvm_x86_ops->hardware_enable(garbage); } @@ -5803,6 +5845,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); + if (!kvm->arch.virtual_tsc_khz) + kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; -- cgit v1.2.3 From 19b6a85b78a5d4b466c537bdbf0eaecae5e2c4e2 Mon Sep 17 00:00:00 2001 From: Arjan Koers <0h61vkll2ly8@xutrox.com> Date: Mon, 2 Aug 2010 23:35:28 +0200 Subject: KVM guest: Move a printk that's using the clock before it's ready Fix a hang during SMP kernel boot on KVM that showed up after commit 489fb490dbf8dab0249ad82b56688ae3842a79e8 (2.6.35) and 59aab522154a2f17b25335b63c1cf68a51fb6ae0 (2.6.34.1). The problem only occurs when CONFIG_PRINTK_TIME is set. KVM-Stable-Tag. 
Signed-off-by: Arjan Koers <0h61vkll2ly8@xutrox.com> Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index eb9b76c716c2..ca43ce31a19c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -128,13 +128,15 @@ static struct clocksource kvm_clock = { static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); - int low, high; + int low, high, ret; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(msr_kvm_system_time, low, high); + return ret; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3 From 07d6f555d536aad1d74bb8b41dae9385007ecc26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 28 Sep 2010 16:37:42 +0200 Subject: KVM: VMX: Add AX to list of registers clobbered by guest switch By chance this caused no harm so far. We overwrite AX during switch to/from guest context, so we must declare this. 
Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 28c72da93a1b..007be8402efb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4007,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" - , R"bx", R"di", R"si" + , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #endif -- cgit v1.2.3 From 50933623e50d8730cc1a65853c153b3b4c93b629 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 26 Sep 2010 13:00:53 +0200 Subject: KVM: x86: Fix constant type in kvm_get_time_scale Older gcc versions complain about the improper type (for x86-32), 4.5 seems to fix this silently. However, we should better use the right type initially. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfcf8fd5e080..ffcb90669ec5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -930,14 +930,14 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, tps64 = base_khz * 1000LL; scaled64 = scaled_khz * 1000LL; - while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000UL) { + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000UL) { - if (scaled64 & 0xffffffff00000000UL || tps32 & 0x80000000) + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) scaled64 >>= 1; else tps32 <<= 1; -- cgit v1.2.3 From 7129eecac10681f69cb00c0323ee915feceb57eb Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 28 Sep 2010 16:33:32 +0800 Subject: 
KVM: x86 emulator: Eliminate compilation warning in x86_decode_insn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate: arch/x86/kvm/emulate.c:801: warning: ‘sv’ may be used uninitialized in this function on gcc 4.1.2 Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index aead72e141b4..d0df25d84acd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -798,7 +798,7 @@ done: static void fetch_bit_operand(struct decode_cache *c) { - long sv, mask; + long sv = 0, mask; if (c->dst.type == OP_MEM && c->src.type == OP_REG) { mask = ~(c->dst.bytes * 8 - 1); -- cgit v1.2.3 From 6292757fb0e758748fdb441861f8c50d397de9f0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:02:12 +0800 Subject: KVM: MMU: update 'root_hpa' out of loop in PAE shadow path The value of 'vcpu->arch.mmu.pae_root' is not modified, so we can update 'root_hpa' out of the loop.
Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c94c43289f53..363004699012 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2393,8 +2393,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } else BUG(); @@ -2466,8 +2466,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.pae_root[i] = root | pm_mask; - vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); /* * If we shadow a 32 bit page table with a long mode page -- cgit v1.2.3 From 20bd40dc6492da293993559555df07d467fd202e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:03:27 +0800 Subject: KVM: MMU: cleanup for error mask set while walk guest page table Small cleanup for set page fault error code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2bdd843ad63f..a83ff3794055 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -224,9 +224,7 @@ walk: is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - access |= write_fault ? PFERR_WRITE_MASK : 0; - access |= fetch_fault ? PFERR_FETCH_MASK : 0; - access |= user_fault ? 
PFERR_USER_MASK : 0; + access |= write_fault | fetch_fault | user_fault; real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); @@ -268,10 +266,9 @@ error: walker->error_code = 0; if (present) walker->error_code |= PFERR_PRESENT_MASK; - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; + + walker->error_code |= write_fault | user_fault; + if (fetch_fault && mmu->nx) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) @@ -673,9 +670,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, int r; r = FNAME(walk_addr)(&walker, vcpu, vaddr, - !!(access & PFERR_WRITE_MASK), - !!(access & PFERR_USER_MASK), - !!(access & PFERR_FETCH_MASK)); + access & PFERR_WRITE_MASK, + access & PFERR_USER_MASK, + access & PFERR_FETCH_MASK); if (r) { gpa = gfn_to_gpa(walker.gfn); -- cgit v1.2.3 From 33f91edb9211f5c0392071f9eb01958ec69f2193 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:05:00 +0800 Subject: KVM: MMU: set access bit for direct mapping Set the accessed bit while setting up the direct page table if the guest is non-paging or NPT is enabled; this is good for the CPU's speculative accesses. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 363004699012..88203fa4ef05 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2240,7 +2240,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, __set_spte(iterator.sptep, __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + | shadow_user_mask | shadow_x_mask + | shadow_accessed_mask); } } return pt_write; -- cgit v1.2.3 From 98224bf1d1783a25ccede29ab08309424ec8de25 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:06:16 +0800 Subject: KVM: MMU: audit: fix vcpu's spte walking After nested nested
paging, it may use long mode to shadow a 32/PAE paging guest, so this patch fixes it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index bd2b1be7066e..dcca3e7d7b4e 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -51,7 +51,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); -- cgit v1.2.3 From c42fffe3a3aa8c62b8028fff32d18156f5325c3b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:07:07 +0800 Subject: KVM: MMU: audit: unregister audit tracepoints before module unloaded fix: Call Trace: [] ? kvm_mmu_pte_write+0x229/0x911 [kvm] [] ? gfn_to_memslot+0x39/0xa0 [kvm] [] ? mark_page_dirty+0x16/0x2e [kvm] [] ? kvm_write_guest_page+0x67/0x7f [kvm] [] ? local_clock+0x2a/0x3b [] emulator_write_phys+0x46/0x54 [kvm] ...... Code: Bad RIP value.
RIP [] 0xffffffffa0172056 RSP CR2: ffffffffa0172056 Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 88203fa4ef05..afde64ba118d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3355,15 +3355,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) return init_kvm_mmu(vcpu); } -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; @@ -3662,4 +3653,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" +#else +static void mmu_audit_disable(void) { } #endif + +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) +{ + ASSERT(vcpu); + + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); + mmu_audit_disable(); +} -- cgit v1.2.3 From 38904e128778c38809daf44a1dabc7f25fa8d83e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:07:59 +0800 Subject: KVM: MMU: audit: introduce audit_printk to cleanup audit code Introduce audit_printk, and record audit point instead audit name Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu_audit.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index dcca3e7d7b4e..66219afcc91e 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,7 +19,11 @@ #include -static const char *audit_msg; +static int audit_point; + +#define audit_printk(fmt, args...) 
\ + printk(KERN_ERR "audit: (%s) error: " \ + fmt, audit_point_name[audit_point], ##args) typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); @@ -93,21 +97,18 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) if (sp->unsync) { if (level != PT_PAGE_TABLE_LEVEL) { - printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n", - audit_msg, sp, level); + audit_printk("unsync sp: %p level = %d\n", sp, level); return; } if (*sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n", - audit_msg, sp); + audit_printk("notrap spte in unsync sp: %p\n", sp); return; } } if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { - printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n", - audit_msg, sp); + audit_printk("notrap spte in direct sp: %p\n", sp); return; } @@ -124,10 +125,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) hpa = pfn << PAGE_SHIFT; if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - "pfn %llx hpa %llx ent %llxn", - audit_msg, vcpu->arch.mmu.root_level, - pfn, hpa, *sptep); + audit_printk("levels %d pfn %llx hpa %llx ent %llxn", + vcpu->arch.mmu.root_level, pfn, hpa, *sptep); } static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) @@ -143,11 +142,9 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) return; - printk(KERN_ERR "%s: no memslot for gfn %llx\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); + audit_printk("no memslot for gfn %llx\n", gfn); + audit_printk("index %ld of sp (gfn=%llx)\n", + (long int)(sptep - rev_sp->spt), rev_sp->gfn); dump_stack(); return; } @@ -156,8 +153,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (!*rmapp) { if (!printk_ratelimit()) 
return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); + audit_printk("no rmap for writable spte %llx\n", *sptep); dump_stack(); } } @@ -198,10 +194,8 @@ void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) spte = rmap_next(kvm, rmapp, NULL); while (spte) { if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %llx role %x\n", - __func__, audit_msg, sp->gfn, - sp->role.word); + audit_printk("shadow page has writable mappings: gfn " + "%llx role %x\n", sp->gfn, sp->role.word); spte = rmap_next(kvm, rmapp, spte); } } @@ -228,14 +222,14 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) mmu_spte_walk(vcpu, audit_spte); } -static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point) +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!__ratelimit(&ratelimit_state)) return; - audit_msg = audit_point_name[audit_point]; + audit_point = point; audit_all_active_sps(vcpu->kvm); audit_vcpu_spte(vcpu); } -- cgit v1.2.3 From 6903074c367cfb13166c2974d6a886fdc7a00d21 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 27 Sep 2010 18:09:29 +0800 Subject: KVM: MMU: audit: check whether have unsync sps after root sync After root synced, all unsync sps are synced, this patch add a check to make sure it's no unsync sps in VCPU's page table Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 +++++++++-- arch/x86/kvm/mmu_audit.c | 11 ++++++++++- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index afde64ba118d..ba7e7646fb78 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -53,14 +53,18 @@ enum { AUDIT_PRE_PAGE_FAULT, AUDIT_POST_PAGE_FAULT, AUDIT_PRE_PTE_WRITE, - AUDIT_POST_PTE_WRITE + AUDIT_POST_PTE_WRITE, + AUDIT_PRE_SYNC, + AUDIT_POST_SYNC }; char 
*audit_point_name[] = { "pre page fault", "post page fault", "pre pte write", - "post pte write" + "post pte write", + "pre sync", + "post sync" }; #undef MMU_DEBUG @@ -2516,6 +2520,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); @@ -2531,6 +2537,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 66219afcc91e..4aee32c3cf92 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -164,6 +164,14 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) inspect_spte_has_rmap(vcpu->kvm, sptep); } +static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + if (audit_point == AUDIT_POST_SYNC && sp->unsync) + audit_printk("meet unsync sp(%p) after sync root.\n", sp); +} + static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { int i; @@ -179,7 +187,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) } } -void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) +static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memory_slot *slot; unsigned long *rmapp; @@ -215,6 +223,7 @@ static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) { audit_sptes_have_rmaps(vcpu, sptep, level); audit_mappings(vcpu, sptep, level); + audit_spte_after_sync(vcpu, sptep, level); } static void audit_vcpu_spte(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3377078027dc54dc2a5acb2efa09587e7ac1cbd9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 28 Sep 
2010 17:03:14 +0800 Subject: KVM: MMU: move access code parsing to FNAME(walk_addr) function Move access code parsing from caller site to FNAME(walk_addr) function Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a83ff3794055..9a5f7bb5f840 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -116,16 +116,18 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gva_t addr, int write_fault, - int user_fault, int fetch_fault) + gva_t addr, u32 access) { pt_element_t pte; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; bool eperm, present, rsvd_fault; - int offset; - u32 access = 0; + int offset, write_fault, user_fault, fetch_fault; + + write_fault = access & PFERR_WRITE_MASK; + user_fault = access & PFERR_USER_MASK; + fetch_fault = access & PFERR_FETCH_MASK; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); @@ -215,6 +217,7 @@ walk: int lvl = walker->level; gpa_t real_gpa; gfn_t gfn; + u32 ac; gfn = gpte_to_gfn_lvl(pte, lvl); gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -224,10 +227,10 @@ walk: is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - access |= write_fault | fetch_fault | user_fault; + ac = write_fault | fetch_fault | user_fault; real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - access); + ac); if (real_gpa == UNMAPPED_GVA) return 0; @@ -282,21 +285,18 @@ error: } static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) + struct kvm_vcpu *vcpu, gva_t addr, u32 access) { return 
FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, - write_fault, user_fault, fetch_fault); + access); } static int FNAME(walk_addr_nested)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, - int fetch_fault) + u32 access) { return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, - addr, write_fault, user_fault, - fetch_fault); + addr, access); } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -532,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *sptep; int write_pt = 0; @@ -550,8 +549,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * Look up the guest pte for the faulting address. */ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); + r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); /* * The page is not mapped by the guest. Let the guest handle it. 
@@ -669,10 +667,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, - access & PFERR_WRITE_MASK, - access & PFERR_USER_MASK, - access & PFERR_FETCH_MASK); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); @@ -690,10 +685,7 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, - access & PFERR_WRITE_MASK, - access & PFERR_USER_MASK, - access & PFERR_FETCH_MASK); + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); -- cgit v1.2.3 From 7ebaf15eefe7b019def72bd9d4420c7bc51ed69e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 3 Oct 2010 18:51:39 +0200 Subject: KVM: MMU: Avoid sign extension in mmu_alloc_direct_roots() pae root address Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba7e7646fb78..dc1b4fb299b7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2374,7 +2374,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; - int i; + unsigned i; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { spin_lock(&vcpu->kvm->mmu_lock); -- cgit v1.2.3 From 395c6b0a9d56fe7fdb7aeda12795d0eb02475d24 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 4 Oct 2010 12:55:49 +0200 Subject: KVM: Disable interrupts around get_kernel_ns() get_kernel_ns() wants preemption disabled. It doesn't make a lot of sense during the get/set ioctls (no way to make them non-racy) but the callee wants it. 
Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ffcb90669ec5..e96038e1bc3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3469,8 +3469,10 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; + local_irq_disable(); now_ns = get_kernel_ns(); delta = user_ns.clock - now_ns; + local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } @@ -3478,8 +3480,10 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; + local_irq_disable(); now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); user_ns.flags = 0; r = -EFAULT; -- cgit v1.2.3 From 9611c187774f0e20c258c23ced2599c44bd2fef4 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Wed, 6 Oct 2010 14:23:22 +0200 Subject: KVM: fix typo in copyright notice Fix typo in copyright notice. Signed-off-by: Nicolas Kaiser Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/i8259.c | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/mmu_audit.c | 2 +- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/timer.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- virt/kvm/irq_comm.c | 2 +- virt/kvm/kvm_main.c | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d0df25d84acd..38b6e8dafaff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,7 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
* * Avi Kivity * Yaniv Kamay diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 2ad40a4ddc34..efad72385058 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,7 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index dd54c5bb2e5e..f628234fbeca 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index f994da40ad94..7e06ba1618bd 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,7 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c6f2f159384a..82118087d9e9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,7 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
* * Authors: * Dor Laor diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dc1b4fb299b7..eb65b9c5ea40 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 4aee32c3cf92..ba2bcdde6221 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -4,7 +4,7 @@ * Audit code for KVM MMU * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9a5f7bb5f840..cd7a833a3b52 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c929d007696d..82e144a4e514 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,7 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index e16a0dbe74d8..fc7a101c4a35 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -6,7 +6,7 @@ * * timer support * - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 007be8402efb..8da0e45ff7c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,7 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e96038e1bc3a..dcee64e4434f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,7 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 369e38010ad5..8edca9141b78 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -17,7 +17,7 @@ * Authors: * Yaozu (Eddie) Dong * - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. */ #include diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b8499f544e1d..1aeeb7fbe2ef 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5,7 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity -- cgit v1.2.3 From 5854dbca9b235f8cdd414a0961018763d2d5bf77 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 8 Oct 2010 16:24:14 +0800 Subject: KVM: MCE: Add MCG_SER_P into KVM_MCE_CAP_SUPPORTED Now we have MCG_SER_P (and corresponding SRAO/SRAR MCE) support in kernel and QEMU-KVM, the MCG_SER_P should be added into KVM_MCE_CAP_SUPPORTED to make all these code really works. 
Reported-by: Dean Nelson Signed-off-by: Huang Ying Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dcee64e4434f..2e090784863a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -73,7 +73,7 @@ #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) /* EFER defaults: * - enable syscall per default because its emulated by KVM -- cgit v1.2.3 From 77db5cbd29b7cb0e0fb4fd146e7f7ac2831a025a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 8 Oct 2010 16:24:15 +0800 Subject: KVM: MCE: Send SRAR SIGBUS directly Originally, SRAR SIGBUS is sent to QEMU-KVM via touching the poisoned page. But commit 96054569190bdec375fe824e48ca1f4e3b53dd36 prevents the signal from being sent. So now the signal is sent via force_sig_info_fault directly. 
[marcelo: use send_sig_info instead] Reported-by: Dean Nelson Signed-off-by: Huang Ying Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eb65b9c5ea40..908ea5464a51 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2251,22 +2251,24 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } -static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) { - char buf[1]; - void __user *hva; - int r; + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_MCEERR_AR; + info.si_addr = (void __user *)address; + info.si_addr_lsb = PAGE_SHIFT; - /* Touch the page, so send SIGBUS */ - hva = (void __user *)gfn_to_hva(kvm, gfn); - r = copy_from_user(buf, hva, 1); + send_sig_info(SIGBUS, &info, tsk); } static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(kvm, gfn); + kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); return 0; } else if (is_fault_pfn(pfn)) return -EFAULT; -- cgit v1.2.3