author    | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-12 09:32:26 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-12 09:32:26 -0700
commit    | 7f3591cfacf2d79c4f42238e46c7d053da8e020d (patch)
tree      | f2e9ed7b6b0bc176facaa49846734790023a6b16
parent    | 16ffc3eeaa00d513b0076b7b2b96419f28acc912 (diff)
parent    | d1f0132e76a11b05167313c606a853953f416081 (diff)
download  | lwn-7f3591cfacf2d79c4f42238e46c7d053da8e020d.tar.gz
          | lwn-7f3591cfacf2d79c4f42238e46c7d053da8e020d.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (31 commits)
lguest: add support for indirect ring entries
lguest: suppress notifications in example Launcher
lguest: try to batch interrupts on network receive
lguest: avoid sending interrupts to Guest when no activity occurs.
lguest: implement deferred interrupts in example Launcher
lguest: remove obsolete LHREQ_BREAK call
lguest: have example Launcher service all devices in separate threads
lguest: use eventfds for device notification
eventfd: export eventfd_signal and eventfd_fget for lguest
lguest: allow any process to send interrupts
lguest: PAE fixes
lguest: PAE support
lguest: Add support for kvm_hypercall4()
lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD
lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated
lguest: map switcher with executable page table entries
lguest: fix writev returning short on console output
lguest: clean up length-used value in example launcher
lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition.
lguest: beyond ARRAY_SIZE of cpu->arch.gdt
...
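The two commits that reshape the example Launcher most — "use eventfds for device notification" and "have example Launcher service all devices in separate threads" — combine into one pattern: each virtqueue gets a clone()d thread that sleeps in read() on an eventfd until the Guest kicks it. Here is a minimal, self-contained sketch of that pattern; all names are hypothetical, not the Launcher's own (the real code is create_thread()/do_thread() in the lguest.c diff below):

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <sched.h>
#include <signal.h>
#include <sys/eventfd.h>
#include <sys/wait.h>
#include <err.h>

/* Hypothetical per-queue state: just the eventfd to sleep on. */
struct queue {
	int eventfd;
};

/* Thread body: block until the notifier signals, then "service" it. */
static int service_thread(void *_q)
{
	struct queue *q = _q;
	uint64_t ev;

	/* read() blocks until the counter is non-zero, then atomically
	 * resets it and returns the old value. */
	if (read(q->eventfd, &ev, sizeof(ev)) != sizeof(ev))
		err(1, "eventfd read");
	printf("service thread woken, counter was %llu\n",
	       (unsigned long long)ev);
	return 0;
}

int main(void)
{
	struct queue q;
	uint64_t one = 1;
	char *stack = malloc(32768);
	pid_t thread;

	q.eventfd = eventfd(0, 0);
	if (q.eventfd < 0)
		err(1, "eventfd");

	/* CLONE_VM shares our memory with the child, as the Launcher
	 * does; the stack grows down, so pass the top of the region. */
	thread = clone(service_thread, stack + 32768, CLONE_VM | SIGCHLD, &q);
	if (thread == -1)
		err(1, "clone");

	/* The notifier side: a plain 8-byte write bumps the counter
	 * and wakes any reader. */
	if (write(q.eventfd, &one, sizeof(one)) != sizeof(one))
		err(1, "eventfd write");

	waitpid(thread, NULL, 0);
	return 0;
}

In the real Launcher the thread loops forever calling vq->service(vq), and the write comes from the kernel side: the LHREQ_EVENTFD command in the diff attaches the eventfd to a queue's pfn, so the Guest's LHCALL_NOTIFY raises it directly.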
-rw-r--r-- | Documentation/lguest/Makefile         |    3
-rw-r--r-- | Documentation/lguest/lguest.c         | 1008
-rw-r--r-- | Documentation/lguest/lguest.txt       |    1
-rw-r--r-- | arch/x86/include/asm/lguest.h         |    7
-rw-r--r-- | arch/x86/include/asm/lguest_hcall.h   |   15
-rw-r--r-- | arch/x86/kernel/asm-offsets_32.c      |    1
-rw-r--r-- | arch/x86/lguest/Kconfig               |    1
-rw-r--r-- | arch/x86/lguest/boot.c                |  158
-rw-r--r-- | arch/x86/lguest/i386_head.S           |   60
-rw-r--r-- | drivers/lguest/Kconfig                |    2
-rw-r--r-- | drivers/lguest/core.c                 |   30
-rw-r--r-- | drivers/lguest/hypercalls.c           |   14
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c |   57
-rw-r--r-- | drivers/lguest/lg.h                   |   28
-rw-r--r-- | drivers/lguest/lguest_user.c          |  127
-rw-r--r-- | drivers/lguest/page_tables.c          |  396
-rw-r--r-- | drivers/lguest/segments.c             |    2
-rw-r--r-- | fs/eventfd.c                          |    3
-rw-r--r-- | include/linux/lguest.h                |    4
-rw-r--r-- | include/linux/lguest_launcher.h       |    3
-rw-r--r-- | kernel/sched.c                        |    1
21 files changed, 1103 insertions, 818 deletions
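Most of those 1103 insertions land in Documentation/lguest/lguest.c, whose new wait_for_vq_desc() also has to cope with the indirect ring entries added by the first commit above. A simplified sketch of the chain walk it performs — assuming identity-mapped guest memory, where the real code validates every address through check_pointer() — looks like this:

#include <stdio.h>
#include <stdint.h>
#include <err.h>

/* Simplified vring descriptor, as in <linux/virtio_ring.h>. */
struct vring_desc {
	uint64_t addr;
	uint32_t len;
	uint16_t flags;
	uint16_t next;
};

#define VRING_DESC_F_NEXT     1
#define VRING_DESC_F_WRITE    2
#define VRING_DESC_F_INDIRECT 4

/* Count input/output buffers in the chain starting at `head`.
 * `max` is the ring size; an indirect entry swaps in its own table. */
static void walk_chain(struct vring_desc *desc, unsigned head, unsigned max,
		       unsigned *out, unsigned *in)
{
	unsigned i = head;

	/* An indirect descriptor points at a guest-supplied table which
	 * is walked exactly like a normal chain, starting at entry 0. */
	if (desc[i].flags & VRING_DESC_F_INDIRECT) {
		if (desc[i].len % sizeof(struct vring_desc))
			errx(1, "bad indirect table size");
		max = desc[i].len / sizeof(struct vring_desc);
		desc = (struct vring_desc *)(uintptr_t)desc[i].addr;
		i = 0;
	}

	*out = *in = 0;
	for (;;) {
		if (desc[i].flags & VRING_DESC_F_WRITE)
			(*in)++;	/* device writes it: input buffer */
		else
			(*out)++;	/* device reads it: output buffer */

		/* More entries than the ring holds means a cycle. */
		if (*out + *in > max)
			errx(1, "looped descriptor chain");

		if (!(desc[i].flags & VRING_DESC_F_NEXT))
			break;
		i = desc[i].next;
		if (i >= max)
			errx(1, "desc next %u out of range", i);
	}
}

int main(void)
{
	struct vring_desc ring[8] = {
		[0] = { .addr = 0x1000, .len = 16,
			.flags = VRING_DESC_F_NEXT, .next = 1 },
		[1] = { .addr = 0x2000, .len = 64,
			.flags = VRING_DESC_F_WRITE },
	};
	unsigned out, in;

	walk_chain(ring, 0, 8, &out, &in);
	printf("%u out, %u in\n", out, in);	/* prints: 1 out, 1 in */
	return 0;
}

The ordering convention matters: a chain is all output descriptors followed by all input ones, which is why the real code errors out when it finds an input descriptor before the outputs are done.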
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index 1f4f9e888bd1..28c8cdfcafd8 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile @@ -1,6 +1,5 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. -CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE -LDLIBS:=-lz +CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE all: lguest diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index d36fcc0f2715..9ebcd6ef361b 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -16,6 +16,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <sys/wait.h> +#include <sys/eventfd.h> #include <fcntl.h> #include <stdbool.h> #include <errno.h> @@ -59,7 +60,6 @@ typedef uint8_t u8; /*:*/ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ -#define NET_PEERNUM 1 #define BRIDGE_PFX "bridge:" #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ @@ -76,19 +76,12 @@ static bool verbose; do { if (verbose) printf(args); } while(0) /*:*/ -/* File descriptors for the Waker. */ -struct { - int pipe[2]; - int lguest_fd; -} waker_fds; - /* The pointer to the start of guest memory. */ static void *guest_base; /* The maximum guest physical address allowed, and maximum possible. */ static unsigned long guest_limit, guest_max; -/* The pipe for signal hander to write to. */ -static int timeoutpipe[2]; -static unsigned int timeout_usec = 500; +/* The /dev/lguest file descriptor. */ +static int lguest_fd; /* a per-cpu variable indicating whose vcpu is currently running */ static unsigned int __thread cpu_id; @@ -96,11 +89,6 @@ static unsigned int __thread cpu_id; /* This is our list of devices. */ struct device_list { - /* Summary information about the devices in our list: ready to pass to - * select() to ask which need servicing.*/ - fd_set infds; - int max_infd; - /* Counter to assign interrupt numbers. */ unsigned int next_irq; @@ -126,22 +114,21 @@ struct device /* The linked-list pointer. */ struct device *next; - /* The this device's descriptor, as mapped into the Guest. */ + /* The device's descriptor, as mapped into the Guest. */ struct lguest_device_desc *desc; + /* We can't trust desc values once Guest has booted: we use these. */ + unsigned int feature_len; + unsigned int num_vq; + /* The name of this device, for --verbose. */ const char *name; - /* If handle_input is set, it wants to be called when this file - * descriptor is ready. */ - int fd; - bool (*handle_input)(int fd, struct device *me); - /* Any queues attached to this device */ struct virtqueue *vq; - /* Handle status being finalized (ie. feature bits stable). */ - void (*ready)(struct device *me); + /* Is it operational */ + bool running; /* Device-specific data. */ void *priv; @@ -164,22 +151,28 @@ struct virtqueue /* Last available index we saw. */ u16 last_avail_idx; - /* The routine to call when the Guest pings us, or timeout. */ - void (*handle_output)(int fd, struct virtqueue *me, bool timeout); + /* How many are used since we sent last irq? */ + unsigned int pending_used; - /* Outstanding buffers */ - unsigned int inflight; + /* Eventfd where Guest notifications arrive. */ + int eventfd; - /* Is this blocked awaiting a timer? */ - bool blocked; + /* Function for the thread which is servicing this virtqueue. 
*/ + void (*service)(struct virtqueue *vq); + pid_t thread; }; /* Remember the arguments to the program so we can "reboot" */ static char **main_args; -/* Since guest is UP and we don't run at the same time, we don't need barriers. - * But I include them in the code in case others copy it. */ -#define wmb() +/* The original tty settings to restore on exit. */ +static struct termios orig_term; + +/* We have to be careful with barriers: our devices are all run in separate + * threads and so we need to make sure that changes visible to the Guest happen + * in precise order. */ +#define wmb() __asm__ __volatile__("" : : : "memory") +#define mb() __asm__ __volatile__("" : : : "memory") /* Convert an iovec element to the given type. * @@ -245,7 +238,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) static u8 *get_feature_bits(struct device *dev) { return (u8 *)(dev->desc + 1) - + dev->desc->num_vq * sizeof(struct lguest_vqconfig); + + dev->num_vq * sizeof(struct lguest_vqconfig); } /*L:100 The Launcher code itself takes us out into userspace, that scary place @@ -505,99 +498,19 @@ static void concat(char *dst, char *args[]) * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the * entry point for the Guest. */ -static int tell_kernel(unsigned long start) +static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, guest_limit / getpagesize(), start }; - int fd; - verbose("Guest: %p - %p (%#lx)\n", guest_base, guest_base + guest_limit, guest_limit); - fd = open_or_die("/dev/lguest", O_RDWR); - if (write(fd, args, sizeof(args)) < 0) + lguest_fd = open_or_die("/dev/lguest", O_RDWR); + if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Writing to /dev/lguest"); - - /* We return the /dev/lguest file descriptor to control this Guest */ - return fd; } /*:*/ -static void add_device_fd(int fd) -{ - FD_SET(fd, &devices.infds); - if (fd > devices.max_infd) - devices.max_infd = fd; -} - -/*L:200 - * The Waker. - * - * With console, block and network devices, we can have lots of input which we - * need to process. We could try to tell the kernel what file descriptors to - * watch, but handing a file descriptor mask through to the kernel is fairly - * icky. - * - * Instead, we clone off a thread which watches the file descriptors and writes - * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host - * stop running the Guest. This causes the Launcher to return from the - * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset - * the LHREQ_BREAK and wake us up again. - * - * This, of course, is merely a different *kind* of icky. - * - * Given my well-known antipathy to threads, I'd prefer to use processes. But - * it's easier to share Guest memory with threads, and trivial to share the - * devices.infds as the Launcher changes it. - */ -static int waker(void *unused) -{ - /* Close the write end of the pipe: only the Launcher has it open. */ - close(waker_fds.pipe[1]); - - for (;;) { - fd_set rfds = devices.infds; - unsigned long args[] = { LHREQ_BREAK, 1 }; - unsigned int maxfd = devices.max_infd; - - /* We also listen to the pipe from the Launcher. */ - FD_SET(waker_fds.pipe[0], &rfds); - if (waker_fds.pipe[0] > maxfd) - maxfd = waker_fds.pipe[0]; - - /* Wait until input is ready from one of the devices. 
*/ - select(maxfd+1, &rfds, NULL, NULL, NULL); - - /* Message from Launcher? */ - if (FD_ISSET(waker_fds.pipe[0], &rfds)) { - char c; - /* If this fails, then assume Launcher has exited. - * Don't do anything on exit: we're just a thread! */ - if (read(waker_fds.pipe[0], &c, 1) != 1) - _exit(0); - continue; - } - - /* Send LHREQ_BREAK command to snap the Launcher out of it. */ - pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); - } - return 0; -} - -/* This routine just sets up a pipe to the Waker process. */ -static void setup_waker(int lguest_fd) -{ - /* This pipe is closed when Launcher dies, telling Waker. */ - if (pipe(waker_fds.pipe) != 0) - err(1, "Creating pipe for Waker"); - - /* Waker also needs to know the lguest fd */ - waker_fds.lguest_fd = lguest_fd; - - if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) - err(1, "Creating Waker"); -} - /* * Device Handling. * @@ -623,49 +536,90 @@ static void *_check_pointer(unsigned long addr, unsigned int size, /* Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're * at the end. */ -static unsigned next_desc(struct virtqueue *vq, unsigned int i) +static unsigned next_desc(struct vring_desc *desc, + unsigned int i, unsigned int max) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ - if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) - return vq->vring.num; + if (!(desc[i].flags & VRING_DESC_F_NEXT)) + return max; /* Check they're not leading us off end of descriptors. */ - next = vq->vring.desc[i].next; + next = desc[i].next; /* Make sure compiler knows to grab that: we don't want it changing! */ wmb(); - if (next >= vq->vring.num) + if (next >= max) errx(1, "Desc next is %u", next); return next; } +/* This actually sends the interrupt for this virtqueue */ +static void trigger_irq(struct virtqueue *vq) +{ + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + /* Don't inform them if nothing used. */ + if (!vq->pending_used) + return; + vq->pending_used = 0; + + /* If they don't want an interrupt, don't send one, unless empty. */ + if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + && lg_last_avail(vq) != vq->vring.avail->idx) + return; + + /* Send the Guest an interrupt tell them we used something up. */ + if (write(lguest_fd, buf, sizeof(buf)) != 0) + err(1, "Triggering irq %i", vq->config.irq); +} + /* This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found, or vq->vring.num (which - * is never a valid descriptor number) if none was found. */ -static unsigned get_vq_desc(struct virtqueue *vq, - struct iovec iov[], - unsigned int *out_num, unsigned int *in_num) + * This function returns the descriptor number found. */ +static unsigned wait_for_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) { - unsigned int i, head; - u16 last_avail; + unsigned int i, head, max; + struct vring_desc *desc; + u16 last_avail = lg_last_avail(vq); + + while (last_avail == vq->vring.avail->idx) { + u64 event; + + /* OK, tell Guest about progress up to now. */ + trigger_irq(vq); + + /* OK, now we need to know about added descriptors. 
*/ + vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + mb(); + if (last_avail != vq->vring.avail->idx) { + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + break; + } + + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) + errx(1, "Event read failed?"); + + /* We don't need to be notified again. */ + vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; + } /* Check it isn't doing very strange things with descriptor numbers. */ - last_avail = lg_last_avail(vq); if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); - /* If there's nothing new since last we looked, return invalid. */ - if (vq->vring.avail->idx == last_avail) - return vq->vring.num; - /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; @@ -678,15 +632,28 @@ static unsigned get_vq_desc(struct virtqueue *vq, /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; + max = vq->vring.num; + desc = vq->vring.desc; i = head; + + /* If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. */ + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) + errx(1, "Invalid size for indirect buffer table"); + + max = desc[i].len / sizeof(struct vring_desc); + desc = check_pointer(desc[i].addr, desc[i].len); + i = 0; + } + do { /* Grab the first descriptor, and check it's OK. */ - iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; + iov[*out_num + *in_num].iov_len = desc[i].len; iov[*out_num + *in_num].iov_base - = check_pointer(vq->vring.desc[i].addr, - vq->vring.desc[i].len); + = check_pointer(desc[i].addr, desc[i].len); /* If this is an input descriptor, increment that count. */ - if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) + if (desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { /* If it's an output descriptor, they're all supposed @@ -697,11 +664,10 @@ static unsigned get_vq_desc(struct virtqueue *vq, } /* If we've got too many, that implies a descriptor loop. */ - if (*out_num + *in_num > vq->vring.num) + if (*out_num + *in_num > max) errx(1, "Looped descriptor"); - } while ((i = next_desc(vq, i)) != vq->vring.num); + } while ((i = next_desc(desc, i, max)) != max); - vq->inflight++; return head; } @@ -719,44 +685,20 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) /* Make sure buffer is written before we update index. */ wmb(); vq->vring.used->idx++; - vq->inflight--; -} - -/* This actually sends the interrupt for this virtqueue */ -static void trigger_irq(int fd, struct virtqueue *vq) -{ - unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; - - /* If they don't want an interrupt, don't send one, unless empty. */ - if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) - && vq->inflight) - return; - - /* Send the Guest an interrupt tell them we used something up. */ - if (write(fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", vq->config.irq); + vq->pending_used++; } /* And here's the combo meal deal. Supersize me! 
*/ -static void add_used_and_trigger(int fd, struct virtqueue *vq, - unsigned int head, int len) +static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) { add_used(vq, head, len); - trigger_irq(fd, vq); + trigger_irq(vq); } /* * The Console * - * Here is the input terminal setting we save, and the routine to restore them - * on exit so the user gets their terminal back. */ -static struct termios orig_term; -static void restore_term(void) -{ - tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); -} - -/* We associate some data with the console for our exit hack. */ + * We associate some data with the console for our exit hack. */ struct console_abort { /* How many times have they hit ^C? */ @@ -766,276 +708,275 @@ struct console_abort }; /* This is the routine which handles console input (ie. stdin). */ -static bool handle_console_input(int fd, struct device *dev) +static void console_input(struct virtqueue *vq) { int len; unsigned int head, in_num, out_num; - struct iovec iov[dev->vq->vring.num]; - struct console_abort *abort = dev->priv; - - /* First we need a console buffer from the Guests's input virtqueue. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - - /* If they're not ready for input, stop listening to this file - * descriptor. We'll start again once they add an input buffer. */ - if (head == dev->vq->vring.num) - return false; + struct console_abort *abort = vq->dev->priv; + struct iovec iov[vq->vring.num]; + /* Make sure there's a descriptor waiting. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* This is why we convert to iovecs: the readv() call uses them, and so - * it reads straight into the Guest's buffer. */ - len = readv(dev->fd, iov, in_num); + /* Read it in. */ + len = readv(STDIN_FILENO, iov, in_num); if (len <= 0) { - /* This implies that the console is closed, is /dev/null, or - * something went terribly wrong. */ + /* Ran out of input? */ warnx("Failed to get console input, ignoring console."); - /* Put the input terminal back. */ - restore_term(); - /* Remove callback from input vq, so it doesn't restart us. */ - dev->vq->handle_output = NULL; - /* Stop listening to this fd: don't call us again. */ - return false; + /* For simplicity, dying threads kill the whole Launcher. So + * just nap here. */ + for (;;) + pause(); } - /* Tell the Guest about the new input. */ - add_used_and_trigger(fd, dev->vq, head, len); + add_used_and_trigger(vq, head, len); /* Three ^C within one second? Exit. * - * This is such a hack, but works surprisingly well. Each ^C has to be - * in a buffer by itself, so they can't be too fast. But we check that - * we get three within about a second, so they can't be too slow. */ - if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { - if (!abort->count++) - gettimeofday(&abort->start, NULL); - else if (abort->count == 3) { - struct timeval now; - gettimeofday(&now, NULL); - if (now.tv_sec <= abort->start.tv_sec+1) { - unsigned long args[] = { LHREQ_BREAK, 0 }; - /* Close the fd so Waker will know it has to - * exit. */ - close(waker_fds.pipe[1]); - /* Just in case Waker is blocked in BREAK, send - * unbreak now. */ - write(fd, args, sizeof(args)); - exit(2); - } - abort->count = 0; - } - } else - /* Any other key resets the abort counter. */ + * This is such a hack, but works surprisingly well. Each ^C has to + * be in a buffer by itself, so they can't be too fast. 
But we check + * that we get three within about a second, so they can't be too + * slow. */ + if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { abort->count = 0; + return; + } - /* Everything went OK! */ - return true; + abort->count++; + if (abort->count == 1) + gettimeofday(&abort->start, NULL); + else if (abort->count == 3) { + struct timeval now; + gettimeofday(&now, NULL); + /* Kill all Launcher processes with SIGINT, like normal ^C */ + if (now.tv_sec <= abort->start.tv_sec+1) + kill(0, SIGINT); + abort->count = 0; + } } -/* Handling output for console is simple: we just get all the output buffers - * and write them to stdout. */ -static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) +/* This is the routine which handles console output (ie. stdout). */ +static void console_output(struct virtqueue *vq) { unsigned int head, out, in; - int len; struct iovec iov[vq->vring.num]; - /* Keep getting output buffers from the Guest until we run out. */ - while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { - if (in) - errx(1, "Input buffers in output queue?"); - len = writev(STDOUT_FILENO, iov, out); - add_used_and_trigger(fd, vq, head, len); + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in console output queue?"); + while (!iov_empty(iov, out)) { + int len = writev(STDOUT_FILENO, iov, out); + if (len <= 0) + err(1, "Write to stdout gave %i", len); + iov_consume(iov, out, len); } -} - -/* This is called when we no longer want to hear about Guest changes to a - * virtqueue. This is more efficient in high-traffic cases, but it means we - * have to set a timer to check if any more changes have occurred. */ -static void block_vq(struct virtqueue *vq) -{ - struct itimerval itm; - - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - vq->blocked = true; - - itm.it_interval.tv_sec = 0; - itm.it_interval.tv_usec = 0; - itm.it_value.tv_sec = 0; - itm.it_value.tv_usec = timeout_usec; - - setitimer(ITIMER_REAL, &itm, NULL); + add_used(vq, head, 0); } /* * The Network * * Handling output for network is also simple: we get all the output buffers - * and write them (ignoring the first element) to this device's file descriptor - * (/dev/net/tun). + * and write them to /dev/net/tun. */ -static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) +struct net_info { + int tunfd; +}; + +static void net_output(struct virtqueue *vq) { - unsigned int head, out, in, num = 0; - int len; + struct net_info *net_info = vq->dev->priv; + unsigned int head, out, in; struct iovec iov[vq->vring.num]; - static int last_timeout_num; - - /* Keep getting output buffers from the Guest until we run out. */ - while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { - if (in) - errx(1, "Input buffers in output queue?"); - len = writev(vq->dev->fd, iov, out); - if (len < 0) - err(1, "Writing network packet to tun"); - add_used_and_trigger(fd, vq, head, len); - num++; - } - /* Block further kicks and set up a timer if we saw anything. */ - if (!timeout && num) - block_vq(vq); - - /* We never quite know how long should we wait before we check the - * queue again for more packets. We start at 500 microseconds, and if - * we get fewer packets than last time, we assume we made the timeout - * too small and increase it by 10 microseconds. Otherwise, we drop it - * by one microsecond every time. It seems to work well enough. 
*/ - if (timeout) { - if (num < last_timeout_num) - timeout_usec += 10; - else if (timeout_usec > 1) - timeout_usec--; - last_timeout_num = num; - } + head = wait_for_vq_desc(vq, iov, &out, &in); + if (in) + errx(1, "Input buffers in net output queue?"); + if (writev(net_info->tunfd, iov, out) < 0) + errx(1, "Write to tun failed?"); + add_used(vq, head, 0); +} + +/* Will reading from this file descriptor block? */ +static bool will_block(int fd) +{ + fd_set fdset; + struct timeval zero = { 0, 0 }; + FD_ZERO(&fdset); + FD_SET(fd, &fdset); + return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This is where we handle a packet coming in from the tun device to our +/* This is where we handle packets coming in from the tun device to our * Guest. */ -static bool handle_tun_input(int fd, struct device *dev) +static void net_input(struct virtqueue *vq) { - unsigned int head, in_num, out_num; int len; - struct iovec iov[dev->vq->vring.num]; - - /* First we need a network buffer from the Guests's recv virtqueue. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - if (head == dev->vq->vring.num) { - /* Now, it's expected that if we try to send a packet too - * early, the Guest won't be ready yet. Wait until the device - * status says it's ready. */ - /* FIXME: Actually want DRIVER_ACTIVE here. */ - - /* Now tell it we want to know if new things appear. */ - dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - wmb(); - - /* We'll turn this back on if input buffers are registered. */ - return false; - } else if (out_num) - errx(1, "Output buffers in network recv queue?"); - - /* Read the packet from the device directly into the Guest's buffer. */ - len = readv(dev->fd, iov, in_num); - if (len <= 0) - err(1, "reading network"); + unsigned int head, out, in; + struct iovec iov[vq->vring.num]; + struct net_info *net_info = vq->dev->priv; - /* Tell the Guest about the new packet. */ - add_used_and_trigger(fd, dev->vq, head, len); + head = wait_for_vq_desc(vq, iov, &out, &in); + if (out) + errx(1, "Output buffers in net input queue?"); - verbose("tun input packet len %i [%02x %02x] (%s)\n", len, - ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], - head != dev->vq->vring.num ? "sent" : "discarded"); + /* Deliver interrupt now, since we're about to sleep. */ + if (vq->pending_used && will_block(net_info->tunfd)) + trigger_irq(vq); - /* All good. */ - return true; + len = readv(net_info->tunfd, iov, in); + if (len <= 0) + err(1, "Failed to read from tun."); + add_used(vq, head, len); } -/*L:215 This is the callback attached to the network and console input - * virtqueues: it ensures we try again, in case we stopped console or net - * delivery because Guest didn't have any buffers. */ -static void enable_fd(int fd, struct virtqueue *vq, bool timeout) +/* This is the helper to create threads. */ +static int do_thread(void *_vq) { - add_device_fd(vq->dev->fd); - /* Snap the Waker out of its select loop. */ - write(waker_fds.pipe[1], "", 1); + struct virtqueue *vq = _vq; + + for (;;) + vq->service(vq); + return 0; } -static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) +/* When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! */ +static void kill_launcher(int signal) { - /* We don't need to know again when Guest refills receive buffer. 
*/ - vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; - enable_fd(fd, vq, timeout); + kill(0, SIGTERM); } -/* When the Guest tells us they updated the status field, we handle it. */ -static void update_device_status(struct device *dev) +static void reset_device(struct device *dev) { struct virtqueue *vq; - /* This is a reset. */ - if (dev->desc->status == 0) { - verbose("Resetting device %s\n", dev->name); + verbose("Resetting device %s\n", dev->name); - /* Clear any features they've acked. */ - memset(get_feature_bits(dev) + dev->desc->feature_len, 0, - dev->desc->feature_len); + /* Clear any features they've acked. */ + memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); - /* Zero out the virtqueues. */ - for (vq = dev->vq; vq; vq = vq->next) { - memset(vq->vring.desc, 0, - vring_size(vq->config.num, LGUEST_VRING_ALIGN)); - lg_last_avail(vq) = 0; + /* We're going to be explicitly killing threads, so ignore them. */ + signal(SIGCHLD, SIG_IGN); + + /* Zero out the virtqueues, get rid of their threads */ + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->thread != (pid_t)-1) { + kill(vq->thread, SIGTERM); + waitpid(vq->thread, NULL, 0); + vq->thread = (pid_t)-1; } - } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { + memset(vq->vring.desc, 0, + vring_size(vq->config.num, LGUEST_VRING_ALIGN)); + lg_last_avail(vq) = 0; + } + dev->running = false; + + /* Now we care if threads die. */ + signal(SIGCHLD, (void *)kill_launcher); +} + +static void create_thread(struct virtqueue *vq) +{ + /* Create stack for thread and run it. Since stack grows + * upwards, we point the stack pointer to the end of this + * region. */ + char *stack = malloc(32768); + unsigned long args[] = { LHREQ_EVENTFD, + vq->config.pfn*getpagesize(), 0 }; + + /* Create a zero-initialized eventfd. */ + vq->eventfd = eventfd(0, 0); + if (vq->eventfd < 0) + err(1, "Creating eventfd"); + args[2] = vq->eventfd; + + /* Attach an eventfd to this virtqueue: it will go off + * when the Guest does an LHCALL_NOTIFY for this vq. */ + if (write(lguest_fd, &args, sizeof(args)) != 0) + err(1, "Attaching eventfd"); + + /* CLONE_VM: because it has to access the Guest memory, and + * SIGCHLD so we get a signal if it dies. */ + vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); + if (vq->thread == (pid_t)-1) + err(1, "Creating clone"); + /* We close our local copy, now the child has it. */ + close(vq->eventfd); +} + +static void start_device(struct device *dev) +{ + unsigned int i; + struct virtqueue *vq; + + verbose("Device %s OK: offered", dev->name); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev)[i]); + verbose(", accepted"); + for (i = 0; i < dev->feature_len; i++) + verbose(" %02x", get_feature_bits(dev) + [dev->feature_len+i]); + + for (vq = dev->vq; vq; vq = vq->next) { + if (vq->service) + create_thread(vq); + } + dev->running = true; +} + +static void cleanup_devices(void) +{ + struct device *dev; + + for (dev = devices.dev; dev; dev = dev->next) + reset_device(dev); + + /* If we saved off the original terminal settings, restore them now. */ + if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) + tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); +} + +/* When the Guest tells us they updated the status field, we handle it. */ +static void update_device_status(struct device *dev) +{ + /* A zero status is a reset, otherwise it's a set of flags. 
*/ + if (dev->desc->status == 0) + reset_device(dev); + else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { warnx("Device %s configuration FAILED", dev->name); + if (dev->running) + reset_device(dev); } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { - unsigned int i; - - verbose("Device %s OK: offered", dev->name); - for (i = 0; i < dev->desc->feature_len; i++) - verbose(" %02x", get_feature_bits(dev)[i]); - verbose(", accepted"); - for (i = 0; i < dev->desc->feature_len; i++) - verbose(" %02x", get_feature_bits(dev) - [dev->desc->feature_len+i]); - - if (dev->ready) - dev->ready(dev); + if (!dev->running) + start_device(dev); } } /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ -static void handle_output(int fd, unsigned long addr) +static void handle_output(unsigned long addr) { struct device *i; - struct virtqueue *vq; - /* Check each device and virtqueue. */ + /* Check each device. */ for (i = devices.dev; i; i = i->next) { + struct virtqueue *vq; + /* Notifications to device descriptors update device status. */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Notifications to virtqueues mean output has occurred. */ + /* Devices *can* be used before status is set to DRIVER_OK. */ for (vq = i->vq; vq; vq = vq->next) { - if (vq->config.pfn != addr/getpagesize()) + if (addr != vq->config.pfn*getpagesize()) continue; - - /* Guest should acknowledge (and set features!) before - * using the device. */ - if (i->desc->status == 0) { - warnx("%s gave early output", i->name); - return; - } - - if (strcmp(vq->dev->name, "console") != 0) - verbose("Output to %s\n", vq->dev->name); - if (vq->handle_output) - vq->handle_output(fd, vq, false); + if (i->running) + errx(1, "Notification on running %s", i->name); + start_device(i); return; } } @@ -1049,71 +990,6 @@ static void handle_output(int fd, unsigned long addr) strnlen(from_guest_phys(addr), guest_limit - addr)); } -static void handle_timeout(int fd) -{ - char buf[32]; - struct device *i; - struct virtqueue *vq; - - /* Clear the pipe */ - read(timeoutpipe[0], buf, sizeof(buf)); - - /* Check each device and virtqueue: flush blocked ones. */ - for (i = devices.dev; i; i = i->next) { - for (vq = i->vq; vq; vq = vq->next) { - if (!vq->blocked) - continue; - - vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - vq->blocked = false; - if (vq->handle_output) - vq->handle_output(fd, vq, true); - } - } -} - -/* This is called when the Waker wakes us up: check for incoming file - * descriptors. */ -static void handle_input(int fd) -{ - /* select() wants a zeroed timeval to mean "don't wait". */ - struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; - - for (;;) { - struct device *i; - fd_set fds = devices.infds; - int num; - - num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); - /* Could get interrupted */ - if (num < 0) - continue; - /* If nothing is ready, we're done. */ - if (num == 0) - break; - - /* Otherwise, call the device(s) which have readable file - * descriptors and a method of handling them. */ - for (i = devices.dev; i; i = i->next) { - if (i->handle_input && FD_ISSET(i->fd, &fds)) { - if (i->handle_input(fd, i)) - continue; - - /* If handle_input() returns false, it means we - * should no longer service it. Networking and - * console do this when there's no input - * buffers to deliver into. Console also uses - * it when it discovers that stdin is closed. */ - FD_CLR(i->fd, &devices.infds); - } - } - - /* Is this the timeout fd? 
*/ - if (FD_ISSET(timeoutpipe[0], &fds)) - handle_timeout(fd); - } -} - /*L:190 * Device Setup * @@ -1129,8 +1005,8 @@ static void handle_input(int fd) static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) - + dev->desc->num_vq * sizeof(struct lguest_vqconfig) - + dev->desc->feature_len * 2; + + dev->num_vq * sizeof(struct lguest_vqconfig) + + dev->feature_len * 2; } /* This routine allocates a new "struct lguest_device_desc" from descriptor @@ -1159,7 +1035,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) /* Each device descriptor is followed by the description of its virtqueues. We * specify how many descriptors the virtqueue is to have. */ static void add_virtqueue(struct device *dev, unsigned int num_descs, - void (*handle_output)(int, struct virtqueue *, bool)) + void (*service)(struct virtqueue *)) { unsigned int pages; struct virtqueue **i, *vq = malloc(sizeof(*vq)); @@ -1174,8 +1050,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; - vq->inflight = 0; - vq->blocked = false; + vq->service = service; + vq->thread = (pid_t)-1; /* Initialize the configuration. */ vq->config.num = num_descs; @@ -1191,6 +1067,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * yet, otherwise we'd be overwriting them. */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); + dev->num_vq++; dev->desc->num_vq++; verbose("Virtqueue page %#lx\n", to_guest_phys(p)); @@ -1199,15 +1076,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, * second. */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; - - /* Set the routine to call when the Guest does something to this - * virtqueue. */ - vq->handle_output = handle_output; - - /* As an optimization, set the advisory "Don't Notify Me" flag if we - * don't have a handler */ - if (!handle_output) - vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; } /* The first half of the feature bitmask is for us to advertise features. The @@ -1219,7 +1087,7 @@ static void add_feature(struct device *dev, unsigned bit) /* We can't extend the feature bits once we've added config bytes */ if (dev->desc->feature_len <= bit / CHAR_BIT) { assert(dev->desc->config_len == 0); - dev->desc->feature_len = (bit / CHAR_BIT) + 1; + dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; } features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); @@ -1243,22 +1111,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf) * calling new_dev_desc() to allocate the descriptor and device memory. * * See what I mean about userspace being boring? */ -static struct device *new_device(const char *name, u16 type, int fd, - bool (*handle_input)(int, struct device *)) +static struct device *new_device(const char *name, u16 type) { struct device *dev = malloc(sizeof(*dev)); /* Now we populate the fields one at a time. */ - dev->fd = fd; - /* If we have an input handler for this file descriptor, then we add it - * to the device_list's fdset and maxfd. */ - if (handle_input) - add_device_fd(dev->fd); dev->desc = new_dev_desc(type); - dev->handle_input = handle_input; dev->name = name; dev->vq = NULL; - dev->ready = NULL; + dev->feature_len = 0; + dev->num_vq = 0; + dev->running = false; /* Append to device list. 
Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus @@ -1286,13 +1149,10 @@ static void setup_console(void) * raw input stream to the Guest. */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); - /* If we exit gracefully, the original settings will be - * restored so the user can see what they're typing. */ - atexit(restore_term); } - dev = new_device("console", VIRTIO_ID_CONSOLE, - STDIN_FILENO, handle_console_input); + dev = new_device("console", VIRTIO_ID_CONSOLE); + /* We store the console state in dev->priv, and initialize it. */ dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; @@ -1301,31 +1161,13 @@ static void setup_console(void) * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to * stdout. */ - add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); - add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); + add_virtqueue(dev, VIRTQUEUE_NUM, console_input); + add_virtqueue(dev, VIRTQUEUE_NUM, console_output); - verbose("device %u: console\n", devices.device_num++); + verbose("device %u: console\n", ++devices.device_num); } /*:*/ -static void timeout_alarm(int sig) -{ - write(timeoutpipe[1], "", 1); -} - -static void setup_timeout(void) -{ - if (pipe(timeoutpipe) != 0) - err(1, "Creating timeout pipe"); - - if (fcntl(timeoutpipe[1], F_SETFL, - fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) - err(1, "Making timeout pipe nonblocking"); - - add_device_fd(timeoutpipe[0]); - signal(SIGALRM, timeout_alarm); -} - /*M:010 Inter-guest networking is an interesting area. Simplest is to have a * --sharenet=<name> option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. @@ -1447,21 +1289,23 @@ static int get_tun_device(char tapif[IFNAMSIZ]) static void setup_tun_net(char *arg) { struct device *dev; - int netfd, ipfd; + struct net_info *net_info = malloc(sizeof(*net_info)); + int ipfd; u32 ip = INADDR_ANY; bool bridging = false; char tapif[IFNAMSIZ], *p; struct virtio_net_config conf; - netfd = get_tun_device(tapif); + net_info->tunfd = get_tun_device(tapif); /* First we create a new network device. */ - dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); + dev = new_device("net", VIRTIO_ID_NET); + dev->priv = net_info; /* Network devices need a receive and a send queue, just like * console. */ - add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); - add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); + add_virtqueue(dev, VIRTQUEUE_NUM, net_input); + add_virtqueue(dev, VIRTQUEUE_NUM, net_output); /* We need a socket to perform the magic network ioctls to bring up the * tap interface, connect to the bridge etc. Any socket will do! */ @@ -1502,6 +1346,8 @@ static void setup_tun_net(char *arg) add_feature(dev, VIRTIO_NET_F_HOST_TSO4); add_feature(dev, VIRTIO_NET_F_HOST_TSO6); add_feature(dev, VIRTIO_NET_F_HOST_ECN); + /* We handle indirect ring entries */ + add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); set_config(dev, sizeof(conf), &conf); /* We don't need the socket any more; setup is done. */ @@ -1550,20 +1396,18 @@ struct vblk_info * Remember that the block device is handled by a separate I/O thread. 
We head * straight into the core of that thread here: */ -static bool service_io(struct device *dev) +static void blk_request(struct virtqueue *vq) { - struct vblk_info *vblk = dev->priv; + struct vblk_info *vblk = vq->dev->priv; unsigned int head, out_num, in_num, wlen; int ret; u8 *in; struct virtio_blk_outhdr *out; - struct iovec iov[dev->vq->vring.num]; + struct iovec iov[vq->vring.num]; off64_t off; - /* See if there's a request waiting. If not, nothing to do. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - if (head == dev->vq->vring.num) - return false; + /* Get the next request. */ + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); /* Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one @@ -1637,83 +1481,21 @@ static bool service_io(struct device *dev) if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - /* We can't trigger an IRQ, because we're not the Launcher. It does - * that when we tell it we're done. */ - add_used(dev->vq, head, wlen); - return true; -} - -/* This is the thread which actually services the I/O. */ -static int io_thread(void *_dev) -{ - struct device *dev = _dev; - struct vblk_info *vblk = dev->priv; - char c; - - /* Close other side of workpipe so we get 0 read when main dies. */ - close(vblk->workpipe[1]); - /* Close the other side of the done_fd pipe. */ - close(dev->fd); - - /* When this read fails, it means Launcher died, so we follow. */ - while (read(vblk->workpipe[0], &c, 1) == 1) { - /* We acknowledge each request immediately to reduce latency, - * rather than waiting until we've done them all. I haven't - * measured to see if it makes any difference. - * - * That would be an interesting test, wouldn't it? You could - * also try having more than one I/O thread. */ - while (service_io(dev)) - write(vblk->done_fd, &c, 1); - } - return 0; -} - -/* Now we've seen the I/O thread, we return to the Launcher to see what happens - * when that thread tells us it's completed some I/O. */ -static bool handle_io_finish(int fd, struct device *dev) -{ - char c; - - /* If the I/O thread died, presumably it printed the error, so we - * simply exit. */ - if (read(dev->fd, &c, 1) != 1) - exit(1); - - /* It did some work, so trigger the irq. */ - trigger_irq(fd, dev->vq); - return true; -} - -/* When the Guest submits some I/O, we just need to wake the I/O thread. */ -static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) -{ - struct vblk_info *vblk = vq->dev->priv; - char c = 0; - - /* Wake up I/O thread and tell it to go to work! */ - if (write(vblk->workpipe[1], &c, 1) != 1) - /* Presumably it indicated why it died. */ - exit(1); + add_used(vq, head, wlen); } /*L:198 This actually sets up a virtual block device. */ static void setup_block_file(const char *filename) { - int p[2]; struct device *dev; struct vblk_info *vblk; - void *stack; struct virtio_blk_config conf; - /* This is the pipe the I/O thread will use to tell us I/O is done. */ - pipe(p); - /* The device responds to return from I/O thread. */ - dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); + dev = new_device("block", VIRTIO_ID_BLOCK); /* The device has one virtqueue, where the Guest places requests. 
*/ - add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); + add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); /* Allocate the room for our own bookkeeping */ vblk = dev->priv = malloc(sizeof(*vblk)); @@ -1735,49 +1517,29 @@ static void setup_block_file(const char *filename) set_config(dev, sizeof(conf), &conf); - /* The I/O thread writes to this end of the pipe when done. */ - vblk->done_fd = p[1]; - - /* This is the second pipe, which is how we tell the I/O thread about - * more work. */ - pipe(vblk->workpipe); - - /* Create stack for thread and run it. Since stack grows upwards, we - * point the stack pointer to the end of this region. */ - stack = malloc(32768); - /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from - * becoming a zombie. */ - if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) - err(1, "Creating clone"); - - /* We don't need to keep the I/O thread's end of the pipes open. */ - close(vblk->done_fd); - close(vblk->workpipe[0]); - verbose("device %u: virtblock %llu sectors\n", - devices.device_num, le64_to_cpu(conf.capacity)); + ++devices.device_num, le64_to_cpu(conf.capacity)); } +struct rng_info { + int rfd; +}; + /* Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * * The same logic applies, however. */ -static bool handle_rng_input(int fd, struct device *dev) +static void rng_input(struct virtqueue *vq) { int len; unsigned int head, in_num, out_num, totlen = 0; - struct iovec iov[dev->vq->vring.num]; + struct rng_info *rng_info = vq->dev->priv; + struct iovec iov[vq->vring.num]; /* First we need a buffer from the Guests's virtqueue. */ - head = get_vq_desc(dev->vq, iov, &out_num, &in_num); - - /* If they're not ready for input, stop listening to this file - * descriptor. We'll start again once they add an input buffer. */ - if (head == dev->vq->vring.num) - return false; - + head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in rng?"); @@ -1785,7 +1547,7 @@ static bool handle_rng_input(int fd, struct device *dev) * it reads straight into the Guest's buffer. We loop to make sure we * fill it. */ while (!iov_empty(iov, in_num)) { - len = readv(dev->fd, iov, in_num); + len = readv(rng_info->rfd, iov, in_num); if (len <= 0) err(1, "Read from /dev/random gave %i", len); iov_consume(iov, in_num, len); @@ -1793,25 +1555,23 @@ static bool handle_rng_input(int fd, struct device *dev) } /* Tell the Guest about the new input. */ - add_used_and_trigger(fd, dev->vq, head, totlen); - - /* Everything went OK! */ - return true; + add_used(vq, head, totlen); } /* And this creates a "hardware" random number device for the Guest. */ static void setup_rng(void) { struct device *dev; - int fd; + struct rng_info *rng_info = malloc(sizeof(*rng_info)); - fd = open_or_die("/dev/random", O_RDONLY); + rng_info->rfd = open_or_die("/dev/random", O_RDONLY); /* The device responds to return from I/O thread. */ - dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); + dev = new_device("rng", VIRTIO_ID_RNG); + dev->priv = rng_info; /* The device has one virtqueue, where the Guest places inbufs. 
*/ - add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); verbose("device %u: rng\n", devices.device_num++); } @@ -1827,17 +1587,18 @@ static void __attribute__((noreturn)) restart_guest(void) for (i = 3; i < FD_SETSIZE; i++) close(i); - /* The exec automatically gets rid of the I/O and Waker threads. */ + /* Reset all the devices (kills all threads). */ + cleanup_devices(); + execv(main_args[0], main_args); err(1, "Could not exec %s", main_args[0]); } /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves * its input and output, and finally, lays it to rest. */ -static void __attribute__((noreturn)) run_guest(int lguest_fd) +static void __attribute__((noreturn)) run_guest(void) { for (;;) { - unsigned long args[] = { LHREQ_BREAK, 0 }; unsigned long notify_addr; int readval; @@ -1848,8 +1609,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { verbose("Notify on address %#lx\n", notify_addr); - handle_output(lguest_fd, notify_addr); - continue; + handle_output(notify_addr); /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { char reason[1024] = { 0 }; @@ -1858,19 +1618,9 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) /* ERESTART means that we need to reboot the guest */ } else if (errno == ERESTART) { restart_guest(); - /* EAGAIN means a signal (timeout). - * Anything else means a bug or incompatible change. */ - } else if (errno != EAGAIN) + /* Anything else means a bug or incompatible change. */ + } else err(1, "Running guest failed"); - - /* Only service input on thread for CPU 0. */ - if (cpu_id != 0) - continue; - - /* Service input, then unset the BREAK to release the Waker. */ - handle_input(lguest_fd); - if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) - err(1, "Resetting break"); } } /*L:240 @@ -1904,8 +1654,8 @@ int main(int argc, char *argv[]) /* Memory, top-level pagetable, code startpoint and size of the * (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; - /* Two temporaries and the /dev/lguest file descriptor. */ - int i, c, lguest_fd; + /* Two temporaries. */ + int i, c; /* The boot information for the Guest. */ struct boot_params *boot; /* If they specify an initrd file to load. */ @@ -1913,18 +1663,10 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; - /* We don't "wait" for the children, so prevent them from becoming - * zombies. */ - signal(SIGCHLD, SIG_IGN); - /* First we initialize the device list. Since console and network - * device receive input from a file descriptor, we keep an fdset - * (infds) and the maximum fd number (max_infd) with the head of the - * list. We also keep a pointer to the last device. Finally, we keep - * the next interrupt number to use for devices (1: remember that 0 is - * used by the timer). */ - FD_ZERO(&devices.infds); - devices.max_infd = -1; + /* First we initialize the device list. We keep a pointer to the last + * device, and the next interrupt number to use for devices (1: + * remember that 0 is used by the timer). */ devices.lastdev = NULL; devices.next_irq = 1; @@ -1982,9 +1724,6 @@ int main(int argc, char *argv[]) /* We always have a console device */ setup_console(); - /* We can timeout waiting for Guest network transmit. 
*/ - setup_timeout(); - /* Now we load the kernel */ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); @@ -2023,15 +1762,16 @@ int main(int argc, char *argv[]) /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - lguest_fd = tell_kernel(start); + tell_kernel(start); + + /* Ensure that we terminate if a child dies. */ + signal(SIGCHLD, kill_launcher); - /* We clone off a thread, which wakes the Launcher whenever one of the - * input file descriptors needs attention. We call this the Waker, and - * we'll cover it in a moment. */ - setup_waker(lguest_fd); + /* If we exit via err(), this kills all the threads, restores tty. */ + atexit(cleanup_devices); /* Finally, run the Guest. This doesn't return. */ - run_guest(lguest_fd); + run_guest(); } /*:*/ diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 28c747362f95..efb3a6a045a2 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -37,7 +37,6 @@ Running Lguest: "Paravirtualized guest support" = Y "Lguest guest support" = Y "High Memory Support" = off/4GB - "PAE (Physical Address Extension) Support" = N "Alignment value to which kernel should be aligned" = 0x100000 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and CONFIG_PHYSICAL_ALIGN=0x100000) diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 1caf57628b9c..313389cd50d2 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,13 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M for ease of mapping into the guest (one PTE page). */ +/* We map at -4M (-2M when PAE is activated) for ease of mapping + * into the guest (one PTE page). */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_ADDR 0xFFE00000 +#else #define SWITCHER_ADDR 0xFFC00000 +#endif /* Found in switcher.S */ extern unsigned long default_idt_entries[]; diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487b..d31c4a684078 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -12,11 +12,13 @@ #define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 +#define LHCALL_SET_PMD 13 #define LHCALL_SET_PTE 14 -#define LHCALL_SET_PMD 15 +#define LHCALL_SET_PGD 15 #define LHCALL_LOAD_TLS 16 #define LHCALL_NOTIFY 17 #define LHCALL_LOAD_GDT_ENTRY 18 +#define LHCALL_SEND_INTERRUPTS 19 #define LGUEST_TRAP_ENTRY 0x1F @@ -32,10 +34,10 @@ * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Eighteen hypercalls are + * We use the KVM hypercall mechanism. Seventeen hypercalls are * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx and %edx. If a return - * value makes sense, it's returned in %eax. + * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. + * If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. 
This reflects Winston Churchill's @@ -47,8 +49,9 @@ #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ - unsigned long arg0, arg1, arg2, arg3; + /* These map directly onto eax, ebx, ecx, edx and esi + * in struct lguest_regs */ + unsigned long arg0, arg1, arg2, arg3, arg4; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a830cbd7015..dfdbf6403895 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -126,6 +126,7 @@ void foo(void) #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); BLANK(); diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d3..38718041efc3 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -2,7 +2,6 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT depends on X86_32 - depends on !X86_PAE select VIRTIO select VIRTIO_RING select VIRTIO_CONSOLE diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4e0c26559395..7bc65f0f62c4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -87,7 +87,7 @@ struct lguest_data lguest_data = { /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we - * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall + * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * @@ -96,7 +96,8 @@ struct lguest_data lguest_data = { * effect of causing the Host to run all the stored calls in the ring buffer * which empties it for next time! */ static void async_hcall(unsigned long call, unsigned long arg1, - unsigned long arg2, unsigned long arg3) + unsigned long arg2, unsigned long arg3, + unsigned long arg4) { /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; @@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. 
*/ - kvm_hypercall3(call, arg1, arg2, arg3); + kvm_hypercall4(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; lguest_data.hcalls[next_call].arg2 = arg2; lguest_data.hcalls[next_call].arg3 = arg3; + lguest_data.hcalls[next_call].arg4 = arg4; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; @@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall1(call, arg1); else - async_hcall(call, arg1, 0, 0); + async_hcall(call, arg1, 0, 0, 0); } static void lazy_hcall2(unsigned long call, @@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall2(call, arg1, arg2); else - async_hcall(call, arg1, arg2, 0); + async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, @@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call, if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) kvm_hypercall3(call, arg1, arg2, arg3); else - async_hcall(call, arg1, arg2, arg3); + async_hcall(call, arg1, arg2, arg3, 0); } +#ifdef CONFIG_X86_PAE +static void lazy_hcall4(unsigned long call, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) +{ + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) + kvm_hypercall4(call, arg1, arg2, arg3, arg4); + else + async_hcall(call, arg1, arg2, arg3, arg4); +} +#endif + /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mmu_mode(void) @@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next) paravirt_end_context_switch(next); } -/*G:033 +/*G:032 * After that diversion we return to our first native-instruction * replacements: four functions for interrupt control. * @@ -199,30 +215,28 @@ static unsigned long save_fl(void) { return lguest_data.irq_enabled; } -PV_CALLEE_SAVE_REGS_THUNK(save_fl); - -/* restore_flags() just sets the flags back to the value given. */ -static void restore_fl(unsigned long flags) -{ - lguest_data.irq_enabled = flags; -} -PV_CALLEE_SAVE_REGS_THUNK(restore_fl); /* Interrupts go off... */ static void irq_disable(void) { lguest_data.irq_enabled = 0; } + +/* Let's pause a moment. Remember how I said these are called so often? + * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to + * break some rules. In particular, these functions are assumed to save their + * own registers if they need to: normal C functions assume they can trash the + * eax register. To use normal C functions, we use + * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the + * C function, then restores it. */ +PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); +/*:*/ -/* Interrupts go on... */ -static void irq_enable(void) -{ - lguest_data.irq_enabled = X86_EFLAGS_IF; -} -PV_CALLEE_SAVE_REGS_THUNK(irq_enable); +/* These are in i386_head.S */ +extern void lg_irq_enable(void); +extern void lg_restore_fl(unsigned long flags); -/*:*/ /*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). 
This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer @@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ - *dx &= 0x07808111; + /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ + *dx &= 0x07808151; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls @@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, if (*ax > 0x80000008) *ax = 0x80000008; break; + case 0x80000001: + /* Here we should fix nx cap depending on host. */ + /* For this version of PAE, we just clear NX bit. */ + *dx &= ~(1 << 20); + break; } } @@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val) static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { +#ifdef CONFIG_X86_PAE + lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, + ptep->pte_low, ptep->pte_high); +#else lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); +#endif } static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); lguest_pte_update(mm, addr, ptep); } -/* The Guest calls this to set a top-level entry. Again, we set the entry then - * tell the Host which top-level page we changed, and the index of the entry we - * changed. */ +/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd + * to set a middle-level entry when PAE is activated. + * Again, we set the entry then tell the Host which page we changed, + * and the index of the entry we changed. */ +#ifdef CONFIG_X86_PAE +static void lguest_set_pud(pud_t *pudp, pud_t pudval) +{ + native_set_pud(pudp, pudval); + + /* 32 bytes aligned pdpt address and the index. */ + lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, + (__pa(pudp) & 0x1F) / sizeof(pud_t)); +} + static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { - *pmdp = pmdval; + native_set_pmd(pmdp, pmdval); lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, - (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); } +#else + +/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not + * activated. */ +static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) +{ + native_set_pmd(pmdp, pmdval); + lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, + (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); +} +#endif /* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't @@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * which brings boot back to 0.25 seconds. 
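*/

Aside: earlier in this hunk lguest_cpuid() changed its EDX mask from
0x07808111 to 0x07808151, which is exactly one new bit: bit 6, PAE. This
small standalone program (the table is only illustrative) rebuilds the mask
from the feature list in the comment, an easy way to convince yourself that
no other bit sneaked in.

#include <stdio.h>

/* CPUID leaf 1 EDX bits that the new mask lets the Guest see. */
static const struct { int bit; const char *name; } allowed[] = {
	{ 0, "FPU" }, { 4, "TSC" }, { 6, "PAE" }, { 8, "CMPXCHG8B" },
	{ 15, "CMOV" }, { 23, "MMX" }, { 24, "FXSR" }, { 25, "SSE" },
	{ 26, "SSE2" },
};

int main(void)
{
	unsigned int i, mask = 0;

	for (i = 0; i < sizeof(allowed) / sizeof(allowed[0]); i++)
		mask |= 1u << allowed[i].bit;
	printf("mask = %#010x\n", mask);	/* prints 0x07808151 */
	return 0;
}

/* And here is the set_pte path the 0.25-second story was about: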
*/ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { - *ptep = pteval; + native_set_pte(ptep, pteval); + if (cr3_changed) + lazy_hcall1(LHCALL_FLUSH_TLB, 1); +} + +#ifdef CONFIG_X86_PAE +static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + native_set_pte_atomic(ptep, pte); if (cr3_changed) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } +void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + native_pte_clear(mm, addr, ptep); + lguest_pte_update(mm, addr, ptep); +} + +void lguest_pmd_clear(pmd_t *pmdp) +{ + lguest_set_pmd(pmdp, __pmd(0)); +} +#endif + /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do @@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void) { unsigned int i; - for (i = 0; i < LGUEST_IRQS; i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* Some systems map "vectors" to interrupts weirdly. Lguest has * a straightforward 1 to 1 mapping, so force that here. */ - __get_cpu_var(vector_irq)[vector] = i; - if (vector != SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); + __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ @@ -973,10 +1038,10 @@ static void lguest_restart(char *reason) * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We - * patch the four most commonly called functions: disable interrupts, enable - * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 - * bytes to patch into: the Guest versions of these operations are small enough - * that we can fit comfortably. + * patch two of the simplest of the most commonly called functions: disable + * interrupts and save interrupts. We usually have 6 or 10 bytes to patch + * into: the Guest versions of these operations are small enough that we can + * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in i386_head.S. */ @@ -987,8 +1052,6 @@ static const struct lguest_insns const char *start, *end; } lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, - [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, - [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; @@ -1026,6 +1089,7 @@ __init void lguest_init(void) pv_info.name = "lguest"; pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = 1; + pv_info.shared_kernel_pmd = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. 
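*/

Aside: the patching scheme above amounts to "copy a handful of template
bytes over the indirect-call site if they fit". Here is a toy version under
stated assumptions: the template is three NOPs rather than real lguest
instructions, and try_patch() is an invented stand-in for lguest_patch(),
which additionally falls back to paravirt_patch_default() and NOP-pads any
leftover room.

#include <stdio.h>
#include <string.h>

static const unsigned char tmpl[] = { 0x90, 0x90, 0x90 };	/* 3 NOPs */

static unsigned int try_patch(unsigned char *site, unsigned int room)
{
	unsigned int len = sizeof(tmpl);

	if (len > room)
		return 0;	/* doesn't fit: keep the indirect call */
	memcpy(site, tmpl, len);
	return len;		/* caller pads the rest with NOPs */
}

int main(void)
{
	unsigned char site[10] = { 0 };

	printf("patched %u bytes\n", try_patch(site, sizeof(site)));
	return 0;
}

/* Back to wiring up the operations: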
*/
@@ -1033,9 +1097,9 @@ __init void lguest_init(void)
 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
 	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
-	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
 	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
-	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+	pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
 	/* init-time operations */
@@ -1071,6 +1135,12 @@ __init void lguest_init(void)
 	pv_mmu_ops.set_pte = lguest_set_pte;
 	pv_mmu_ops.set_pte_at = lguest_set_pte_at;
 	pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+	pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+	pv_mmu_ops.pte_clear = lguest_pte_clear;
+	pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+	pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
 	.globl lgstart_##name; .globl lgend_##name
 
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later).  If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages.  First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+	/* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+	 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+	/* But now we need to check if the Host wants to know: there might have
+	 * been interrupts waiting to be delivered, in which case it will have
+	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
+	 * jump to send_interrupts, otherwise we're done. */
+	testl $0xFFFFFFFF, lguest_data+LGUEST_DATA_irq_pending
+	jnz send_interrupts
+	/* One cool thing about x86 is that you can do many things without using
+	 * a register.  In this case, the normal path hasn't needed to save or
+	 * restore any registers at all! */
+	ret
+send_interrupts:
+	/* OK, now we need a register: eax is used for the hypercall number,
+	 * which is LHCALL_SEND_INTERRUPTS.
+	 *
+	 * We used not to bother with this pending detection at all, which was
+	 * much simpler.  Sooner or later the Host would realize it had to
+	 * send us an interrupt.  But that turns out to make performance 7
+	 * times worse on a simple tcp benchmark.  So now we do this the hard
+	 * way. */
+	pushl %eax
+	movl $LHCALL_SEND_INTERRUPTS, %eax
+	/* This is a vmcall instruction (same thing that KVM uses).  Older
+	 * assembler versions might not know the "vmcall" instruction, so we
+	 * create one manually here. */
+	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	popl %eax
+	ret
+
+/* Finally, the "popf" or "restore flags" routine.
The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+	/* This is just "lguest_data.irq_enabled = flags;" */
+	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+	/* Now, if the %eax value has enabled interrupts and
+	 * lguest_data.irq_pending is set, we want to tell the Host so it can
+	 * deliver any outstanding interrupts.  Fortunately, both values will
+	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+	 * instruction will AND them together for us.  If both are set, we
+	 * jump to send_interrupts. */
+	testl lguest_data+LGUEST_DATA_irq_pending, %eax
+	jnz send_interrupts
+	/* Again, the normal path has used no extra registers.  Clever, huh? */
+	ret

/* These demarcate the EIP range where host should never deliver interrupts. */
.global lguest_noirq_start
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cbab359a..0aaa0597a622 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && EVENTFD
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4845fb3cf74b..a6974e9b8ebf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@ static __init int map_switcher(void)
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care. */
 	pagep = switcher_page;
-	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
@@ -188,6 +188,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
+		unsigned int irq;
+		bool more;
+
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
 			do_hypercalls(cpu);
@@ -195,23 +198,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* It's possible the Guest did a NOTIFY hypercall to the
 		 * Launcher, in which case we return from the read() now. */
 		if (cpu->pending_notify) {
-			if (put_user(cpu->pending_notify, user))
-				return -EFAULT;
-			return sizeof(cpu->pending_notify);
+			if (!send_notify_to_eventfd(cpu)) {
+				if (put_user(cpu->pending_notify, user))
+					return -EFAULT;
+				return sizeof(cpu->pending_notify);
+			}
 		}
 
 		/* Check for signals */
 		if (signal_pending(current))
 			return -ERESTARTSYS;
 
-		/* If Waker set break_out, return to Launcher. */
-		if (cpu->break_out)
-			return -EAGAIN;
-
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the handler to be executed when we next
 		 * run the Guest. */
-		maybe_do_interrupt(cpu);
+		irq = interrupt_pending(cpu, &more);
+		if (irq < LGUEST_IRQS)
+			try_deliver_interrupt(cpu, irq, more);
 
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer.  If the Host is trying to suspend,
@@ -224,10 +227,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			break;
 
 		/* If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
+		 * clock timer will wake us.
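*/

Aside: the "check again just before sleeping" dance below exists to dodge the
classic lost-wakeup race. In userspace the same discipline is what a condition
variable's re-checked predicate buys you; this little pthread sketch (the
names irq_pending and timer_thread are invented) is the analogue. Build with
cc -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static bool irq_pending;

static void *timer_thread(void *unused)
{
	(void)unused;
	usleep(10000);
	pthread_mutex_lock(&lock);
	irq_pending = true;		/* like set_interrupt() */
	pthread_cond_signal(&kick);	/* like wake_up_process() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, timer_thread, NULL);
	pthread_mutex_lock(&lock);
	/* The re-check before blocking: without it, an interrupt arriving
	 * right here would be missed and we'd sleep forever. */
	while (!irq_pending)
		pthread_cond_wait(&kick, &lock);
	pthread_mutex_unlock(&lock);
	puts("woken: deliver the pending interrupt");
	pthread_join(t, NULL);
	return 0;
}

/* Here is that careful sleep in run_guest():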
*/ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + /* Just before we sleep, make sure no interrupt snuck in + * which we should be doing. */ + if (interrupt_pending(cpu, &more) < LGUEST_IRQS) + set_current_state(TASK_RUNNING); + else + schedule(); continue; } diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 54d66f05fefa..c29ffa19cb74 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* This call does nothing, except by breaking out of the Guest * it makes us process all the asynchronous hypercalls. */ break; + case LHCALL_SEND_INTERRUPTS: + /* This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. */ + break; case LHCALL_LGUEST_INIT: /* You can't get here unless you're already initialized. Don't * do that. */ @@ -73,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: +#ifdef CONFIG_X86_PAE + guest_set_pte(cpu, args->arg1, args->arg2, + __pte(args->arg3 | (u64)args->arg4 << 32)); +#else guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); +#endif + break; + case LHCALL_SET_PGD: + guest_set_pgd(cpu->lg, args->arg1, args->arg2); break; +#ifdef CONFIG_X86_PAE case LHCALL_SET_PMD: guest_set_pmd(cpu->lg, args->arg1, args->arg2); break; +#endif case LHCALL_SET_CLOCKEVENT: guest_set_clockevent(cpu, args->arg1); break; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 6e99adbe1946..0e9067b0d507 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -128,30 +128,39 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, /*H:205 * Virtual Interrupts. * - * maybe_do_interrupt() gets called before every entry to the Guest, to see if - * we should divert the Guest to running an interrupt handler. */ -void maybe_do_interrupt(struct lg_cpu *cpu) + * interrupt_pending() returns the first pending interrupt which isn't blocked + * by the Guest. It is called before every entry to the Guest, and just before + * we go to sleep when the Guest has halted itself. */ +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; DECLARE_BITMAP(blk, LGUEST_IRQS); - struct desc_struct *idt; /* If the Guest hasn't even initialized yet, we can do nothing. */ if (!cpu->lg->lguest_data) - return; + return LGUEST_IRQS; /* Take our "irqs_pending" array and remove any interrupts the Guest * wants blocked: the result ends up in "blk". */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) - return; + return LGUEST_IRQS; bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); /* Find the first interrupt. */ irq = find_first_bit(blk, LGUEST_IRQS); - /* None? Nothing to do */ - if (irq >= LGUEST_IRQS) - return; + *more = find_next_bit(blk, LGUEST_IRQS, irq+1); + + return irq; +} + +/* This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). */ +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) +{ + struct desc_struct *idt; + + BUG_ON(irq >= LGUEST_IRQS); /* They may be in the middle of an iret, where they asked us never to * deliver interrupts. 
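*/

Aside: under PAE a pte is 64 bits, but each hypercall argument is a 32-bit
unsigned long, hence the pte_low/pte_high pair in the Guest and the (u64)
cast when the Host glues them back together in LHCALL_SET_PTE above. A tiny
sketch of the round trip, with an arbitrary example pte:

#include <stdint.h>
#include <stdio.h>

static void split(uint64_t pte, uint32_t *lo, uint32_t *hi)
{
	*lo = (uint32_t)pte;
	*hi = (uint32_t)(pte >> 32);
}

static uint64_t join(uint32_t lo, uint32_t hi)
{
	/* Same expression as the Host's: cast *before* shifting, or the
	 * high word vanishes in 32-bit arithmetic. */
	return (uint64_t)lo | (uint64_t)hi << 32;
}

int main(void)
{
	uint64_t pte = 0x112345067ULL;	/* pfn above 4GB, flags 0x067 */
	uint32_t lo, hi;

	split(pte, &lo, &hi);
	printf("round-trip %s\n", join(lo, hi) == pte ? "ok" : "broken");
	return 0;
}

/* Back to delivering the interrupt: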
*/ @@ -170,8 +179,12 @@ void maybe_do_interrupt(struct lg_cpu *cpu) u32 irq_enabled; if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) irq_enabled = 0; - if (!irq_enabled) + if (!irq_enabled) { + /* Make sure they know an IRQ is pending. */ + put_user(X86_EFLAGS_IF, + &cpu->lg->lguest_data->irq_pending); return; + } } /* Look at the IDT entry the Guest gave us for this interrupt. The @@ -194,6 +207,25 @@ void maybe_do_interrupt(struct lg_cpu *cpu) * here is a compromise which means at least it gets updated every * timer interrupt. */ write_timestamp(cpu); + + /* If there are no other interrupts we want to deliver, clear + * the pending flag. */ + if (!more) + put_user(0, &cpu->lg->lguest_data->irq_pending); +} + +/* And this is the routine when we want to set an interrupt for the Guest. */ +void set_interrupt(struct lg_cpu *cpu, unsigned int irq) +{ + /* Next time the Guest runs, the core code will see if it can deliver + * this interrupt. */ + set_bit(irq, cpu->irqs_pending); + + /* Make sure it sees it; it might be asleep (eg. halted), or + * running the Guest right now, in which case kick_process() + * will knock it out. */ + if (!wake_up_process(cpu->tsk)) + kick_process(cpu->tsk); } /*:*/ @@ -510,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); /* Remember the first interrupt is the timer interrupt. */ - set_bit(0, cpu->irqs_pending); - /* If the Guest is actually stopped, we need to wake it up. */ - if (cpu->halted) - wake_up_process(cpu->tsk); + set_interrupt(cpu, 0); return HRTIMER_NORESTART; } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index af92a176697f..d4e8979735cb 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -49,7 +49,7 @@ struct lg_cpu { u32 cr2; int ts; u32 esp1; - u8 ss1; + u16 ss1; /* Bitmap of what has changed: see CHANGED_* above. */ int changed; @@ -71,9 +71,7 @@ struct lg_cpu { /* Virtual clock device */ struct hrtimer hrt; - /* Do we need to stop what we're doing and return to userspace? */ - int break_out; - wait_queue_head_t break_wq; + /* Did the Guest tell us to halt? */ int halted; /* Pending virtual interrupts */ @@ -82,6 +80,16 @@ struct lg_cpu { struct lg_cpu_arch arch; }; +struct lg_eventfd { + unsigned long addr; + struct file *event; +}; + +struct lg_eventfd_map { + unsigned int num; + struct lg_eventfd map[]; +}; + /* The private info the thread maintains about the guest. */ struct lguest { @@ -102,6 +110,8 @@ struct lguest unsigned int stack_pages; u32 tsc_khz; + struct lg_eventfd_map *eventfds; + /* Dead? */ const char *dead; }; @@ -137,9 +147,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); * in the kernel. 
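*/

Aside: interrupt_pending() boils down to "AND-NOT the blocked bitmap, find
the first set bit, note whether more follow". A flat one-word sketch of that
logic (first_pending() is an invented name; the kernel's bitmaps span several
words and use find_first_bit()):

#include <stdio.h>
#include <stdbool.h>
#include <strings.h>	/* ffs() */

#define LGUEST_IRQS 32	/* one word's worth keeps the sketch flat */

static unsigned int first_pending(unsigned int pending,
				  unsigned int blocked, bool *more)
{
	unsigned int blk = pending & ~blocked;	/* bitmap_andnot() */
	int bit = ffs(blk);

	if (!bit)
		return LGUEST_IRQS;		/* nothing deliverable */
	*more = (blk & (blk - 1)) != 0;		/* bits beyond the first? */
	return bit - 1;
}

int main(void)
{
	bool more;
	/* IRQs 1 and 4 pending, IRQ 1 blocked: deliver 4, nothing more. */
	unsigned int irq = first_pending(0x12, 0x02, &more);

	printf("irq=%u more=%d\n", irq, more);
	return 0;
}

/* Back to lg.h and its helpers: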
*/ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) +#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) +#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ -void maybe_do_interrupt(struct lg_cpu *cpu); +unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); +void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); +void set_interrupt(struct lg_cpu *cpu, unsigned int irq); bool deliver_trap(struct lg_cpu *cpu, unsigned int num); void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, u32 low, u32 hi); @@ -150,6 +164,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state, void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def); void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); +bool send_notify_to_eventfd(struct lg_cpu *cpu); void init_clockdev(struct lg_cpu *cpu); bool check_syscall_vector(struct lguest *lg); int init_interrupts(void); @@ -168,7 +183,10 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); int init_guest_pagetable(struct lguest *lg); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); +void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); +#ifdef CONFIG_X86_PAE void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); +#endif void guest_pagetable_clear_all(struct lg_cpu *cpu); void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b8ee103eed5f..32e297121058 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -7,32 +7,83 @@ #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/sched.h> +#include <linux/eventfd.h> +#include <linux/file.h> #include "lg.h" -/*L:055 When something happens, the Waker process needs a way to stop the - * kernel running the Guest and return to the Launcher. So the Waker writes - * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher - * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release - * the Waker. */ -static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) +bool send_notify_to_eventfd(struct lg_cpu *cpu) { - unsigned long on; + unsigned int i; + struct lg_eventfd_map *map; + + /* lg->eventfds is RCU-protected */ + rcu_read_lock(); + map = rcu_dereference(cpu->lg->eventfds); + for (i = 0; i < map->num; i++) { + if (map->map[i].addr == cpu->pending_notify) { + eventfd_signal(map->map[i].event, 1); + cpu->pending_notify = 0; + break; + } + } + rcu_read_unlock(); + return cpu->pending_notify == 0; +} - /* Fetch whether they're turning break on or off. 
*/
-	if (get_user(on, input) != 0)
-		return -EFAULT;
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+{
+	struct lg_eventfd_map *new, *old = lg->eventfds;
 
-	if (on) {
-		cpu->break_out = 1;
-		/* Pop it out of the Guest (may be running on different CPU) */
-		wake_up_process(cpu->tsk);
-		/* Wait for them to reset it */
-		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
-	} else {
-		cpu->break_out = 0;
-		wake_up(&cpu->break_wq);
-		return 0;
+	if (!addr)
+		return -EINVAL;
+
+	/* Replace the old array with the new one, carefully: others can
+	 * be accessing it at the same time */
+	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	/* First make identical copy. */
+	memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+	new->num = old->num;
+
+	/* Now append new entry. */
+	new->map[new->num].addr = addr;
+	new->map[new->num].event = eventfd_fget(fd);
+	if (IS_ERR(new->map[new->num].event)) {
+		kfree(new);
+		return PTR_ERR(new->map[new->num].event);
 	}
+	new->num++;
+
+	/* Now put new one in place. */
+	rcu_assign_pointer(lg->eventfds, new);
+
+	/* We're not in a big hurry.  Wait until no one's looking at the old
+	 * version, then delete it. */
+	synchronize_rcu();
+	kfree(old);
+
+	return 0;
+}
+
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+	unsigned long addr, fd;
+	int err;
+
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	input++;
+	if (get_user(fd, input) != 0)
+		return -EFAULT;
+
+	mutex_lock(&lguest_lock);
+	err = add_eventfd(lg, addr, fd);
+	mutex_unlock(&lguest_lock);
+
+	return err;
 }
 
 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
@@ -45,9 +96,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 		return -EFAULT;
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;
-	/* Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt. */
-	set_bit(irq, cpu->irqs_pending);
+
+	set_interrupt(cpu, irq);
 	return 0;
 }
 
@@ -126,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * address. */
 	lguest_arch_setup_regs(cpu, start_ip);
 
-	/* Initialize the queue for the Waker to wait on */
-	init_waitqueue_head(&cpu->break_wq);
-
 	/* We keep a pointer to the Launcher task (ie. current task) for when
 	 * other Guests want to wake this one (eg. console input). */
 	cpu->tsk = current;
@@ -185,6 +232,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto unlock;
 	}
 
+	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+	if (!lg->eventfds) {
+		err = -ENOMEM;
+		goto free_lg;
+	}
+	lg->eventfds->num = 0;
+
 	/* Populate the easy fields of our "struct lguest" */
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
@@ -192,7 +246,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
-		goto release_guest;
+		goto free_eventfds;
 
 	/* Initialize the Guest's shadow page tables, using the toplevel
	 * address the Launcher gave us.  This allocates memory, so can fail.
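*/

Aside: add_eventfd() above is textbook read-copy-update: build a bigger copy,
publish it with a single pointer store, reclaim the old array once readers
are gone. Below is a userspace approximation with a C11 release store playing
rcu_assign_pointer(); there is no cheap stand-in for synchronize_rcu(), so
the sketch deliberately leaves the old map unreclaimed and says so.

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct map { unsigned int num; unsigned long addr[]; };

static _Atomic(struct map *) current_map;

static int add_entry(unsigned long addr)
{
	struct map *old = atomic_load_explicit(&current_map,
					       memory_order_acquire);
	struct map *new = malloc(sizeof(*new) +
				 (old->num + 1) * sizeof(new->addr[0]));

	if (!new)
		return -1;
	memcpy(new->addr, old->addr, old->num * sizeof(new->addr[0]));
	new->addr[old->num] = addr;
	new->num = old->num + 1;
	/* Readers following the pointer now see a fully built array. */
	atomic_store_explicit(&current_map, new, memory_order_release);
	/* synchronize_rcu() would go here; only then: free(old). */
	return 0;
}

int main(void)
{
	struct map *m;

	current_map = calloc(1, sizeof(struct map));	/* num = 0 */
	add_entry(0xcafe);
	m = atomic_load_explicit(&current_map, memory_order_acquire);
	printf("num=%u addr0=%#lx\n", m->num, m->addr[0]);
	return 0;
}

/* Back to initialize() and its error paths: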
*/ @@ -211,7 +265,9 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -release_guest: +free_eventfds: + kfree(lg->eventfds); +free_lg: kfree(lg); unlock: mutex_unlock(&lguest_lock); @@ -252,11 +308,6 @@ static ssize_t write(struct file *file, const char __user *in, /* Once the Guest is dead, you can only read() why it died. */ if (lg->dead) return -ENOENT; - - /* If you're not the task which owns the Guest, all you can do - * is break the Launcher out of running the Guest. */ - if (current != cpu->tsk && req != LHREQ_BREAK) - return -EPERM; } switch (req) { @@ -264,8 +315,8 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_BREAK: - return break_guest_out(cpu, input); + case LHREQ_EVENTFD: + return attach_eventfd(lg, input); default: return -EINVAL; } @@ -303,6 +354,12 @@ static int close(struct inode *inode, struct file *file) * the Launcher's memory management structure. */ mmput(lg->cpus[i].mm); } + + /* Release any eventfds they registered. */ + for (i = 0; i < lg->eventfds->num; i++) + fput(lg->eventfds->map[i].event); + kfree(lg->eventfds); + /* If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). */ if (!IS_ERR(lg->dead)) diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a059cf9980f7..a6fe1abda240 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -53,6 +53,17 @@ * page. */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) +/* For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. */ +#ifdef CONFIG_X86_PAE +#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) +#define RESERVE_MEM 2U +#define CHECK_GPGD_MASK _PAGE_PRESENT +#else +#define RESERVE_MEM 4U +#define CHECK_GPGD_MASK _PAGE_TABLE +#endif + /* We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this * CPU's guest to see the pages of any other CPU. */ @@ -73,24 +84,59 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); +#ifndef CONFIG_X86_PAE /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { kill_guest(cpu, "attempt to access switcher pages"); index = 0; } +#endif /* Return a pointer index'th pgd entry for the i'th page table. */ return &cpu->lg->pgdirs[i].pgdir[index]; } +#ifdef CONFIG_X86_PAE +/* This routine then takes the PGD entry given above, which contains the + * address of the PMD page. It then returns a pointer to the PMD entry for the + * given address. */ +static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) +{ + unsigned int index = pmd_index(vaddr); + pmd_t *page; + + /* We kill any Guest trying to touch the Switcher addresses. 
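*/

Aside: all those *_index() helpers decode a 32-bit PAE virtual address as
2 bits of PGD, 9 bits of PMD, 9 bits of PTE and 12 bits of page offset. A
sketch of the arithmetic (constants restated locally) which also shows why
the Switcher's home at 0xFFE00000 lands in the last PGD slot and the last
PMD slot:

#include <stdio.h>

#define PAGE_SHIFT   12
#define PTRS_PER_PMD 512
#define PTRS_PER_PTE 512

static unsigned int pgd_index(unsigned long vaddr)
{
	return vaddr >> 30;				/* 0..3 */
}

static unsigned int pmd_index(unsigned long vaddr)
{
	return (vaddr >> 21) & (PTRS_PER_PMD - 1);	/* 0..511 */
}

static unsigned int pte_index(unsigned long vaddr)
{
	return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

int main(void)
{
	unsigned long vaddr = 0xffe00000UL;	/* top 2MB: the Switcher */

	printf("pgd=%u pmd=%u pte=%u\n",	/* prints pgd=3 pmd=511 pte=0 */
	       pgd_index(vaddr), pmd_index(vaddr), pte_index(vaddr));
	return 0;
}

/* And the guard itself: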
*/ + if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && + index >= SWITCHER_PMD_INDEX) { + kill_guest(cpu, "attempt to access switcher pages"); + index = 0; + } + + /* You should never call this if the PGD entry wasn't valid */ + BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); + page = __va(pgd_pfn(spgd) << PAGE_SHIFT); + + return &page[index]; +} +#endif + /* This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a * pointer to the PTE entry for the given address. */ -static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { +#ifdef CONFIG_X86_PAE + pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); + pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); + + /* You should never call this if the PMD entry wasn't valid */ + BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); +#else pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); - return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; +#endif + + return &page[pte_index(vaddr)]; } /* These two functions just like the above two, except they access the Guest @@ -101,12 +147,32 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); } -static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) +#ifdef CONFIG_X86_PAE +static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) +{ + unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); + return gpage + pmd_index(vaddr) * sizeof(pmd_t); +} + +static unsigned long gpte_addr(struct lg_cpu *cpu, + pmd_t gpmd, unsigned long vaddr) +{ + unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; + + BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); + return gpage + pte_index(vaddr) * sizeof(pte_t); +} +#else +static unsigned long gpte_addr(struct lg_cpu *cpu, + pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); - return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); + return gpage + pte_index(vaddr) * sizeof(pte_t); } +#endif /*:*/ /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as @@ -171,7 +237,7 @@ static void release_pte(pte_t pte) /* Remember that get_user_pages_fast() took a reference to the page, in * get_pfn()? We have to put it back now. */ if (pte_flags(pte) & _PAGE_PRESENT) - put_page(pfn_to_page(pte_pfn(pte))); + put_page(pte_page(pte)); } /*:*/ @@ -184,11 +250,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) { - if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || + if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) kill_guest(cpu, "bad page directory entry"); } +#ifdef CONFIG_X86_PAE +static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) +{ + if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || + (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) + kill_guest(cpu, "bad page middle directory entry"); +} +#endif + /*H:330 * (i) Looking up a page table entry when the Guest faults. * @@ -207,6 +282,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; +#ifdef CONFIG_X86_PAE + pmd_t *spmd; + pmd_t gpmd; +#endif + /* First step: get the top-level Guest page table entry. 
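*/

Aside: check_gpgd(), check_gpmd() and check_gpte() all share one shape: no
flags outside the allowed mask, and a frame number below the Guest's
pfn_limit. A sketch with illustrative constants; the second call shows the
kind of stray bit that would earn a kill_guest():

#include <stdio.h>

#define PAGE_MASK   (~0xfffUL)
#define _PAGE_TABLE 0x067UL	/* PRESENT|RW|USER|ACCESSED|DIRTY */

static int entry_ok(unsigned long entry, unsigned long allowed,
		    unsigned long pfn_limit)
{
	unsigned long flags = entry & ~PAGE_MASK;
	unsigned long pfn = entry >> 12;

	return !(flags & ~allowed) && pfn < pfn_limit;
}

int main(void)
{
	/* pfn 5 with PRESENT|RW|ACCESSED|DIRTY: fine in a 16-page Guest. */
	printf("%d\n", entry_ok((5UL << 12) | 0x063, _PAGE_TABLE, 16));
	/* Same entry with a stray global bit (0x100): rejected. */
	printf("%d\n", entry_ok((5UL << 12) | 0x163, _PAGE_TABLE, 16));
	return 0;
}

/* The walk starts here: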
*/ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -228,12 +308,45 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) check_gpgd(cpu, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); + set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + /* middle level not present? We can't map it in. */ + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + return false; + + /* Now look at the matching shadow entry. */ + spmd = spmd_addr(cpu, *spgd, vaddr); + + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { + /* No shadow entry: allocate a new shadow PTE page. */ + unsigned long ptepage = get_zeroed_page(GFP_KERNEL); + + /* This is not really the Guest's fault, but killing it is + * simple for this corner case. */ + if (!ptepage) { + kill_guest(cpu, "out of memory allocating pte page"); + return false; + } + + /* We check that the Guest pmd is OK. */ + check_gpmd(cpu, gpmd); + + /* And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. */ + native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); + } + + /* OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. */ + gpte_ptr = gpte_addr(cpu, gpmd, vaddr); +#else /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ - gpte_ptr = gpte_addr(gpgd, vaddr); + gpte_ptr = gpte_addr(cpu, gpgd, vaddr); +#endif gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -259,7 +372,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ - spte = spte_addr(*spgd, vaddr); + spte = spte_addr(cpu, *spgd, vaddr); /* If there was a valid shadow PTE entry here before, we release it. * This can happen with a write to a previously read-only entry. */ release_pte(*spte); @@ -273,7 +386,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so * we can update the Guest's _PAGE_DIRTY flag. */ - *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); + native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ @@ -301,14 +414,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) pgd_t *spgd; unsigned long flags; +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif /* Look at the current top level entry: is it present? */ spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return false; +#ifdef CONFIG_X86_PAE + spmd = spmd_addr(cpu, *spgd, vaddr); + if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) + return false; +#endif + /* Check the flags on the pte entry itself: it must be present and * writable. 
*/ - flags = pte_flags(*(spte_addr(*spgd, vaddr))); + flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -322,8 +444,43 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) kill_guest(cpu, "bad stack page %#lx", vaddr); } +#ifdef CONFIG_X86_PAE +static void release_pmd(pmd_t *spmd) +{ + /* If the entry's not present, there's nothing to release. */ + if (pmd_flags(*spmd) & _PAGE_PRESENT) { + unsigned int i; + pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); + /* For each entry in the page, we might need to release it. */ + for (i = 0; i < PTRS_PER_PTE; i++) + release_pte(ptepage[i]); + /* Now we can free the page of PTEs */ + free_page((long)ptepage); + /* And zero out the PMD entry so we never release it twice. */ + native_set_pmd(spmd, __pmd(0)); + } +} + +static void release_pgd(pgd_t *spgd) +{ + /* If the entry's not present, there's nothing to release. */ + if (pgd_flags(*spgd) & _PAGE_PRESENT) { + unsigned int i; + pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); + + for (i = 0; i < PTRS_PER_PMD; i++) + release_pmd(&pmdpage[i]); + + /* Now we can free the page of PMDs */ + free_page((long)pmdpage); + /* And zero out the PGD entry so we never release it twice. */ + set_pgd(spgd, __pgd(0)); + } +} + +#else /* !CONFIG_X86_PAE */ /*H:450 If we chase down the release_pgd() code, it looks like this: */ -static void release_pgd(struct lguest *lg, pgd_t *spgd) +static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { @@ -341,7 +498,7 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd) *spgd = __pgd(0); } } - +#endif /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. * It simply releases every PTE page from 0 up to the Guest's kernel address. */ @@ -350,7 +507,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) unsigned int i; /* Release every pgd entry up to the kernel's address. */ for (i = 0; i < pgd_index(lg->kernel_address); i++) - release_pgd(lg, lg->pgdirs[idx].pgdir + i); + release_pgd(lg->pgdirs[idx].pgdir + i); } /*H:440 (v) Flushing (throwing away) page tables, @@ -369,7 +526,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t gpgd; pte_t gpte; - +#ifdef CONFIG_X86_PAE + pmd_t gpmd; +#endif /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ @@ -378,7 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return -1UL; } - gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); +#ifdef CONFIG_X86_PAE + gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + kill_guest(cpu, "Bad address %#lx", vaddr); + gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); +#else + gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); +#endif if (!(pte_flags(gpte) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); @@ -405,6 +571,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, int *blank_pgdir) { unsigned int next; +#ifdef CONFIG_X86_PAE + pmd_t *pmd_table; +#endif /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. 
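*/

Aside: the eviction policy in new_pgdir() really is that unceremonious. Here
it is in miniature (pick_slot() and NR_SHADOWS are invented names): LRU would
mean bookkeeping on every page-table switch, while random costs nothing and
is usually good enough for a handful of shadow slots.

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#define NR_SHADOWS 4

static unsigned long shadows[NR_SHADOWS];	/* gpgdir each slot shadows */

static unsigned int pick_slot(unsigned long gpgdir)
{
	unsigned int i, victim;

	for (i = 0; i < NR_SHADOWS; i++)
		if (shadows[i] == gpgdir)
			return i;		/* cache hit */
	victim = rand() % NR_SHADOWS;		/* like random32() % ... */
	shadows[victim] = gpgdir;		/* evict and reuse */
	return victim;
}

int main(void)
{
	srand(time(NULL));
	printf("slot=%u\n", pick_slot(0x1000));
	printf("slot=%u\n", pick_slot(0x1000));	/* hits the same slot */
	return 0;
}

/* Back to new_pgdir():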
*/ @@ -416,10 +585,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /* If the allocation fails, just keep using the one we have */ if (!cpu->lg->pgdirs[next].pgdir) next = cpu->cpu_pgd; - else - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + else { +#ifdef CONFIG_X86_PAE + /* In PAE mode, allocate a pmd page and populate the + * last pgd entry. */ + pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd_table) { + free_page((long)cpu->lg->pgdirs[next].pgdir); + set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); + next = cpu->cpu_pgd; + } else { + set_pgd(cpu->lg->pgdirs[next].pgdir + + SWITCHER_PGD_INDEX, + __pgd(__pa(pmd_table) | _PAGE_PRESENT)); + /* This is a blank page, so there are no kernel + * mappings: caller must map the stack! */ + *blank_pgdir = 1; + } +#else *blank_pgdir = 1; +#endif + } } /* Record which Guest toplevel this shadows. */ cpu->lg->pgdirs[next].gpgdir = gpgdir; @@ -431,7 +617,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, /*H:430 (iv) Switching page tables * - * Now we've seen all the page table setting and manipulation, let's see what + * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) @@ -460,10 +646,25 @@ static void release_all_pagetables(struct lguest *lg) /* Every shadow pagetable this Guest has */ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].pgdir) + if (lg->pgdirs[i].pgdir) { +#ifdef CONFIG_X86_PAE + pgd_t *spgd; + pmd_t *pmdpage; + unsigned int k; + + /* Get the last pmd page. */ + spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; + pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); + + /* And release the pmd entries of that pmd page, + * except for the switcher pmd. */ + for (k = 0; k < SWITCHER_PMD_INDEX; k++) + release_pmd(&pmdpage[k]); +#endif /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) - release_pgd(lg, lg->pgdirs[i].pgdir + j); + release_pgd(lg->pgdirs[i].pgdir + j); + } } /* We also throw away everything when a Guest tells us it's changed a kernel @@ -504,24 +705,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, { /* Look up the matching shadow page directory entry. */ pgd_t *spgd = spgd_addr(cpu, idx, vaddr); +#ifdef CONFIG_X86_PAE + pmd_t *spmd; +#endif /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { - /* Otherwise, we start by releasing the existing entry. */ - pte_t *spte = spte_addr(*spgd, vaddr); - release_pte(*spte); - - /* If they're setting this entry as dirty or accessed, we might - * as well put that entry they've given us in now. This shaves - * 10% off a copy-on-write micro-benchmark. */ - if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { - check_gpte(cpu, gpte); - *spte = gpte_to_spte(cpu, gpte, - pte_flags(gpte) & _PAGE_DIRTY); - } else - /* Otherwise kill it and we can demand_page() it in - * later. */ - *spte = __pte(0); +#ifdef CONFIG_X86_PAE + spmd = spmd_addr(cpu, *spgd, vaddr); + if (pmd_flags(*spmd) & _PAGE_PRESENT) { +#endif + /* Otherwise, we start by releasing + * the existing entry. */ + pte_t *spte = spte_addr(cpu, *spgd, vaddr); + release_pte(*spte); + + /* If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us + * in now. 
This shaves 10% off a + * copy-on-write micro-benchmark. */ + if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + check_gpte(cpu, gpte); + native_set_pte(spte, + gpte_to_spte(cpu, gpte, + pte_flags(gpte) & _PAGE_DIRTY)); + } else + /* Otherwise kill it and we can demand_page() + * it in later. */ + native_set_pte(spte, __pte(0)); +#ifdef CONFIG_X86_PAE + } +#endif } } @@ -568,12 +782,10 @@ void guest_set_pte(struct lg_cpu *cpu, * * So with that in mind here's our code to to update a (top-level) PGD entry: */ -void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) +void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; - /* The kernel seems to try to initialize this early on: we ignore its - * attempts to map over the Switcher. */ if (idx >= SWITCHER_PGD_INDEX) return; @@ -581,8 +793,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) pgdir = find_pgdir(lg, gpgdir); if (pgdir < ARRAY_SIZE(lg->pgdirs)) /* ... throw it away. */ - release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); + release_pgd(lg->pgdirs[pgdir].pgdir + idx); } +#ifdef CONFIG_X86_PAE +void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) +{ + guest_pagetable_clear_all(&lg->cpus[0]); +} +#endif /* Once we know how much memory we have we can construct simple identity * (which set virtual == physical) and linear mappings @@ -596,8 +814,16 @@ static unsigned long setup_pagetables(struct lguest *lg, { pgd_t __user *pgdir; pte_t __user *linear; - unsigned int mapped_pages, i, linear_pages, phys_linear; unsigned long mem_base = (unsigned long)lg->mem_base; + unsigned int mapped_pages, i, linear_pages; +#ifdef CONFIG_X86_PAE + pmd_t __user *pmds; + unsigned int j; + pgd_t pgd; + pmd_t pmd; +#else + unsigned int phys_linear; +#endif /* We have mapped_pages frames to map, so we need * linear_pages page tables to map them. */ @@ -610,6 +836,9 @@ static unsigned long setup_pagetables(struct lguest *lg, /* Now we use the next linear_pages pages as pte pages */ linear = (void *)pgdir - linear_pages * PAGE_SIZE; +#ifdef CONFIG_X86_PAE + pmds = (void *)linear - PAGE_SIZE; +#endif /* Linear mapping is easy: put every page's address into the * mapping in order. */ for (i = 0; i < mapped_pages; i++) { @@ -621,6 +850,22 @@ static unsigned long setup_pagetables(struct lguest *lg, /* The top level points to the linear page table pages above. * We setup the identity and linear mappings here. */ +#ifdef CONFIG_X86_PAE + for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; + i += PTRS_PER_PTE, j++) { + native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) + - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + + if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) + return -EFAULT; + } + + set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) + return -EFAULT; + if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) + return -EFAULT; +#else phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; @@ -633,6 +878,7 @@ static unsigned long setup_pagetables(struct lguest *lg, &pgd, sizeof(pgd))) return -EFAULT; } +#endif /* We return the top level (guest-physical) address: remember where * this is. 
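*/

Aside: the heart of setup_pagetables() is the loop making pte i map page i.
Here is the non-PAE pte-filling half as a standalone sketch (sizes and flag
values are illustrative); the real function then points both the identity
slots and the PAGE_OFFSET slots of the top level at these same pte pages.

#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_PTE 1024
#define FLAGS        0x067UL	/* PRESENT|RW|USER|ACCESSED|DIRTY */

int main(void)
{
	unsigned int i, mapped_pages = 4096;	/* a 16MB Guest */
	unsigned int linear_pages =
		(mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
	unsigned long *linear = calloc(mapped_pages, sizeof(*linear));

	if (!linear)
		return 1;
	/* "put every page's address into the mapping in order" */
	for (i = 0; i < mapped_pages; i++)
		linear[i] = ((unsigned long)i << 12) | FLAGS;

	printf("%u pte pages; pte[1] = %#lx maps phys 0x1000\n",
	       linear_pages, linear[1]);
	free(linear);
	return 0;
}

/* Back to the patch: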
*/ @@ -648,7 +894,10 @@ int init_guest_pagetable(struct lguest *lg) u64 mem; u32 initrd_size; struct boot_params __user *boot = (struct boot_params *)lg->mem_base; - +#ifdef CONFIG_X86_PAE + pgd_t *pgd; + pmd_t *pmd_table; +#endif /* Get the Guest memory size and the ramdisk size from the boot header * located at lg->mem_base (Guest address 0). */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) @@ -663,6 +912,15 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; +#ifdef CONFIG_X86_PAE + pgd = lg->pgdirs[0].pgdir; + pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); + if (!pmd_table) + return -ENOMEM; + + set_pgd(pgd + SWITCHER_PGD_INDEX, + __pgd(__pa(pmd_table) | _PAGE_PRESENT)); +#endif lg->cpus[0].cpu_pgd = 0; return 0; } @@ -672,17 +930,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, - &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 4MB of virtual - * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) - || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) + &cpu->lg->lguest_data->kernel_address) + /* We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. */ + || put_user(RESERVE_MEM * 1024 * 1024, + &cpu->lg->lguest_data->reserve_mem) + || put_user(cpu->lg->pgdirs[0].gpgdir, + &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ +#ifdef CONFIG_X86_PAE + if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && + pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) +#else if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) +#endif kill_guest(cpu, "bad kernel address %#lx", cpu->lg->kernel_address); } @@ -708,16 +973,30 @@ void free_guest_pagetable(struct lguest *lg) void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); - pgd_t switcher_pgd; pte_t regs_pte; unsigned long pfn; +#ifdef CONFIG_X86_PAE + pmd_t switcher_pmd; + pmd_t *pmd_table; + + native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> + PAGE_SHIFT, PAGE_KERNEL_EXEC)); + + pmd_table = __va(pgd_pfn(cpu->lg-> + pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) + << PAGE_SHIFT); + native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); +#else + pgd_t switcher_pgd; + /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); + switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; +#endif /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher @@ -726,8 +1005,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) * page is already mapped there, we don't have to copy them out * again. 
*/ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; - regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); - switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; + native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); + native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], + regs_pte); } /*:*/ @@ -752,21 +1032,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - pte[i] = mk_pte(switcher_page[i], - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); + native_set_pte(&pte[i], mk_pte(switcher_page[i], + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } /* The only other thing we map is this CPU's pair of pages. */ i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); + native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); /* The second page contains the "struct lguest_ro_state", and is * read-only. */ - pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), - __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); + native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } /* We've made it through the page table code. Perhaps our tired brains are diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 7ede64ffeef9..482ed5a18750 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num > ARRAY_SIZE(cpu->arch.gdt)) + if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); /* Set it up, then fix it. */ diff --git a/fs/eventfd.c b/fs/eventfd.c index 2a701d593d35..3f0e1974abdc 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -16,6 +16,7 @@ #include <linux/anon_inodes.h> #include <linux/eventfd.h> #include <linux/syscalls.h> +#include <linux/module.h> struct eventfd_ctx { wait_queue_head_t wqh; @@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n) return n; } +EXPORT_SYMBOL_GPL(eventfd_signal); static int eventfd_release(struct inode *inode, struct file *file) { @@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd) return file; } +EXPORT_SYMBOL_GPL(eventfd_fget); SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 175e63f4a8c0..7bc1440fc473 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -30,6 +30,10 @@ struct lguest_data /* Wallclock time set by the Host. */ struct timespec time; + /* Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + int irq_pending; + /* Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. * This batching can be quite efficient. 
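*/

Aside: from the Launcher's side, LHREQ_EVENTFD is used roughly as in the
fragment below: make an eventfd, register it against a guest-physical
address, then read() to sleep until the Guest's NOTIFY on that address fires
eventfd_signal(). This is a hedged sketch, not the example Launcher's code:
watch_address() is invented, lguest_fd is assumed already open on
/dev/lguest, and the include assumes headers from this very patch.

#include <linux/lguest_launcher.h>	/* LHREQ_EVENTFD */
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

static int watch_address(int lguest_fd, unsigned long notify_addr)
{
	int efd = eventfd(0, 0);
	unsigned long args[3] = { LHREQ_EVENTFD, notify_addr, 0 };
	uint64_t hits;

	if (efd < 0)
		return -1;
	args[2] = efd;
	if (write(lguest_fd, args, sizeof(args)) != sizeof(args))
		return -1;
	/* read(2) sleeps until the Guest notifies; the count says how many
	 * notifications were coalesced while we slept. */
	if (read(efd, &hits, sizeof(hits)) == sizeof(hits))
		printf("%llu notification(s)\n", (unsigned long long)hits);
	return 0;
}

/* The last files of the merge: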
*/
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index a53407a4165c..bfefbdf7498a 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -57,7 +57,8 @@ enum lguest_req
 	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
 	LHREQ_GETDMA, /* No longer used */
 	LHREQ_IRQ, /* + irq */
-	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+	LHREQ_BREAK, /* No longer used */
+	LHREQ_EVENTFD, /* + address, fd. */
 };
 
 /* The alignment to use between consumer and producer parts of vring.
diff --git a/kernel/sched.c b/kernel/sched.c
index f04aa9664504..8ec9d13140be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p)
 	smp_send_reschedule(cpu);
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted