diff options
author | Jens Axboe <axboe@kernel.dk> | 2013-06-28 16:01:14 +0200 |
---|---|---|
committer | Jens Axboe <axboe@kernel.dk> | 2013-06-28 16:01:14 +0200 |
commit | f35546e072a7a86ccb950d4d1508879b0d49e374 (patch) | |
tree | 7e581edae814482ac4cfb00b9aea2e76ebe05f6d /include/xen/interface | |
parent | 36f988e978f81ffa415df4d77bbcd8887917f25c (diff) | |
parent | 1e0f7a21b2fffc70f27cc4a454c60321501045b1 (diff) | |
download | lwn-f35546e072a7a86ccb950d4d1508879b0d49e374.tar.gz lwn-f35546e072a7a86ccb950d4d1508879b0d49e374.zip |
Merge branch 'stable/for-jens-3.10' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-3.11/drivers
Konrad writes:
It has the 'feature-max-indirect-segments' implemented in both backend
and frontend. The current problem with the backend and frontend is that the
segment size is limited to 11 pages. It means we can at most squeeze in 44kB per
request. The ring can hold 32 (next power of two below 36) requests, meaning we
can do 1.4M of outstanding requests. Nowadays that is not enough.
The problem in the past was addressed in two ways - but neither one went upstream.
The first solution to this proposed by Justin from Spectralogic was to negotiate
the segment size. This means that the ‘struct blkif_sring_entry’ is now a variable size.
It can expand from 112 bytes (cover 11 pages of data - 44kB) to 1580 bytes
(256 pages of data - so 1MB). It is a simple extension by just making the array in the
request expand from 11 to a variable size negotiated. But it had limits: this extension
still limits the number of segments per request to 255 (as the total number must be
specified in the request, which only has an 8-bit field for that purpose).
The other solution (from Intel - Ronghui) was to create one extra ring that only has the
‘struct blkif_request_segment’ in them. The ‘struct blkif_request’ would be changed to have
an index in said ‘segment ring’. There is only one segment ring. This means that the size of
the initial ring is still the same. The requests would point to the segment and enumerate out
how many of the indexes it wants to use. The limit is of course the size of the segment.
If one assumes a one-page segment this means we can in one request cover ~4MB.
Those patches were posted as RFC and the author never followed up on the ideas on changing
it to be a bit more flexible.
There is yet another mechanism that could be employed (which these patches implement) - and it
borrows from VirtIO protocol. And that is the ‘indirect descriptors’. This very similar to
what Intel suggests, but with a twist. The twist is to negotiate how many of these
'segment' pages (aka indirect descriptor pages) we want to support (in reality we negotiate
how many entries in the segment we want to cover, and we module the number if it is
bigger than the segment size).
This means that with the existing 36 slots in the ring (single page) we can cover:
32 slots * each blkif_request_indirect covers: 512 * 4096 ~= 64M. Since we ample space
in the blkif_request_indirect to span more than one indirect page, that number (64M)
can be also multiplied by eight = 512MB.
Roger Pau Monne took the idea and implemented them in these patches. They work
great and the corner cases (migration between backends with and without this extension)
work nicely. The backend has a limit right now off how many indirect entries
it can handle: one indirect page, and at maximum 256 entries (out of 512 - so 50% of the page
is used). That comes out to 32 slots * 256 entries in a indirect page * 1 indirect page
per request * 4096 = 32MB.
This is a conservative number that can change in the future. Right now it strikes
a good balance between giving excellent performance, memory usage in the backend, and
balancing the needs of many guests.
In the patchset there is also the split of the blkback structure to be per-VBD.
This means that the spinlock contention we had with many guests trying to do I/O and
all the blkback threads hitting the same lock has been eliminated.
Also there are bug-fixes to deal with oddly sized sectors, insane amounts on
th ring, and also a security fix (posted earlier).
Diffstat (limited to 'include/xen/interface')
-rw-r--r-- | include/xen/interface/io/blkif.h | 53 | ||||
-rw-r--r-- | include/xen/interface/io/ring.h | 5 |
2 files changed, 58 insertions, 0 deletions
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index ffd4652de91c..65e12099ef89 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -103,12 +103,46 @@ typedef uint64_t blkif_sector_t; #define BLKIF_OP_DISCARD 5 /* + * Recognized if "feature-max-indirect-segments" in present in the backend + * xenbus info. The "feature-max-indirect-segments" node contains the maximum + * number of segments allowed by the backend per request. If the node is + * present, the frontend might use blkif_request_indirect structs in order to + * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The + * maximum number of indirect segments is fixed by the backend, but the + * frontend can issue requests with any number of indirect segments as long as + * it's less than the number provided by the backend. The indirect_grefs field + * in blkif_request_indirect should be filled by the frontend with the + * grant references of the pages that are holding the indirect segments. + * This pages are filled with an array of blkif_request_segment_aligned + * that hold the information about the segments. The number of indirect + * pages to use is determined by the maximum number of segments + * a indirect request contains. Every indirect page can contain a maximum + * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)), + * so to calculate the number of indirect pages to use we have to do + * ceil(indirect_segments/512). + * + * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not* + * create the "feature-max-indirect-segments" node! + */ +#define BLKIF_OP_INDIRECT 6 + +/* * Maximum scatter/gather segments per request. * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. * NB. This could be 12 if the ring indexes weren't stored in the same page. */ #define BLKIF_MAX_SEGMENTS_PER_REQUEST 11 +#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8 + +struct blkif_request_segment_aligned { + grant_ref_t gref; /* reference to I/O buffer frame */ + /* @first_sect: first sector in frame to transfer (inclusive). */ + /* @last_sect: last sector in frame to transfer (inclusive). */ + uint8_t first_sect, last_sect; + uint16_t _pad; /* padding to make it 8 bytes, so it's cache-aligned */ +} __attribute__((__packed__)); + struct blkif_request_rw { uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ @@ -147,12 +181,31 @@ struct blkif_request_other { uint64_t id; /* private guest value, echoed in resp */ } __attribute__((__packed__)); +struct blkif_request_indirect { + uint8_t indirect_op; + uint16_t nr_segments; +#ifdef CONFIG_X86_64 + uint32_t _pad1; /* offsetof(blkif_...,u.indirect.id) == 8 */ +#endif + uint64_t id; + blkif_sector_t sector_number; + blkif_vdev_t handle; + uint16_t _pad2; + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; +#ifdef CONFIG_X86_64 + uint32_t _pad3; /* make it 64 byte aligned */ +#else + uint64_t _pad3; /* make it 64 byte aligned */ +#endif +} __attribute__((__packed__)); + struct blkif_request { uint8_t operation; /* BLKIF_OP_??? */ union { struct blkif_request_rw rw; struct blkif_request_discard discard; struct blkif_request_other other; + struct blkif_request_indirect indirect; } u; } __attribute__((__packed__)); diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 75271b9a8f61..7d28aff605c7 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -188,6 +188,11 @@ struct __name##_back_ring { \ #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) +/* Ill-behaved frontend determination: Can there be this many requests? */ +#define RING_REQUEST_PROD_OVERFLOW(_r, _prod) \ + (((_prod) - (_r)->rsp_prod_pvt) > RING_SIZE(_r)) + + #define RING_PUSH_REQUESTS(_r) do { \ wmb(); /* back sees requests /before/ updated producer index */ \ (_r)->sring->req_prod = (_r)->req_prod_pvt; \ |