diff options
Diffstat (limited to 'kernel/trace/ring_buffer.c')
| -rw-r--r-- | kernel/trace/ring_buffer.c | 1252 |
1 files changed, 862 insertions, 390 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb6089c2951e..7b07d2004cc6 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,7 +4,10 @@ * * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> */ +#include <linux/ring_buffer_types.h> +#include <linux/sched/isolation.h> #include <linux/trace_recursion.h> +#include <linux/panic_notifier.h> #include <linux/trace_events.h> #include <linux/ring_buffer.h> #include <linux/trace_clock.h> @@ -29,8 +32,10 @@ #include <linux/oom.h> #include <linux/mm.h> +#include <asm/ring_buffer.h> #include <asm/local64.h> #include <asm/local.h> +#include <asm/setup.h> #include "trace.h" @@ -48,9 +53,12 @@ static void update_pages_handler(struct work_struct *work); struct ring_buffer_meta { int magic; - int struct_size; - unsigned long text_addr; - unsigned long data_addr; + int struct_sizes; + unsigned long total_size; + unsigned long buffers_offset; +}; + +struct ring_buffer_cpu_meta { unsigned long first_buffer; unsigned long head_buffer; unsigned long commit_buffer; @@ -152,23 +160,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -311,10 +302,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -333,12 +320,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -398,11 +379,38 @@ static void free_buffer_page(struct buffer_page *bpage) } /* - * We need to fit the time_stamp delta into 27 bits. + * For best performance, allocate cpu buffer data cache line sized + * and per CPU. */ -static inline bool test_time_stamp(u64 delta) +#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \ + kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +#define alloc_cpu_page(cpu) (struct buffer_page *) \ + kzalloc_node(ALIGN(sizeof(struct buffer_page), \ + cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); + +static struct buffer_data_page *alloc_cpu_data(int cpu, int order) { - return !!(delta & TS_DELTA_TEST); + struct buffer_data_page *dpage; + struct page *page; + gfp_t mflags; + + /* + * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails + * gracefully without invoking oom-killer and the system is not + * destabilized. + */ + mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO; + + page = alloc_pages_node(cpu_to_node(cpu), mflags, order); + if (!page) + return NULL; + + dpage = page_address(page); + rb_init_page(dpage); + + return dpage; } struct rb_irq_work { @@ -515,9 +523,11 @@ struct ring_buffer_per_cpu { unsigned int mapped; unsigned int user_mapped; /* user space mapping */ struct mutex mapping_lock; - unsigned long *subbuf_ids; /* ID to subbuf VA */ + struct buffer_page **subbuf_ids; /* ID to subbuf VA */ struct trace_buffer_meta *meta_page; - struct ring_buffer_meta *ring_meta; + struct ring_buffer_cpu_meta *ring_meta; + + struct ring_buffer_remote *remote; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -541,6 +551,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -549,9 +561,9 @@ struct trace_buffer { unsigned long range_addr_start; unsigned long range_addr_end; + struct notifier_block flush_nb; - long last_text_delta; - long last_data_delta; + struct ring_buffer_meta *meta; unsigned int subbuf_size; unsigned int subbuf_order; @@ -588,16 +600,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq (unsigned int)sizeof(field.commit), (unsigned int)is_signed_type(long)); - trace_seq_printf(s, "\tfield: int overwrite;\t" + trace_seq_printf(s, "\tfield: char overwrite;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), commit), 1, - (unsigned int)is_signed_type(long)); + (unsigned int)is_signed_type(char)); trace_seq_printf(s, "\tfield: char data;\t" "offset:%u;\tsize:%u;\tsigned:%u;\n", (unsigned int)offsetof(typeof(field), data), - (unsigned int)buffer->subbuf_size, + (unsigned int)(buffer ? buffer->subbuf_size : + PAGE_SIZE - BUF_PAGE_HDR_SIZE), (unsigned int)is_signed_type(char)); return !trace_seq_has_overflowed(s); @@ -1271,7 +1284,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) rb_set_list_to_head(head->list.prev); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->head_buffer = (unsigned long)head->page; } } @@ -1355,6 +1368,13 @@ static inline void rb_inc_page(struct buffer_page **bpage) *bpage = list_entry(p, struct buffer_page, list); } +static inline void rb_dec_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.prev); + + *bpage = list_entry(p, struct buffer_page, list); +} + static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1569,7 +1589,7 @@ out_locked: static unsigned long rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) { - addr += sizeof(struct ring_buffer_meta) + + addr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_subbufs; return ALIGN(addr, subbuf_size); } @@ -1580,19 +1600,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs) static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) { int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE; - unsigned long ptr = buffer->range_addr_start; - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; + struct ring_buffer_meta *bmeta; + unsigned long ptr; int nr_subbufs; - if (!ptr) + bmeta = buffer->meta; + if (!bmeta) return NULL; + ptr = (unsigned long)bmeta + bmeta->buffers_offset; + meta = (struct ring_buffer_cpu_meta *)ptr; + /* When nr_pages passed in is zero, the first meta has already been initialized */ if (!nr_pages) { - meta = (struct ring_buffer_meta *)ptr; nr_subbufs = meta->nr_subbufs; } else { - meta = NULL; /* Include the reader page */ nr_subbufs = nr_pages + 1; } @@ -1624,7 +1647,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu) } /* Return the start of subbufs given the meta pointer */ -static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) +static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta) { int subbuf_size = meta->subbuf_size; unsigned long ptr; @@ -1640,7 +1663,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta) */ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long ptr; int subbuf_size; @@ -1666,14 +1689,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx) } /* + * See if the existing memory contains a valid meta section. + * if so, use that, otherwise initialize it. + */ +static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size) +{ + unsigned long ptr = buffer->range_addr_start; + struct ring_buffer_meta *bmeta; + unsigned long total_size; + int struct_sizes; + + bmeta = (struct ring_buffer_meta *)ptr; + buffer->meta = bmeta; + + total_size = buffer->range_addr_end - buffer->range_addr_start; + + struct_sizes = sizeof(struct ring_buffer_cpu_meta); + struct_sizes |= sizeof(*bmeta) << 16; + + /* The first buffer will start word size after the meta page */ + ptr += sizeof(*bmeta); + ptr = ALIGN(ptr, sizeof(long)); + ptr += scratch_size; + + if (bmeta->magic != RING_BUFFER_META_MAGIC) { + pr_info("Ring buffer boot meta mismatch of magic\n"); + goto init; + } + + if (bmeta->struct_sizes != struct_sizes) { + pr_info("Ring buffer boot meta mismatch of struct size\n"); + goto init; + } + + if (bmeta->total_size != total_size) { + pr_info("Ring buffer boot meta mismatch of total size\n"); + goto init; + } + + if (bmeta->buffers_offset > bmeta->total_size) { + pr_info("Ring buffer boot meta mismatch of offset outside of total size\n"); + goto init; + } + + if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) { + pr_info("Ring buffer boot meta mismatch of first buffer offset\n"); + goto init; + } + + return true; + + init: + bmeta->magic = RING_BUFFER_META_MAGIC; + bmeta->struct_sizes = struct_sizes; + bmeta->total_size = total_size; + bmeta->buffers_offset = (void *)ptr - (void *)bmeta; + + /* Zero out the scratch pad */ + memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta)); + + return false; +} + +/* * See if the existing memory contains valid ring buffer data. * As the previous kernel must be the same as this kernel, all * the calculations (size of buffers and number of buffers) * must be the same. */ -static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, - struct trace_buffer *buffer, int nr_pages, - unsigned long *subbuf_mask) +static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu, + struct trace_buffer *buffer, int nr_pages, + unsigned long *subbuf_mask) { int subbuf_size = PAGE_SIZE; struct buffer_data_page *subbuf; @@ -1684,20 +1770,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, if (!subbuf_mask) return false; - /* Check the meta magic and meta struct size */ - if (meta->magic != RING_BUFFER_META_MAGIC || - meta->struct_size != sizeof(*meta)) { - pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu); - return false; - } - - /* The subbuffer's size and number of subbuffers must match */ - if (meta->subbuf_size != subbuf_size || - meta->nr_subbufs != nr_pages + 1) { - pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu); - return false; - } - buffers_start = meta->first_buffer; buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs); @@ -1743,7 +1815,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu, return true; } -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf); +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf); static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu, unsigned long long *timestamp, u64 *delta_ptr) @@ -1751,6 +1823,7 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu struct ring_buffer_event *event; u64 ts, delta; int events = 0; + int len; int e; *delta_ptr = 0; @@ -1758,9 +1831,12 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu ts = dpage->time_stamp; - for (e = 0; e < tail; e += rb_event_length(event)) { + for (e = 0; e < tail; e += len) { event = (struct ring_buffer_event *)(dpage->data + e); + len = rb_event_length(event); + if (len <= 0 || len > tail - e) + return -1; switch (event->type_len) { @@ -1810,38 +1886,133 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) /* If the meta data has been validated, now validate the events */ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; + struct buffer_page *head_page, *orig_head, *orig_reader; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; + u64 ts; int i; if (!meta || !meta->head_buffer) return; + orig_head = head_page = cpu_buffer->head_page; + orig_reader = cpu_buffer->reader_page; + /* Do the reader page first */ - ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu); + ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu); if (ret < 0) { pr_info("Ring buffer reader page is invalid\n"); goto invalid; } entries += ret; - entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); - local_set(&cpu_buffer->reader_page->entries, ret); + entry_bytes += local_read(&orig_reader->page->commit); + local_set(&orig_reader->entries, ret); - head_page = cpu_buffer->head_page; + ts = head_page->page->time_stamp; + + /* + * Try to rewind the head so that we can read the pages which already + * read in the previous boot. + */ + if (head_page == cpu_buffer->tail_page) + goto skip_rewind; + + rb_dec_page(&head_page); + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { + + /* Rewind until tail (writer) page. */ + if (head_page == cpu_buffer->tail_page) + break; + + /* Ensure the page has older data than head. */ + if (ts < head_page->page->time_stamp) + break; + + ts = head_page->page->time_stamp; + /* Ensure the page has correct timestamp and some data. */ + if (!ts || rb_page_commit(head_page) == 0) + break; + + /* Stop rewind if the page is invalid. */ + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) + break; + + /* Recover the number of entries and update stats. */ + local_set(&head_page->entries, ret); + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; + entry_bytes += rb_page_commit(head_page); + } + if (i) + pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); + + /* The last rewound page must be skipped. */ + if (head_page != orig_head) + rb_inc_page(&head_page); - /* If both the head and commit are on the reader_page then we are done. */ - if (head_page == cpu_buffer->reader_page && - head_page == cpu_buffer->commit_page) + /* + * If the ring buffer was rewound, then inject the reader page + * into the location just before the original head page. + */ + if (head_page != orig_head) { + struct buffer_page *bpage = orig_head; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. + */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } + + /* We'll restart verifying from orig_head */ + head_page = orig_head; + } + + skip_rewind: + /* If the commit_buffer is the reader page, update the commit page */ + if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { + cpu_buffer->commit_page = cpu_buffer->reader_page; + /* Nothing more to do, the only page is the reader page */ goto done; + } /* Iterate until finding the commit page */ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) { - /* Reader page has already been done */ - if (head_page == cpu_buffer->reader_page) + /* The original reader page has already been checked/counted. */ + if (head_page == orig_reader) continue; ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); @@ -1857,7 +2028,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entries += ret; entry_bytes += local_read(&head_page->page->commit); - local_set(&cpu_buffer->head_page->entries, ret); + local_set(&head_page->entries, ret); if (head_page == cpu_buffer->commit_page) break; @@ -1891,24 +2062,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) } } -/* Used to calculate data delta */ -static char rb_data_ptr[] = ""; - -#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr) -#define THIS_DATA_PTR ((unsigned long)rb_data_ptr) - -static void rb_meta_init_text_addr(struct ring_buffer_meta *meta) -{ - meta->text_addr = THIS_TEXT_PTR; - meta->data_addr = THIS_DATA_PTR; -} - -static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) +static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size) { - struct ring_buffer_meta *meta; + struct ring_buffer_cpu_meta *meta; unsigned long *subbuf_mask; unsigned long delta; void *subbuf; + bool valid = false; int cpu; int i; @@ -1916,20 +2076,21 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL); /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */ + if (rb_meta_init(buffer, scratch_size)) + valid = true; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *next_meta; meta = rb_range_meta(buffer, nr_pages, cpu); - if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { + if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) { /* Make the mappings match the current address */ subbuf = rb_subbufs_from_meta(meta); delta = (unsigned long)subbuf - meta->first_buffer; meta->first_buffer += delta; meta->head_buffer += delta; meta->commit_buffer += delta; - buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr; - buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr; continue; } @@ -1940,16 +2101,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) memset(meta, 0, next_meta - (void *)meta); - meta->magic = RING_BUFFER_META_MAGIC; - meta->struct_size = sizeof(*meta); - meta->nr_subbufs = nr_pages + 1; meta->subbuf_size = PAGE_SIZE; subbuf = rb_subbufs_from_meta(meta); meta->first_buffer = (unsigned long)subbuf; - rb_meta_init_text_addr(meta); /* * The buffers[] array holds the order of the sub-buffers @@ -1971,7 +2128,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages) static void *rbm_start(struct seq_file *m, loff_t *pos) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val; if (!meta) @@ -1996,7 +2153,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos) static int rbm_show(struct seq_file *m, void *v) { struct ring_buffer_per_cpu *cpu_buffer = m->private; - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long val = (unsigned long)v; if (val == 1) { @@ -2045,7 +2202,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *bpage) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; if (meta->head_buffer == (unsigned long)bpage->page) cpu_buffer->head_page = bpage; @@ -2056,14 +2213,48 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id) +{ + return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { struct trace_buffer *buffer = cpu_buffer->buffer; - struct ring_buffer_meta *meta = NULL; + struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; - gfp_t mflags; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2078,13 +2269,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, return -ENOMEM; /* - * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails - * gracefully without invoking oom-killer and the system is not - * destabilized. - */ - mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL; - - /* * If a user thread allocates too much, and si_mem_available() * reports there's enough memory, even though there is not. * Make sure the OOM killer kills this thread. This can happen @@ -2099,11 +2283,15 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { - struct page *page; - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu_buffer->cpu)); + bpage = alloc_cpu_page(cpu_buffer->cpu); if (!bpage) goto free_pages; @@ -2125,14 +2313,21 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), - mflags | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); + if (!bpage->page) goto free_pages; - bpage->page = page_address(page); - rb_init_page(bpage->page); } bpage->order = cpu_buffer->buffer->subbuf_order; @@ -2183,14 +2378,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_per_cpu * rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; + struct ring_buffer_per_cpu *cpu_buffer __free(kfree) = + alloc_cpu_buffer(cpu); + struct ring_buffer_cpu_meta *meta; struct buffer_page *bpage; - struct page *page; int ret; - cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); if (!cpu_buffer) return NULL; @@ -2206,10 +2399,9 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) init_waitqueue_head(&cpu_buffer->irq_work.full_waiters); mutex_init(&cpu_buffer->mapping_lock); - bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - GFP_KERNEL, cpu_to_node(cpu)); + bpage = alloc_cpu_page(cpu); if (!bpage) - goto fail_free_buffer; + return NULL; rb_check_bpage(cpu_buffer, bpage); @@ -2228,14 +2420,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if (!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) + int order = cpu_buffer->buffer->subbuf_order; + bpage->page = alloc_cpu_data(cpu, order); + if (!bpage->page) goto fail_free_reader; - bpage->page = page_address(page); - rb_init_page(bpage->page); } INIT_LIST_HEAD(&cpu_buffer->reader_page->list); @@ -2275,13 +2488,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) rb_head_page_activate(cpu_buffer); } - return cpu_buffer; + return_ptr(cpu_buffer); fail_free_reader: free_buffer_page(cpu_buffer->reader_page); - fail_free_buffer: - kfree(cpu_buffer); return NULL; } @@ -2292,6 +2503,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2310,12 +2524,24 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) kfree(cpu_buffer); } +/* Stop recording on a persistent buffer and flush cache if needed. */ +static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data) +{ + struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb); + + ring_buffer_record_off(buffer); + arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end); + return NOTIFY_DONE; +} + static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, - struct lock_class_key *key) + unsigned long scratch_size, + struct lock_class_key *key, + struct ring_buffer_remote *remote) { - struct trace_buffer *buffer; + struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; int subbuf_size; int bsize; @@ -2329,7 +2555,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, return NULL; if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) - goto fail_free_buffer; + return NULL; buffer->subbuf_order = order; subbuf_size = (PAGE_SIZE << order); @@ -2353,12 +2579,27 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { + unsigned long buffers_start; unsigned long ptr; int n; - size = end - start; + /* Make sure that start is word aligned */ + start = ALIGN(start, sizeof(long)); + + /* scratch_size needs to be aligned too */ + scratch_size = ALIGN(scratch_size, sizeof(long)); + + /* Subtract the buffer meta data and word aligned */ + buffers_start = start + sizeof(struct ring_buffer_cpu_meta); + buffers_start = ALIGN(buffers_start, sizeof(long)); + buffers_start += scratch_size; + + /* Calculate the size for the per CPU data */ + size = end - buffers_start; size = size / nr_cpu_ids; /* @@ -2368,7 +2609,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, * needed, plus account for the integer array index that * will be appended to the meta data. */ - nr_pages = (size - sizeof(struct ring_buffer_meta)) / + nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) / (subbuf_size + sizeof(int)); /* Need at least two pages plus the reader page */ if (nr_pages < 3) @@ -2376,8 +2617,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, again: /* Make sure that the size fits aligned */ - for (n = 0, ptr = start; n < nr_cpu_ids; n++) { - ptr += sizeof(struct ring_buffer_meta) + + for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) { + ptr += sizeof(struct ring_buffer_cpu_meta) + sizeof(int) * nr_pages; ptr = ALIGN(ptr, subbuf_size); ptr += subbuf_size * nr_pages; @@ -2394,7 +2635,16 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_start = start; buffer->range_addr_end = end; - rb_range_meta_init(buffer, nr_pages); + rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2403,7 +2653,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2415,7 +2664,13 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, mutex_init(&buffer->mutex); - return buffer; + /* Persistent ring buffer needs to flush cache before reboot. */ + if (start && end) { + buffer->flush_nb.notifier_call = rb_flush_buffer_cb; + atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb); + } + + return_ptr(buffer); fail_free_buffers: for_each_buffer_cpu(buffer, cpu) { @@ -2427,8 +2682,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - fail_free_buffer: - kfree(buffer); return NULL; } @@ -2447,7 +2700,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0,key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2459,6 +2712,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); * @order: sub-buffer order * @start: start of allocated range * @range_size: size of allocated range + * @scratch_size: size of scratch area (for preallocated memory buffers) * @key: ring buffer reader_lock_key. * * Currently the only flag that is available is the RB_FL_OVERWRITE @@ -2469,32 +2723,40 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc); struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long range_size, + unsigned long scratch_size, struct lock_class_key *key) { - return alloc_buffer(size, flags, order, start, start + range_size, key); + return alloc_buffer(size, flags, order, start, start + range_size, + scratch_size, key, NULL); } /** - * ring_buffer_last_boot_delta - return the delta offset from last boot - * @buffer: The buffer to return the delta from - * @text: Return text delta - * @data: Return data delta - * - * Returns: The true if the delta is non zero + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. */ -bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, - long *data) +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) { - if (!buffer) - return false; + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); +} - if (!buffer->last_text_delta) - return false; +void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) +{ + struct ring_buffer_meta *meta; + void *ptr; - *text = buffer->last_text_delta; - *data = buffer->last_data_delta; + if (!buffer || !buffer->meta) + return NULL; - return true; + meta = buffer->meta; + + ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long)); + + if (size) + *size = (void *)meta + meta->buffers_offset - ptr; + + return ptr; } /** @@ -2506,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer) { int cpu; + if (buffer->range_addr_start && buffer->range_addr_end) + atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb); + cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node); irq_work_sync(&buffer->irq_work.work); @@ -2794,6 +3059,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; + /* + * Keep CPUs from coming online while resizing to synchronize + * with new per CPU buffers being created. + */ + guard(cpus_read_lock)(); + /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); atomic_inc(&buffer->resizing); @@ -2838,7 +3109,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cond_resched(); } - cpus_read_lock(); /* * Fire off all the required work handlers * We can't schedule on offline CPUs, but it's not necessary @@ -2878,7 +3148,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, cpu_buffer->nr_pages_to_update = 0; } - cpus_read_unlock(); } else { cpu_buffer = buffer->buffers[cpu_id]; @@ -2906,8 +3175,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, goto out_err; } - cpus_read_lock(); - /* Can't run something on an offline CPU. */ if (!cpu_online(cpu_id)) rb_update_pages(cpu_buffer); @@ -2926,7 +3193,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } cpu_buffer->nr_pages_to_update = 0; - cpus_read_unlock(); } out: @@ -2971,6 +3237,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, list) { list_del_init(&bpage->list); free_buffer_page(bpage); + + cond_resched(); } } out_err_unlock: @@ -3105,7 +3373,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) } /* Return the index into the sub-buffers for a given sub-buffer */ -static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) +static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf) { void *subbuf_array; @@ -3117,7 +3385,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf) static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *next_page) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; unsigned long old_head = (unsigned long)next_page->page; unsigned long new_head; @@ -3134,7 +3402,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer, static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *reader) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; void *old_reader = cpu_buffer->reader_page->page; void *new_reader = reader->page; int id; @@ -3763,7 +4031,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) rb_page_write(cpu_buffer->commit_page)); rb_inc_page(&cpu_buffer->commit_page); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page; } /* add barrier to keep gcc from optimizing too much */ @@ -3845,19 +4113,36 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer) rb_end_commit(cpu_buffer); } +static bool +rb_irq_work_queue(struct rb_irq_work *irq_work) +{ + int cpu; + + /* irq_work_queue_on() is not NMI-safe */ + if (unlikely(in_nmi())) + return irq_work_queue(&irq_work->work); + + /* + * If CPU isolation is not active, cpu is always the current + * CPU, and the following is equivallent to irq_work_queue(). + */ + cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE); + return irq_work_queue_on(&irq_work->work, cpu); +} + static __always_inline void rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) { if (buffer->irq_work.waiters_pending) { buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&buffer->irq_work.work); + rb_irq_work_queue(&buffer->irq_work); } if (cpu_buffer->irq_work.waiters_pending) { cpu_buffer->irq_work.waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + rb_irq_work_queue(&cpu_buffer->irq_work); } if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched)) @@ -3877,7 +4162,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->irq_work.wakeup_full = true; cpu_buffer->irq_work.full_waiters_pending = false; /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&cpu_buffer->irq_work.work); + rb_irq_work_queue(&cpu_buffer->irq_work); } #ifdef CONFIG_RING_BUFFER_RECORD_RECURSION @@ -4066,7 +4351,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static const char *show_irq_str(int bits) { - const char *type[] = { + static const char * type[] = { ".", // 0 "s", // 1 "h", // 2 @@ -4244,18 +4529,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta); if (ret < 0) { if (delta < ts) { - buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n", - cpu_buffer->cpu, ts, delta); + buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n", + cpu_buffer->cpu, ts, delta, + cpu_buffer->buffer->clock); goto out; } } if ((full && ts > info->ts) || (!full && ts + info->delta != info->ts)) { - buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n", + buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS", cpu_buffer->cpu, ts + info->delta, info->ts, info->delta, info->before, info->after, - full ? " (full)" : "", show_interrupt_level()); + full ? " (full)" : "", show_interrupt_level(), + cpu_buffer->buffer->clock); } out: atomic_dec(this_cpu_ptr(&checking)); @@ -4629,10 +4916,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); rb_decrement_entry(cpu_buffer, event); - if (rb_try_to_discard(cpu_buffer, event)) - goto out; - - out: + rb_try_to_discard(cpu_buffer, event); rb_end_commit(cpu_buffer); trace_recursive_unlock(cpu_buffer); @@ -4665,26 +4949,26 @@ int ring_buffer_write(struct trace_buffer *buffer, int ret = -EBUSY; int cpu; - preempt_disable_notrace(); + guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) - goto out; + return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) - goto out; + return -EBUSY; if (length > buffer->max_data_size) - goto out; + return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) - goto out; + return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) @@ -4702,10 +4986,6 @@ int ring_buffer_write(struct trace_buffer *buffer, out_unlock: trace_recursive_unlock(cpu_buffer); - - out: - preempt_enable_notrace(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); @@ -4830,6 +5110,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer) } /** + * ring_buffer_record_is_on_cpu - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * @cpu: The CPU to test if the ring buffer can write too + * + * Returns true if the ring buffer is in a state that it accepts writes + * for a particular CPU. + */ +bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = buffer->buffers[cpu]; + + return ring_buffer_record_is_set_on(buffer) && + !atomic_read(&cpu_buffer->record_disabled); +} + +/** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. * @cpu: The CPU buffer to stop @@ -5072,14 +5370,66 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + +static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *next, *orig; + int retry = 3; + + orig = next = cpu_buffer->head_page; + rb_inc_page(&next); + + /* Run after the writer */ + while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) { + rb_inc_page(&next); + + rb_list_head_clear(cpu_buffer->head_page->list.prev); + rb_inc_page(&cpu_buffer->head_page); + rb_set_list_to_head(cpu_buffer->head_page->list.prev); + + if (cpu_buffer->head_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } + + orig = cpu_buffer->commit_page = cpu_buffer->head_page; + retry = 3; + + while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) { + rb_inc_page(&next); + rb_inc_page(&cpu_buffer->commit_page); + + if (cpu_buffer->commit_page == orig) { + if (WARN_ON_ONCE(!(--retry))) + return; + } + } +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + if (cpu_buffer->remote) { + rb_read_remote_meta_page(cpu_buffer); + rb_update_remote_head(cpu_buffer); + } + /* Iterator usage is expected to have record disabled */ iter->head_page = cpu_buffer->reader_page; iter->head = cpu_buffer->reader_page->read; iter->next_event = iter->head; + iter->missed_events = 0; iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; @@ -5226,7 +5576,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(prev_reader == new_reader); + + prev_head = new_reader; /* New reader was also the previous head */ + new_head = prev_head; + rb_inc_page(&new_head); + last = prev_head; + rb_dec_page(&last); + + /* Clear the old HEAD flag */ + rb_list_head_clear(cpu_buffer->head_page->list.prev); + + prev_reader->list.next = prev_head->list.next; + prev_reader->list.prev = prev_head->list.prev; + + /* Swap prev_reader with new_reader */ + last->list.next = &prev_reader->list; + new_head->list.prev = &prev_reader->list; + + new_reader->list.prev = &new_reader->list; + new_reader->list.next = &new_head->list; + + /* Reactivate the HEAD flag */ + rb_set_list_to_head(&last->list); + + cpu_buffer->head_page = new_head; + cpu_buffer->reader_page = new_reader; + cpu_buffer->pages = &new_head->list; + cpu_buffer->read_stamp = new_reader->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5275,7 +5683,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: @@ -5318,7 +5725,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) * moving it. The page before the header page has the * flag bit '1' set if it is pointing to the page we want. * but if the writer is in the process of moving it - * than it will be '2' or already moved '0'. + * then it will be '2' or already moved '0'. */ ret = rb_head_page_replace(reader, cpu_buffer->reader_page); @@ -5397,6 +5804,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -5695,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts, */ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter) { - bool ret = iter->missed_events != 0; - - iter->missed_events = 0; - return ret; + return iter->missed_events != 0; } EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped); @@ -5779,24 +6190,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. - * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5804,7 +6211,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kzalloc(sizeof(*iter), flags); + iter = kzalloc_obj(*iter, flags); if (!iter) return NULL; @@ -5822,51 +6229,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. - */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -5903,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter) unsigned long flags; raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); - + iter->missed_events = 0; rb_advance_iter(iter); raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); @@ -5947,6 +6315,39 @@ static void rb_clear_buffer_page(struct buffer_page *page) page->read = 0; } +/* + * When the buffer is memory mapped to user space, each sub buffer + * has a unique id that is used by the meta data to tell the user + * where the current reader page is. + * + * For a normal allocated ring buffer, the id is saved in the buffer page + * id field, and updated via this function. + * + * But for a fixed memory mapped buffer, the id is already assigned for + * fixed memory ordering in the memory layout and can not be used. Instead + * the index of where the page lies in the memory layout is used. + * + * For the normal pages, set the buffer page id with the passed in @id + * value and return that. + * + * For fixed memory mapped pages, get the page index in the memory layout + * and return that as the id. + */ +static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage, int id) +{ + /* + * For boot buffers, the id is the index, + * otherwise, set the buffer page with this id + */ + if (cpu_buffer->ring_meta) + id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page); + else + bpage->id = id; + + return id; +} + static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; @@ -5955,15 +6356,19 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer) return; meta->reader.read = cpu_buffer->reader_page->read; - meta->reader.id = cpu_buffer->reader_page->id; + meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, + cpu_buffer->reader_page->id); + meta->reader.lost_events = cpu_buffer->lost_events; meta->entries = local_read(&cpu_buffer->entries); meta->overrun = local_read(&cpu_buffer->overrun); meta->read = cpu_buffer->read; + meta->pages_lost = local_read(&cpu_buffer->pages_lost); + meta->pages_touched = local_read(&cpu_buffer->pages_touched); /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page)); + flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE); } static void @@ -5971,6 +6376,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6016,7 +6438,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) if (cpu_buffer->mapped) { rb_update_meta_page(cpu_buffer); if (cpu_buffer->ring_meta) { - struct ring_buffer_meta *meta = cpu_buffer->ring_meta; + struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; meta->commit_buffer = meta->head_buffer; } } @@ -6025,21 +6447,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) /* Must have disabled the cpu buffer then done a synchronize_rcu */ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) - goto out; + return; arch_spin_lock(&cpu_buffer->lock); rb_reset_cpu(cpu_buffer); arch_spin_unlock(&cpu_buffer->lock); - - out: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } /** @@ -6050,7 +6467,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; - struct ring_buffer_meta *meta; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; @@ -6069,11 +6485,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) atomic_dec(&cpu_buffer->record_disabled); atomic_dec(&cpu_buffer->resize_disabled); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - mutex_unlock(&buffer->mutex); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); @@ -6088,7 +6499,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { struct ring_buffer_per_cpu *cpu_buffer; - struct ring_buffer_meta *meta; int cpu; /* prevent another thread from changing buffer sizes */ @@ -6116,11 +6526,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) reset_disabled_cpu_buffer(cpu_buffer); - /* Make sure persistent meta now uses this buffer's addresses */ - meta = rb_range_meta(buffer, 0, cpu_buffer->cpu); - if (meta) - rb_meta_init_text_addr(meta); - atomic_dec(&cpu_buffer->record_disabled); atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled); } @@ -6218,6 +6623,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6239,37 +6684,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || !cpumask_test_cpu(cpu, buffer_b->cpumask)) - goto out; + return -EINVAL; cpu_buffer_a = buffer_a->buffers[cpu]; cpu_buffer_b = buffer_b->buffers[cpu]; /* It's up to the callers to not try to swap mapped buffers */ - if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) { - ret = -EBUSY; - goto out; - } + if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) + return -EBUSY; /* At least make sure the two buffers are somewhat the same */ if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages) - goto out; + return -EINVAL; if (buffer_a->subbuf_order != buffer_b->subbuf_order) - goto out; - - ret = -EAGAIN; + return -EINVAL; if (atomic_read(&buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&buffer_b->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_a->record_disabled)) - goto out; + return -EAGAIN; if (atomic_read(&cpu_buffer_b->record_disabled)) - goto out; + return -EAGAIN; /* * We can't do a synchronize_rcu here because this @@ -6306,7 +6747,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, out_dec: atomic_dec(&cpu_buffer_a->record_disabled); atomic_dec(&cpu_buffer_b->record_disabled); -out: return ret; } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); @@ -6334,12 +6774,11 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_read_page *bpage = NULL; unsigned long flags; - struct page *page; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return ERR_PTR(-ENODEV); - bpage = kzalloc(sizeof(*bpage), GFP_KERNEL); + bpage = kzalloc_obj(*bpage); if (!bpage) return ERR_PTR(-ENOMEM); @@ -6356,22 +6795,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); - if (bpage->data) - goto out; - - page = alloc_pages_node(cpu_to_node(cpu), - GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO, - cpu_buffer->buffer->subbuf_order); - if (!page) { - kfree(bpage); - return ERR_PTR(-ENOMEM); + if (bpage->data) { + rb_init_page(bpage->data); + } else { + bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order); + if (!bpage->data) { + kfree(bpage); + return ERR_PTR(-ENOMEM); + } } - bpage->data = page_address(page); - - out: - rb_init_page(bpage->data); - return bpage; } EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); @@ -6465,38 +6898,38 @@ int ring_buffer_read_page(struct trace_buffer *buffer, struct buffer_data_page *bpage; struct buffer_page *reader; unsigned long missed_events; - unsigned long flags; unsigned int commit; unsigned int read; u64 save_timestamp; - int ret = -1; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -1; /* * If len is not big enough to hold the page header, then * we can not copy anything. */ if (len <= BUF_PAGE_HDR_SIZE) - goto out; + return -1; len -= BUF_PAGE_HDR_SIZE; if (!data_page || !data_page->data) - goto out; + return -1; + if (data_page->order != buffer->subbuf_order) - goto out; + return -1; bpage = data_page->data; if (!bpage) - goto out; + return -1; - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); reader = rb_get_reader_page(cpu_buffer); if (!reader) - goto out_unlock; + return -1; event = rb_reader_event(cpu_buffer); @@ -6506,6 +6939,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6515,7 +6950,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -6530,7 +6965,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (full && (!read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page)) - goto out_unlock; + return -1; if (len > (commit - read)) len = (commit - read); @@ -6539,7 +6974,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, size = rb_event_ts_length(event); if (len < size) - goto out_unlock; + return -1; /* save the current timestamp, since the user will need it */ save_timestamp = cpu_buffer->read_stamp; @@ -6597,7 +7032,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (reader->real_end) local_set(&bpage->commit, reader->real_end); } - ret = read; cpu_buffer->lost_events = 0; @@ -6624,11 +7058,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, if (commit < buffer->subbuf_size) memset(&bpage->data[commit], 0, buffer->subbuf_size - commit); - out_unlock: - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - - out: - return ret; + return read; } EXPORT_SYMBOL_GPL(ring_buffer_read_page); @@ -6721,7 +7151,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) old_size = buffer->subbuf_size; /* prevent another thread from changing buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); atomic_inc(&buffer->record_disabled); /* Make sure all commits have finished */ @@ -6826,7 +7256,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order) } atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); return 0; @@ -6835,7 +7264,6 @@ error: buffer->subbuf_size = old_size; atomic_dec(&buffer->record_disabled); - mutex_unlock(&buffer->mutex); for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; @@ -6878,29 +7306,35 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer) } static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer, - unsigned long *subbuf_ids) + struct buffer_page **subbuf_ids) { struct trace_buffer_meta *meta = cpu_buffer->meta_page; unsigned int nr_subbufs = cpu_buffer->nr_pages + 1; struct buffer_page *first_subbuf, *subbuf; + int cnt = 0; int id = 0; - subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page; - cpu_buffer->reader_page->id = id++; + id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id); + subbuf_ids[id++] = cpu_buffer->reader_page; + cnt++; first_subbuf = subbuf = rb_set_head_page(cpu_buffer); do { + id = rb_page_id(cpu_buffer, subbuf, id); + if (WARN_ON(id >= nr_subbufs)) break; - subbuf_ids[id] = (unsigned long)subbuf->page; - subbuf->id = id; + subbuf_ids[id] = subbuf; rb_inc_page(&subbuf); id++; + cnt++; } while (subbuf != first_subbuf); - /* install subbuf ID to kern VA translation */ + WARN_ON(cnt != nr_subbufs); + + /* install subbuf ID to bpage translation */ cpu_buffer->subbuf_ids = subbuf_ids; meta->meta_struct_len = sizeof(*meta); @@ -6991,7 +7425,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, { unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff; unsigned int subbuf_pages, subbuf_order; - struct page **pages; + struct page **pages __free(kfree) = NULL; int p = 0, s = 0; int err; @@ -7028,7 +7462,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, nr_pages = nr_vma_pages; - pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + pages = kzalloc_objs(*pages, nr_pages); if (!pages) return -ENOMEM; @@ -7056,15 +7490,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, } while (p < nr_pages) { + struct buffer_page *subbuf; struct page *page; int off = 0; - if (WARN_ON_ONCE(s >= nr_subbufs)) { - err = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(s >= nr_subbufs)) + return -EINVAL; - page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]); + subbuf = cpu_buffer->subbuf_ids[s]; + page = virt_to_page((void *)subbuf->page); for (; off < (1 << (subbuf_order)); off++, page++) { if (p >= nr_pages) @@ -7077,9 +7511,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -out: - kfree(pages); - return err; } #else @@ -7094,37 +7525,36 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, struct vm_area_struct *vma) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags, *subbuf_ids; - int err = 0; + struct buffer_page **subbuf_ids; + unsigned long flags; + int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (cpu_buffer->user_mapped) { err = __rb_map_vma(cpu_buffer, vma); if (!err) err = __rb_inc_dec_mapped(cpu_buffer, true); - mutex_unlock(&cpu_buffer->mapping_lock); return err; } /* prevent another thread from changing buffer/sub-buffer sizes */ - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); err = rb_alloc_meta_page(cpu_buffer); if (err) - goto unlock; + return err; - /* subbuf_ids include the reader while nr_pages does not */ + /* subbuf_ids includes the reader while nr_pages does not */ subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL); if (!subbuf_ids) { rb_free_meta_page(cpu_buffer); - err = -ENOMEM; - goto unlock; + return -ENOMEM; } atomic_inc(&cpu_buffer->resize_disabled); @@ -7152,35 +7582,50 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, atomic_dec(&cpu_buffer->resize_disabled); } -unlock: - mutex_unlock(&buffer->mutex); - mutex_unlock(&cpu_buffer->mapping_lock); - return err; } +/* + * This is called when a VMA is duplicated (e.g., on fork()) to increment + * the user_mapped counter without remapping pages. + */ +void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask))) + return; + + cpu_buffer = buffer->buffers[cpu]; + + guard(mutex)(&cpu_buffer->mapping_lock); + + if (cpu_buffer->user_mapped) + __rb_inc_dec_mapped(cpu_buffer, true); + else + WARN(1, "Unexpected buffer stat, it should be mapped"); +} + int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; unsigned long flags; - int err = 0; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; - mutex_lock(&cpu_buffer->mapping_lock); + guard(mutex)(&cpu_buffer->mapping_lock); if (!cpu_buffer->user_mapped) { - err = -ENODEV; - goto out; + return -ENODEV; } else if (cpu_buffer->user_mapped > 1) { __rb_inc_dec_mapped(cpu_buffer, false); - goto out; + return 0; } - mutex_lock(&buffer->mutex); + guard(mutex)(&buffer->mutex); raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); /* This is the last user space mapping */ @@ -7195,12 +7640,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu) rb_free_meta_page(cpu_buffer); atomic_dec(&cpu_buffer->resize_disabled); - mutex_unlock(&buffer->mutex); - -out: - mutex_unlock(&cpu_buffer->mapping_lock); - - return err; + return 0; } int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu) @@ -7234,6 +7674,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; @@ -7241,8 +7685,8 @@ consume: /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; - if (cpu_buffer->reader_page != cpu_buffer->commit_page) { - if (missed_events) { + if (missed_events) { + if (cpu_buffer->reader_page != cpu_buffer->commit_page) { struct buffer_data_page *bpage = reader->page; unsigned int commit; /* @@ -7263,13 +7707,23 @@ consume: local_add(RB_MISSED_STORED, &bpage->commit); } local_add(RB_MISSED_EVENTS, &bpage->commit); + } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page, + "Reader on commit with %ld missed events", + missed_events)) { + /* + * There shouldn't be any missed events if the tail_page + * is on the reader page. But if the tail page is not on the + * reader page and the commit_page is, that would mean that + * there's a commit_overrun (an interrupt preempted an + * addition of an event and then filled the buffer + * with new events). In this case it's not an + * error, but it should still be reported. + * + * TODO: Add missed events to the page for user space to know. + */ + pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n", + cpu, missed_events, cpu_buffer->reader_page->page->time_stamp); } - } else { - /* - * There really shouldn't be any missed events if the commit - * is on the reader page. - */ - WARN_ON_ONCE(missed_events); } cpu_buffer->lost_events = 0; @@ -7278,7 +7732,8 @@ consume: out: /* Some archs do not have data cache coherency between kernel and user-space */ - flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page)); + flush_kernel_vmap_range(cpu_buffer->reader_page->page, + buffer->subbuf_size + BUF_PAGE_HDR_SIZE); rb_update_meta_page(cpu_buffer); @@ -7288,6 +7743,12 @@ out: return 0; } +static void rb_cpu_sync(void *data) +{ + /* Not really needed, but documents what is happening */ + smp_rmb(); +} + /* * We only allocate new buffers, never free them if the CPU goes down. * If we were to free the buffer, then the user would lose any trace that was in @@ -7326,7 +7787,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node) cpu); return -ENOMEM; } - smp_wmb(); + + /* + * Ensure trace_buffer readers observe the newly allocated + * ring_buffer_per_cpu before they check the cpumask. Instead of using a + * read barrier for all readers, send an IPI. + */ + if (unlikely(system_state == SYSTEM_RUNNING)) { + on_each_cpu(rb_cpu_sync, NULL, 1); + /* Not really needed, but documents what is happening */ + smp_wmb(); + } + cpumask_set_cpu(cpu, buffer->cpumask); return 0; } @@ -7411,9 +7883,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) /* Ignore dropped events before test starts. */ if (started) { if (nested) - data->bytes_dropped += len; - else data->bytes_dropped_nested += len; + else + data->bytes_dropped += len; } return len; } @@ -7535,7 +8007,7 @@ static __init int test_ringbuffer(void) /* * Show buffer is enabled before setting rb_test_started. * Yes there's a small race window where events could be - * dropped and the thread wont catch it. But when a ring + * dropped and the thread won't catch it. But when a ring * buffer gets enabled, there will always be some kind of * delay before other CPUs see it. Thus, we don't care about * those dropped events. We care about events dropped after @@ -7545,7 +8017,7 @@ static __init int test_ringbuffer(void) rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); - /* Just run for 10 seconds */; + /* Just run for 10 seconds */ schedule_timeout(10 * HZ); kthread_stop(rb_hammer); |
