summaryrefslogtreecommitdiff
path: root/kernel/trace/ring_buffer.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r--kernel/trace/ring_buffer.c1252
1 files changed, 862 insertions, 390 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb6089c2951e..7b07d2004cc6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,7 +4,10 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ring_buffer_types.h>
+#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
+#include <linux/panic_notifier.h>
#include <linux/trace_events.h>
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
@@ -29,8 +32,10 @@
#include <linux/oom.h>
#include <linux/mm.h>
+#include <asm/ring_buffer.h>
#include <asm/local64.h>
#include <asm/local.h>
+#include <asm/setup.h>
#include "trace.h"
@@ -48,9 +53,12 @@ static void update_pages_handler(struct work_struct *work);
struct ring_buffer_meta {
int magic;
- int struct_size;
- unsigned long text_addr;
- unsigned long data_addr;
+ int struct_sizes;
+ unsigned long total_size;
+ unsigned long buffers_offset;
+};
+
+struct ring_buffer_cpu_meta {
unsigned long first_buffer;
unsigned long head_buffer;
unsigned long commit_buffer;
@@ -152,23 +160,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
-#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
-
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -311,10 +302,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
-#define TS_SHIFT 27
-#define TS_MASK ((1ULL << TS_SHIFT) - 1)
-#define TS_DELTA_TEST (~TS_MASK)
-
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -333,12 +320,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
#define RB_MISSED_MASK (3 << 30)
-struct buffer_data_page {
- u64 time_stamp; /* page time stamp */
- local_t commit; /* write committed index */
- unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
-};
-
struct buffer_data_read_page {
unsigned order; /* order of the page */
struct buffer_data_page *data; /* actual data, stored in this page */
@@ -398,11 +379,38 @@ static void free_buffer_page(struct buffer_page *bpage)
}
/*
- * We need to fit the time_stamp delta into 27 bits.
+ * For best performance, allocate cpu buffer data cache line sized
+ * and per CPU.
*/
-static inline bool test_time_stamp(u64 delta)
+#define alloc_cpu_buffer(cpu) (struct ring_buffer_per_cpu *) \
+ kzalloc_node(ALIGN(sizeof(struct ring_buffer_per_cpu), \
+ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu));
+
+#define alloc_cpu_page(cpu) (struct buffer_page *) \
+ kzalloc_node(ALIGN(sizeof(struct buffer_page), \
+ cache_line_size()), GFP_KERNEL, cpu_to_node(cpu));
+
+static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
{
- return !!(delta & TS_DELTA_TEST);
+ struct buffer_data_page *dpage;
+ struct page *page;
+ gfp_t mflags;
+
+ /*
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
+ */
+ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_COMP | __GFP_ZERO;
+
+ page = alloc_pages_node(cpu_to_node(cpu), mflags, order);
+ if (!page)
+ return NULL;
+
+ dpage = page_address(page);
+ rb_init_page(dpage);
+
+ return dpage;
}
struct rb_irq_work {
@@ -515,9 +523,11 @@ struct ring_buffer_per_cpu {
unsigned int mapped;
unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
- unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct buffer_page **subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
- struct ring_buffer_meta *ring_meta;
+ struct ring_buffer_cpu_meta *ring_meta;
+
+ struct ring_buffer_remote *remote;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -541,6 +551,8 @@ struct trace_buffer {
struct ring_buffer_per_cpu **buffers;
+ struct ring_buffer_remote *remote;
+
struct hlist_node node;
u64 (*clock)(void);
@@ -549,9 +561,9 @@ struct trace_buffer {
unsigned long range_addr_start;
unsigned long range_addr_end;
+ struct notifier_block flush_nb;
- long last_text_delta;
- long last_data_delta;
+ struct ring_buffer_meta *meta;
unsigned int subbuf_size;
unsigned int subbuf_order;
@@ -588,16 +600,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
- trace_seq_printf(s, "\tfield: int overwrite;\t"
+ trace_seq_printf(s, "\tfield: char overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
1,
- (unsigned int)is_signed_type(long));
+ (unsigned int)is_signed_type(char));
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)buffer->subbuf_size,
+ (unsigned int)(buffer ? buffer->subbuf_size :
+ PAGE_SIZE - BUF_PAGE_HDR_SIZE),
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -1271,7 +1284,7 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
rb_set_list_to_head(head->list.prev);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->head_buffer = (unsigned long)head->page;
}
}
@@ -1355,6 +1368,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
*bpage = list_entry(p, struct buffer_page, list);
}
+static inline void rb_dec_page(struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.prev);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -1569,7 +1589,7 @@ out_locked:
static unsigned long
rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
{
- addr += sizeof(struct ring_buffer_meta) +
+ addr += sizeof(struct ring_buffer_cpu_meta) +
sizeof(int) * nr_subbufs;
return ALIGN(addr, subbuf_size);
}
@@ -1580,19 +1600,22 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
{
int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
- unsigned long ptr = buffer->range_addr_start;
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
+ struct ring_buffer_meta *bmeta;
+ unsigned long ptr;
int nr_subbufs;
- if (!ptr)
+ bmeta = buffer->meta;
+ if (!bmeta)
return NULL;
+ ptr = (unsigned long)bmeta + bmeta->buffers_offset;
+ meta = (struct ring_buffer_cpu_meta *)ptr;
+
/* When nr_pages passed in is zero, the first meta has already been initialized */
if (!nr_pages) {
- meta = (struct ring_buffer_meta *)ptr;
nr_subbufs = meta->nr_subbufs;
} else {
- meta = NULL;
/* Include the reader page */
nr_subbufs = nr_pages + 1;
}
@@ -1624,7 +1647,7 @@ static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
}
/* Return the start of subbufs given the meta pointer */
-static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+static void *rb_subbufs_from_meta(struct ring_buffer_cpu_meta *meta)
{
int subbuf_size = meta->subbuf_size;
unsigned long ptr;
@@ -1640,7 +1663,7 @@ static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
*/
static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
{
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
unsigned long ptr;
int subbuf_size;
@@ -1666,14 +1689,77 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
}
/*
+ * See if the existing memory contains a valid meta section.
+ * if so, use that, otherwise initialize it.
+ */
+static bool rb_meta_init(struct trace_buffer *buffer, int scratch_size)
+{
+ unsigned long ptr = buffer->range_addr_start;
+ struct ring_buffer_meta *bmeta;
+ unsigned long total_size;
+ int struct_sizes;
+
+ bmeta = (struct ring_buffer_meta *)ptr;
+ buffer->meta = bmeta;
+
+ total_size = buffer->range_addr_end - buffer->range_addr_start;
+
+ struct_sizes = sizeof(struct ring_buffer_cpu_meta);
+ struct_sizes |= sizeof(*bmeta) << 16;
+
+ /* The first buffer will start word size after the meta page */
+ ptr += sizeof(*bmeta);
+ ptr = ALIGN(ptr, sizeof(long));
+ ptr += scratch_size;
+
+ if (bmeta->magic != RING_BUFFER_META_MAGIC) {
+ pr_info("Ring buffer boot meta mismatch of magic\n");
+ goto init;
+ }
+
+ if (bmeta->struct_sizes != struct_sizes) {
+ pr_info("Ring buffer boot meta mismatch of struct size\n");
+ goto init;
+ }
+
+ if (bmeta->total_size != total_size) {
+ pr_info("Ring buffer boot meta mismatch of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset > bmeta->total_size) {
+ pr_info("Ring buffer boot meta mismatch of offset outside of total size\n");
+ goto init;
+ }
+
+ if (bmeta->buffers_offset != (void *)ptr - (void *)bmeta) {
+ pr_info("Ring buffer boot meta mismatch of first buffer offset\n");
+ goto init;
+ }
+
+ return true;
+
+ init:
+ bmeta->magic = RING_BUFFER_META_MAGIC;
+ bmeta->struct_sizes = struct_sizes;
+ bmeta->total_size = total_size;
+ bmeta->buffers_offset = (void *)ptr - (void *)bmeta;
+
+ /* Zero out the scratch pad */
+ memset((void *)bmeta + sizeof(*bmeta), 0, bmeta->buffers_offset - sizeof(*bmeta));
+
+ return false;
+}
+
+/*
* See if the existing memory contains valid ring buffer data.
* As the previous kernel must be the same as this kernel, all
* the calculations (size of buffers and number of buffers)
* must be the same.
*/
-static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
- struct trace_buffer *buffer, int nr_pages,
- unsigned long *subbuf_mask)
+static bool rb_cpu_meta_valid(struct ring_buffer_cpu_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
struct buffer_data_page *subbuf;
@@ -1684,20 +1770,6 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
if (!subbuf_mask)
return false;
- /* Check the meta magic and meta struct size */
- if (meta->magic != RING_BUFFER_META_MAGIC ||
- meta->struct_size != sizeof(*meta)) {
- pr_info("Ring buffer boot meta[%d] mismatch of magic or struct size\n", cpu);
- return false;
- }
-
- /* The subbuffer's size and number of subbuffers must match */
- if (meta->subbuf_size != subbuf_size ||
- meta->nr_subbufs != nr_pages + 1) {
- pr_info("Ring buffer boot meta [%d] mismatch of subbuf_size/nr_pages\n", cpu);
- return false;
- }
-
buffers_start = meta->first_buffer;
buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
@@ -1743,7 +1815,7 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
return true;
}
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf);
static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
unsigned long long *timestamp, u64 *delta_ptr)
@@ -1751,6 +1823,7 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
struct ring_buffer_event *event;
u64 ts, delta;
int events = 0;
+ int len;
int e;
*delta_ptr = 0;
@@ -1758,9 +1831,12 @@ static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu
ts = dpage->time_stamp;
- for (e = 0; e < tail; e += rb_event_length(event)) {
+ for (e = 0; e < tail; e += len) {
event = (struct ring_buffer_event *)(dpage->data + e);
+ len = rb_event_length(event);
+ if (len <= 0 || len > tail - e)
+ return -1;
switch (event->type_len) {
@@ -1810,38 +1886,133 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
/* If the meta data has been validated, now validate the events */
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
- struct buffer_page *head_page;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
+ struct buffer_page *head_page, *orig_head, *orig_reader;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
int ret;
+ u64 ts;
int i;
if (!meta || !meta->head_buffer)
return;
+ orig_head = head_page = cpu_buffer->head_page;
+ orig_reader = cpu_buffer->reader_page;
+
/* Do the reader page first */
- ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+ ret = rb_validate_buffer(orig_reader->page, cpu_buffer->cpu);
if (ret < 0) {
pr_info("Ring buffer reader page is invalid\n");
goto invalid;
}
entries += ret;
- entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
- local_set(&cpu_buffer->reader_page->entries, ret);
+ entry_bytes += local_read(&orig_reader->page->commit);
+ local_set(&orig_reader->entries, ret);
- head_page = cpu_buffer->head_page;
+ ts = head_page->page->time_stamp;
+
+ /*
+ * Try to rewind the head so that we can read the pages which already
+ * read in the previous boot.
+ */
+ if (head_page == cpu_buffer->tail_page)
+ goto skip_rewind;
+
+ rb_dec_page(&head_page);
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
+
+ /* Rewind until tail (writer) page. */
+ if (head_page == cpu_buffer->tail_page)
+ break;
+
+ /* Ensure the page has older data than head. */
+ if (ts < head_page->page->time_stamp)
+ break;
+
+ ts = head_page->page->time_stamp;
+ /* Ensure the page has correct timestamp and some data. */
+ if (!ts || rb_page_commit(head_page) == 0)
+ break;
+
+ /* Stop rewind if the page is invalid. */
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0)
+ break;
+
+ /* Recover the number of entries and update stats. */
+ local_set(&head_page->entries, ret);
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+ entries += ret;
+ entry_bytes += rb_page_commit(head_page);
+ }
+ if (i)
+ pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
+
+ /* The last rewound page must be skipped. */
+ if (head_page != orig_head)
+ rb_inc_page(&head_page);
- /* If both the head and commit are on the reader_page then we are done. */
- if (head_page == cpu_buffer->reader_page &&
- head_page == cpu_buffer->commit_page)
+ /*
+ * If the ring buffer was rewound, then inject the reader page
+ * into the location just before the original head page.
+ */
+ if (head_page != orig_head) {
+ struct buffer_page *bpage = orig_head;
+
+ rb_dec_page(&bpage);
+ /*
+ * Insert the reader_page before the original head page.
+ * Since the list encode RB_PAGE flags, general list
+ * operations should be avoided.
+ */
+ cpu_buffer->reader_page->list.next = &orig_head->list;
+ cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+ orig_head->list.prev = &cpu_buffer->reader_page->list;
+ bpage->list.next = &cpu_buffer->reader_page->list;
+
+ /* Make the head_page the reader page */
+ cpu_buffer->reader_page = head_page;
+ bpage = head_page;
+ rb_inc_page(&head_page);
+ head_page->list.prev = bpage->list.prev;
+ rb_dec_page(&bpage);
+ bpage->list.next = &head_page->list;
+ rb_set_list_to_head(&bpage->list);
+ cpu_buffer->pages = &head_page->list;
+
+ cpu_buffer->head_page = head_page;
+ meta->head_buffer = (unsigned long)head_page->page;
+
+ /* Reset all the indexes */
+ bpage = cpu_buffer->reader_page;
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = 0;
+
+ for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+ i++, rb_inc_page(&bpage)) {
+ meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = i;
+ }
+
+ /* We'll restart verifying from orig_head */
+ head_page = orig_head;
+ }
+
+ skip_rewind:
+ /* If the commit_buffer is the reader page, update the commit page */
+ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
+ cpu_buffer->commit_page = cpu_buffer->reader_page;
+ /* Nothing more to do, the only page is the reader page */
goto done;
+ }
/* Iterate until finding the commit page */
for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
- /* Reader page has already been done */
- if (head_page == cpu_buffer->reader_page)
+ /* The original reader page has already been checked/counted. */
+ if (head_page == orig_reader)
continue;
ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
@@ -1857,7 +2028,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
- local_set(&cpu_buffer->head_page->entries, ret);
+ local_set(&head_page->entries, ret);
if (head_page == cpu_buffer->commit_page)
break;
@@ -1891,24 +2062,13 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
}
}
-/* Used to calculate data delta */
-static char rb_data_ptr[] = "";
-
-#define THIS_TEXT_PTR ((unsigned long)rb_meta_init_text_addr)
-#define THIS_DATA_PTR ((unsigned long)rb_data_ptr)
-
-static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
-{
- meta->text_addr = THIS_TEXT_PTR;
- meta->data_addr = THIS_DATA_PTR;
-}
-
-static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
+static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages, int scratch_size)
{
- struct ring_buffer_meta *meta;
+ struct ring_buffer_cpu_meta *meta;
unsigned long *subbuf_mask;
unsigned long delta;
void *subbuf;
+ bool valid = false;
int cpu;
int i;
@@ -1916,20 +2076,21 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
/* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+ if (rb_meta_init(buffer, scratch_size))
+ valid = true;
+
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *next_meta;
meta = rb_range_meta(buffer, nr_pages, cpu);
- if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
+ if (valid && rb_cpu_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
/* Make the mappings match the current address */
subbuf = rb_subbufs_from_meta(meta);
delta = (unsigned long)subbuf - meta->first_buffer;
meta->first_buffer += delta;
meta->head_buffer += delta;
meta->commit_buffer += delta;
- buffer->last_text_delta = THIS_TEXT_PTR - meta->text_addr;
- buffer->last_data_delta = THIS_DATA_PTR - meta->data_addr;
continue;
}
@@ -1940,16 +2101,12 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
memset(meta, 0, next_meta - (void *)meta);
- meta->magic = RING_BUFFER_META_MAGIC;
- meta->struct_size = sizeof(*meta);
-
meta->nr_subbufs = nr_pages + 1;
meta->subbuf_size = PAGE_SIZE;
subbuf = rb_subbufs_from_meta(meta);
meta->first_buffer = (unsigned long)subbuf;
- rb_meta_init_text_addr(meta);
/*
* The buffers[] array holds the order of the sub-buffers
@@ -1971,7 +2128,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
static void *rbm_start(struct seq_file *m, loff_t *pos)
{
struct ring_buffer_per_cpu *cpu_buffer = m->private;
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val;
if (!meta)
@@ -1996,7 +2153,7 @@ static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
static int rbm_show(struct seq_file *m, void *v)
{
struct ring_buffer_per_cpu *cpu_buffer = m->private;
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long val = (unsigned long)v;
if (val == 1) {
@@ -2045,7 +2202,7 @@ int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, in
static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
if (meta->head_buffer == (unsigned long)bpage->page)
cpu_buffer->head_page = bpage;
@@ -2056,14 +2213,48 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
+static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu)
+{
+ struct ring_buffer_desc *desc, *end;
+ size_t len;
+ int i;
+
+ if (!trace_desc)
+ return NULL;
+
+ if (cpu >= trace_desc->nr_cpus)
+ return NULL;
+
+ end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len);
+ desc = __first_ring_buffer_desc(trace_desc);
+ len = struct_size(desc, page_va, desc->nr_page_va);
+ desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu));
+
+ if (desc < end && desc->cpu == cpu)
+ return desc;
+
+ /* Missing CPUs, need to linear search */
+ for_each_ring_buffer_desc(desc, i, trace_desc) {
+ if (desc->cpu == cpu)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id)
+{
+ return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id];
+}
+
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
struct trace_buffer *buffer = cpu_buffer->buffer;
- struct ring_buffer_meta *meta = NULL;
+ struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
- gfp_t mflags;
+ struct ring_buffer_desc *desc = NULL;
long i;
/*
@@ -2078,13 +2269,6 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
return -ENOMEM;
/*
- * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is not
- * destabilized.
- */
- mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
-
- /*
* If a user thread allocates too much, and si_mem_available()
* reports there's enough memory, even though there is not.
* Make sure the OOM killer kills this thread. This can happen
@@ -2099,11 +2283,15 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
if (buffer->range_addr_start)
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+ if (buffer->remote) {
+ desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu);
+ if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1)))
+ return -EINVAL;
+ }
+
for (i = 0; i < nr_pages; i++) {
- struct page *page;
- bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- mflags, cpu_to_node(cpu_buffer->cpu));
+ bpage = alloc_cpu_page(cpu_buffer->cpu);
if (!bpage)
goto free_pages;
@@ -2125,14 +2313,21 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
bpage->id = i + 1;
+ } else if (desc) {
+ void *p = ring_buffer_desc_page(desc, i + 1);
+
+ if (WARN_ON(!p))
+ goto free_pages;
+
+ bpage->page = p;
+ bpage->range = 1; /* bpage->page can't be freed */
+ bpage->id = i + 1;
+ cpu_buffer->subbuf_ids[i + 1] = bpage;
} else {
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
- mflags | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
+ int order = cpu_buffer->buffer->subbuf_order;
+ bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
+ if (!bpage->page)
goto free_pages;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
}
bpage->order = cpu_buffer->buffer->subbuf_order;
@@ -2183,14 +2378,12 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_meta *meta;
+ struct ring_buffer_per_cpu *cpu_buffer __free(kfree) =
+ alloc_cpu_buffer(cpu);
+ struct ring_buffer_cpu_meta *meta;
struct buffer_page *bpage;
- struct page *page;
int ret;
- cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
if (!cpu_buffer)
return NULL;
@@ -2206,10 +2399,9 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
mutex_init(&cpu_buffer->mapping_lock);
- bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
+ bpage = alloc_cpu_page(cpu);
if (!bpage)
- goto fail_free_buffer;
+ return NULL;
rb_check_bpage(cpu_buffer, bpage);
@@ -2228,14 +2420,35 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (cpu_buffer->ring_meta->head_buffer)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
+ } else if (buffer->remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_reader;
+
+ cpu_buffer->remote = buffer->remote;
+ cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va;
+ cpu_buffer->nr_pages = nr_pages;
+ cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1,
+ sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL);
+ if (!cpu_buffer->subbuf_ids)
+ goto fail_free_reader;
+
+ /* Remote buffers are read-only and immutable */
+ atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id);
+ if (!bpage->page)
+ goto fail_free_reader;
+
+ bpage->range = 1;
+ cpu_buffer->subbuf_ids[0] = bpage;
} else {
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page)
+ int order = cpu_buffer->buffer->subbuf_order;
+ bpage->page = alloc_cpu_data(cpu, order);
+ if (!bpage->page)
goto fail_free_reader;
- bpage->page = page_address(page);
- rb_init_page(bpage->page);
}
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -2275,13 +2488,11 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_head_page_activate(cpu_buffer);
}
- return cpu_buffer;
+ return_ptr(cpu_buffer);
fail_free_reader:
free_buffer_page(cpu_buffer->reader_page);
- fail_free_buffer:
- kfree(cpu_buffer);
return NULL;
}
@@ -2292,6 +2503,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
irq_work_sync(&cpu_buffer->irq_work.work);
+ if (cpu_buffer->remote)
+ kfree(cpu_buffer->subbuf_ids);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -2310,12 +2524,24 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
kfree(cpu_buffer);
}
+/* Stop recording on a persistent buffer and flush cache if needed. */
+static int rb_flush_buffer_cb(struct notifier_block *nb, unsigned long event, void *data)
+{
+ struct trace_buffer *buffer = container_of(nb, struct trace_buffer, flush_nb);
+
+ ring_buffer_record_off(buffer);
+ arch_ring_buffer_flush_range(buffer->range_addr_start, buffer->range_addr_end);
+ return NOTIFY_DONE;
+}
+
static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
- struct lock_class_key *key)
+ unsigned long scratch_size,
+ struct lock_class_key *key,
+ struct ring_buffer_remote *remote)
{
- struct trace_buffer *buffer;
+ struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
int subbuf_size;
int bsize;
@@ -2329,7 +2555,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
return NULL;
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
- goto fail_free_buffer;
+ return NULL;
buffer->subbuf_order = order;
subbuf_size = (PAGE_SIZE << order);
@@ -2353,12 +2579,27 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ cpu = raw_smp_processor_id();
+
/* If start/end are specified, then that overrides size */
if (start && end) {
+ unsigned long buffers_start;
unsigned long ptr;
int n;
- size = end - start;
+ /* Make sure that start is word aligned */
+ start = ALIGN(start, sizeof(long));
+
+ /* scratch_size needs to be aligned too */
+ scratch_size = ALIGN(scratch_size, sizeof(long));
+
+ /* Subtract the buffer meta data and word aligned */
+ buffers_start = start + sizeof(struct ring_buffer_cpu_meta);
+ buffers_start = ALIGN(buffers_start, sizeof(long));
+ buffers_start += scratch_size;
+
+ /* Calculate the size for the per CPU data */
+ size = end - buffers_start;
size = size / nr_cpu_ids;
/*
@@ -2368,7 +2609,7 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
* needed, plus account for the integer array index that
* will be appended to the meta data.
*/
- nr_pages = (size - sizeof(struct ring_buffer_meta)) /
+ nr_pages = (size - sizeof(struct ring_buffer_cpu_meta)) /
(subbuf_size + sizeof(int));
/* Need at least two pages plus the reader page */
if (nr_pages < 3)
@@ -2376,8 +2617,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
again:
/* Make sure that the size fits aligned */
- for (n = 0, ptr = start; n < nr_cpu_ids; n++) {
- ptr += sizeof(struct ring_buffer_meta) +
+ for (n = 0, ptr = buffers_start; n < nr_cpu_ids; n++) {
+ ptr += sizeof(struct ring_buffer_cpu_meta) +
sizeof(int) * nr_pages;
ptr = ALIGN(ptr, subbuf_size);
ptr += subbuf_size * nr_pages;
@@ -2394,7 +2635,16 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_start = start;
buffer->range_addr_end = end;
- rb_range_meta_init(buffer, nr_pages);
+ rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else if (remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu);
+
+ buffer->remote = remote;
+ /* The writer is remote. This ring-buffer is read-only */
+ atomic_inc(&buffer->record_disabled);
+ nr_pages = desc->nr_page_va - 1;
+ if (nr_pages < 2)
+ goto fail_free_buffers;
} else {
/* need at least two pages */
@@ -2403,7 +2653,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
nr_pages = 2;
}
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
@@ -2415,7 +2664,13 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
mutex_init(&buffer->mutex);
- return buffer;
+ /* Persistent ring buffer needs to flush cache before reboot. */
+ if (start && end) {
+ buffer->flush_nb.notifier_call = rb_flush_buffer_cb;
+ atomic_notifier_chain_register(&panic_notifier_list, &buffer->flush_nb);
+ }
+
+ return_ptr(buffer);
fail_free_buffers:
for_each_buffer_cpu(buffer, cpu) {
@@ -2427,8 +2682,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
fail_free_cpumask:
free_cpumask_var(buffer->cpumask);
- fail_free_buffer:
- kfree(buffer);
return NULL;
}
@@ -2447,7 +2700,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0,key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2459,6 +2712,7 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
* @order: sub-buffer order
* @start: start of allocated range
* @range_size: size of allocated range
+ * @scratch_size: size of scratch area (for preallocated memory buffers)
* @key: ring buffer reader_lock_key.
*
* Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -2469,32 +2723,40 @@ EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long range_size,
+ unsigned long scratch_size,
struct lock_class_key *key)
{
- return alloc_buffer(size, flags, order, start, start + range_size, key);
+ return alloc_buffer(size, flags, order, start, start + range_size,
+ scratch_size, key, NULL);
}
/**
- * ring_buffer_last_boot_delta - return the delta offset from last boot
- * @buffer: The buffer to return the delta from
- * @text: Return text delta
- * @data: Return data delta
- *
- * Returns: The true if the delta is non zero
+ * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote
+ * @remote: Contains a description of the ring-buffer pages and remote callbacks.
+ * @key: ring buffer reader_lock_key.
*/
-bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
- long *data)
+struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key)
{
- if (!buffer)
- return false;
+ return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote);
+}
- if (!buffer->last_text_delta)
- return false;
+void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
+{
+ struct ring_buffer_meta *meta;
+ void *ptr;
- *text = buffer->last_text_delta;
- *data = buffer->last_data_delta;
+ if (!buffer || !buffer->meta)
+ return NULL;
- return true;
+ meta = buffer->meta;
+
+ ptr = (void *)ALIGN((unsigned long)meta + sizeof(*meta), sizeof(long));
+
+ if (size)
+ *size = (void *)meta + meta->buffers_offset - ptr;
+
+ return ptr;
}
/**
@@ -2506,6 +2768,9 @@ ring_buffer_free(struct trace_buffer *buffer)
{
int cpu;
+ if (buffer->range_addr_start && buffer->range_addr_end)
+ atomic_notifier_chain_unregister(&panic_notifier_list, &buffer->flush_nb);
+
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
irq_work_sync(&buffer->irq_work.work);
@@ -2794,6 +3059,12 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
if (nr_pages < 2)
nr_pages = 2;
+ /*
+ * Keep CPUs from coming online while resizing to synchronize
+ * with new per CPU buffers being created.
+ */
+ guard(cpus_read_lock)();
+
/* prevent another thread from changing buffer sizes */
mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resizing);
@@ -2838,7 +3109,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cond_resched();
}
- cpus_read_lock();
/*
* Fire off all the required work handlers
* We can't schedule on offline CPUs, but it's not necessary
@@ -2878,7 +3148,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
cpu_buffer->nr_pages_to_update = 0;
}
- cpus_read_unlock();
} else {
cpu_buffer = buffer->buffers[cpu_id];
@@ -2906,8 +3175,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
goto out_err;
}
- cpus_read_lock();
-
/* Can't run something on an offline CPU. */
if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
@@ -2926,7 +3193,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
}
cpu_buffer->nr_pages_to_update = 0;
- cpus_read_unlock();
}
out:
@@ -2971,6 +3237,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
list) {
list_del_init(&bpage->list);
free_buffer_page(bpage);
+
+ cond_resched();
}
}
out_err_unlock:
@@ -3105,7 +3373,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
}
/* Return the index into the sub-buffers for a given sub-buffer */
-static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
+static int rb_meta_subbuf_idx(struct ring_buffer_cpu_meta *meta, void *subbuf)
{
void *subbuf_array;
@@ -3117,7 +3385,7 @@ static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf)
static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *next_page)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
unsigned long old_head = (unsigned long)next_page->page;
unsigned long new_head;
@@ -3134,7 +3402,7 @@ static void rb_update_meta_head(struct ring_buffer_per_cpu *cpu_buffer,
static void rb_update_meta_reader(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *reader)
{
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
void *old_reader = cpu_buffer->reader_page->page;
void *new_reader = reader->page;
int id;
@@ -3763,7 +4031,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->commit_buffer = (unsigned long)cpu_buffer->commit_page->page;
}
/* add barrier to keep gcc from optimizing too much */
@@ -3845,19 +4113,36 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
rb_end_commit(cpu_buffer);
}
+static bool
+rb_irq_work_queue(struct rb_irq_work *irq_work)
+{
+ int cpu;
+
+ /* irq_work_queue_on() is not NMI-safe */
+ if (unlikely(in_nmi()))
+ return irq_work_queue(&irq_work->work);
+
+ /*
+ * If CPU isolation is not active, cpu is always the current
+ * CPU, and the following is equivallent to irq_work_queue().
+ */
+ cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
+ return irq_work_queue_on(&irq_work->work, cpu);
+}
+
static __always_inline void
rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
{
if (buffer->irq_work.waiters_pending) {
buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
+ rb_irq_work_queue(&buffer->irq_work);
}
if (cpu_buffer->irq_work.waiters_pending) {
cpu_buffer->irq_work.waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work);
}
if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
@@ -3877,7 +4162,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->irq_work.wakeup_full = true;
cpu_buffer->irq_work.full_waiters_pending = false;
/* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
+ rb_irq_work_queue(&cpu_buffer->irq_work);
}
#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION
@@ -4066,7 +4351,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static const char *show_irq_str(int bits)
{
- const char *type[] = {
+ static const char * type[] = {
".", // 0
"s", // 1
"h", // 2
@@ -4244,18 +4529,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
if (ret < 0) {
if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
+ cpu_buffer->cpu, ts, delta,
+ cpu_buffer->buffer->clock);
goto out;
}
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS",
cpu_buffer->cpu,
ts + info->delta, info->ts, info->delta,
info->before, info->after,
- full ? " (full)" : "", show_interrupt_level());
+ full ? " (full)" : "", show_interrupt_level(),
+ cpu_buffer->buffer->clock);
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -4629,10 +4916,7 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
rb_decrement_entry(cpu_buffer, event);
- if (rb_try_to_discard(cpu_buffer, event))
- goto out;
-
- out:
+ rb_try_to_discard(cpu_buffer, event);
rb_end_commit(cpu_buffer);
trace_recursive_unlock(cpu_buffer);
@@ -4665,26 +4949,26 @@ int ring_buffer_write(struct trace_buffer *buffer,
int ret = -EBUSY;
int cpu;
- preempt_disable_notrace();
+ guard(preempt_notrace)();
if (atomic_read(&buffer->record_disabled))
- goto out;
+ return -EBUSY;
cpu = raw_smp_processor_id();
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -EBUSY;
cpu_buffer = buffer->buffers[cpu];
if (atomic_read(&cpu_buffer->record_disabled))
- goto out;
+ return -EBUSY;
if (length > buffer->max_data_size)
- goto out;
+ return -EBUSY;
if (unlikely(trace_recursive_lock(cpu_buffer)))
- goto out;
+ return -EBUSY;
event = rb_reserve_next_event(buffer, cpu_buffer, length);
if (!event)
@@ -4702,10 +4986,6 @@ int ring_buffer_write(struct trace_buffer *buffer,
out_unlock:
trace_recursive_unlock(cpu_buffer);
-
- out:
- preempt_enable_notrace();
-
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
@@ -4830,6 +5110,24 @@ bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
}
/**
+ * ring_buffer_record_is_on_cpu - return true if the ring buffer can write
+ * @buffer: The ring buffer to see if write is enabled
+ * @cpu: The CPU to test if the ring buffer can write too
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes
+ * for a particular CPU.
+ */
+bool ring_buffer_record_is_on_cpu(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ return ring_buffer_record_is_set_on(buffer) &&
+ !atomic_read(&cpu_buffer->record_disabled);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
@@ -5072,14 +5370,66 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries));
+ local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun));
+ local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched));
+ local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost));
+
+ return rb_num_of_entries(cpu_buffer);
+}
+
+static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *next, *orig;
+ int retry = 3;
+
+ orig = next = cpu_buffer->head_page;
+ rb_inc_page(&next);
+
+ /* Run after the writer */
+ while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) {
+ rb_inc_page(&next);
+
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+ rb_inc_page(&cpu_buffer->head_page);
+ rb_set_list_to_head(cpu_buffer->head_page->list.prev);
+
+ if (cpu_buffer->head_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+
+ orig = cpu_buffer->commit_page = cpu_buffer->head_page;
+ retry = 3;
+
+ while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) {
+ rb_inc_page(&next);
+ rb_inc_page(&cpu_buffer->commit_page);
+
+ if (cpu_buffer->commit_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+}
+
static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ if (cpu_buffer->remote) {
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_update_remote_head(cpu_buffer);
+ }
+
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
iter->next_event = iter->head;
+ iter->missed_events = 0;
iter->cache_reader_page = iter->head_page;
iter->cache_read = cpu_buffer->read;
@@ -5226,7 +5576,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
}
static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last;
+
+ if (!rb_read_remote_meta_page(cpu_buffer))
+ return NULL;
+
+ /* More to read on the reader page */
+ if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) {
+ if (!cpu_buffer->reader_page->read)
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ return cpu_buffer->reader_page;
+ }
+
+ prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu,
+ cpu_buffer->remote->priv));
+ /* nr_pages doesn't include the reader page */
+ if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages))
+ return NULL;
+
+ new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(prev_reader == new_reader);
+
+ prev_head = new_reader; /* New reader was also the previous head */
+ new_head = prev_head;
+ rb_inc_page(&new_head);
+ last = prev_head;
+ rb_dec_page(&last);
+
+ /* Clear the old HEAD flag */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+
+ prev_reader->list.next = prev_head->list.next;
+ prev_reader->list.prev = prev_head->list.prev;
+
+ /* Swap prev_reader with new_reader */
+ last->list.next = &prev_reader->list;
+ new_head->list.prev = &prev_reader->list;
+
+ new_reader->list.prev = &new_reader->list;
+ new_reader->list.next = &new_head->list;
+
+ /* Reactivate the HEAD flag */
+ rb_set_list_to_head(&last->list);
+
+ cpu_buffer->head_page = new_head;
+ cpu_buffer->reader_page = new_reader;
+ cpu_buffer->pages = &new_head->list;
+ cpu_buffer->read_stamp = new_reader->page->time_stamp;
+ cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+ return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5275,7 +5683,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->real_end = 0;
spin:
@@ -5318,7 +5725,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* moving it. The page before the header page has the
* flag bit '1' set if it is pointing to the page we want.
* but if the writer is in the process of moving it
- * than it will be '2' or already moved '0'.
+ * then it will be '2' or already moved '0'.
*/
ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
@@ -5397,6 +5804,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
return reader;
}
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) :
+ __rb_get_reader_page(cpu_buffer);
+}
+
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
@@ -5695,10 +6109,7 @@ ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
*/
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
{
- bool ret = iter->missed_events != 0;
-
- iter->missed_events = 0;
- return ret;
+ return iter->missed_events != 0;
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
@@ -5779,24 +6190,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
* @flags: gfp flags to use for memory allocation
*
- * This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
- *
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
*
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -5804,7 +6211,7 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- iter = kzalloc(sizeof(*iter), flags);
+ iter = kzalloc_obj(*iter, flags);
if (!iter)
return NULL;
@@ -5822,51 +6229,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
atomic_inc(&cpu_buffer->resize_disabled);
- return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized. Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags;
-
- if (!iter)
- return;
-
- cpu_buffer = iter->cpu_buffer;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -5903,7 +6271,7 @@ void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
unsigned long flags;
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-
+ iter->missed_events = 0;
rb_advance_iter(iter);
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5947,6 +6315,39 @@ static void rb_clear_buffer_page(struct buffer_page *page)
page->read = 0;
}
+/*
+ * When the buffer is memory mapped to user space, each sub buffer
+ * has a unique id that is used by the meta data to tell the user
+ * where the current reader page is.
+ *
+ * For a normal allocated ring buffer, the id is saved in the buffer page
+ * id field, and updated via this function.
+ *
+ * But for a fixed memory mapped buffer, the id is already assigned for
+ * fixed memory ordering in the memory layout and can not be used. Instead
+ * the index of where the page lies in the memory layout is used.
+ *
+ * For the normal pages, set the buffer page id with the passed in @id
+ * value and return that.
+ *
+ * For fixed memory mapped pages, get the page index in the memory layout
+ * and return that as the id.
+ */
+static int rb_page_id(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page *bpage, int id)
+{
+ /*
+ * For boot buffers, the id is the index,
+ * otherwise, set the buffer page with this id
+ */
+ if (cpu_buffer->ring_meta)
+ id = rb_meta_subbuf_idx(cpu_buffer->ring_meta, bpage->page);
+ else
+ bpage->id = id;
+
+ return id;
+}
+
static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
@@ -5955,15 +6356,19 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
return;
meta->reader.read = cpu_buffer->reader_page->read;
- meta->reader.id = cpu_buffer->reader_page->id;
+ meta->reader.id = rb_page_id(cpu_buffer, cpu_buffer->reader_page,
+ cpu_buffer->reader_page->id);
+
meta->reader.lost_events = cpu_buffer->lost_events;
meta->entries = local_read(&cpu_buffer->entries);
meta->overrun = local_read(&cpu_buffer->overrun);
meta->read = cpu_buffer->read;
+ meta->pages_lost = local_read(&cpu_buffer->pages_lost);
+ meta->pages_touched = local_read(&cpu_buffer->pages_touched);
/* Some archs do not have data cache coherency between kernel and user-space */
- flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+ flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
}
static void
@@ -5971,6 +6376,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *page;
+ if (cpu_buffer->remote) {
+ if (!cpu_buffer->remote->reset)
+ return;
+
+ cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv);
+ rb_read_remote_meta_page(cpu_buffer);
+
+ /* Read related values, not covered by the meta-page */
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->reader_page->read = 0;
+
+ return;
+ }
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
@@ -6016,7 +6438,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
if (cpu_buffer->mapped) {
rb_update_meta_page(cpu_buffer);
if (cpu_buffer->ring_meta) {
- struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+ struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
meta->commit_buffer = meta->head_buffer;
}
}
@@ -6025,21 +6447,16 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
/* Must have disabled the cpu buffer then done a synchronize_rcu */
static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
- goto out;
+ return;
arch_spin_lock(&cpu_buffer->lock);
rb_reset_cpu(cpu_buffer);
arch_spin_unlock(&cpu_buffer->lock);
-
- out:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
/**
@@ -6050,7 +6467,6 @@ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
- struct ring_buffer_meta *meta;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
@@ -6069,11 +6485,6 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
- /* Make sure persistent meta now uses this buffer's addresses */
- meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
- if (meta)
- rb_meta_init_text_addr(meta);
-
mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
@@ -6088,7 +6499,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct ring_buffer_meta *meta;
int cpu;
/* prevent another thread from changing buffer sizes */
@@ -6116,11 +6526,6 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
reset_disabled_cpu_buffer(cpu_buffer);
- /* Make sure persistent meta now uses this buffer's addresses */
- meta = rb_range_meta(buffer, 0, cpu_buffer->cpu);
- if (meta)
- rb_meta_init_text_addr(meta);
-
atomic_dec(&cpu_buffer->record_disabled);
atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
@@ -6218,6 +6623,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ if (rb_read_remote_meta_page(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ /*
+ * Make sure all the ring buffers are up to date before we start reading
+ * them.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (rb_num_of_entries(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6239,37 +6684,33 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
!cpumask_test_cpu(cpu, buffer_b->cpumask))
- goto out;
+ return -EINVAL;
cpu_buffer_a = buffer_a->buffers[cpu];
cpu_buffer_b = buffer_b->buffers[cpu];
/* It's up to the callers to not try to swap mapped buffers */
- if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
- ret = -EBUSY;
- goto out;
- }
+ if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped))
+ return -EBUSY;
/* At least make sure the two buffers are somewhat the same */
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
- goto out;
+ return -EINVAL;
if (buffer_a->subbuf_order != buffer_b->subbuf_order)
- goto out;
-
- ret = -EAGAIN;
+ return -EINVAL;
if (atomic_read(&buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_a->record_disabled))
- goto out;
+ return -EAGAIN;
if (atomic_read(&cpu_buffer_b->record_disabled))
- goto out;
+ return -EAGAIN;
/*
* We can't do a synchronize_rcu here because this
@@ -6306,7 +6747,6 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
out_dec:
atomic_dec(&cpu_buffer_a->record_disabled);
atomic_dec(&cpu_buffer_b->record_disabled);
-out:
return ret;
}
EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -6334,12 +6774,11 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
- struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
- bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ bpage = kzalloc_obj(*bpage);
if (!bpage)
return ERR_PTR(-ENOMEM);
@@ -6356,22 +6795,16 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage->data)
- goto out;
-
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY | __GFP_COMP | __GFP_ZERO,
- cpu_buffer->buffer->subbuf_order);
- if (!page) {
- kfree(bpage);
- return ERR_PTR(-ENOMEM);
+ if (bpage->data) {
+ rb_init_page(bpage->data);
+ } else {
+ bpage->data = alloc_cpu_data(cpu, cpu_buffer->buffer->subbuf_order);
+ if (!bpage->data) {
+ kfree(bpage);
+ return ERR_PTR(-ENOMEM);
+ }
}
- bpage->data = page_address(page);
-
- out:
- rb_init_page(bpage->data);
-
return bpage;
}
EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
@@ -6465,38 +6898,38 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
struct buffer_data_page *bpage;
struct buffer_page *reader;
unsigned long missed_events;
- unsigned long flags;
unsigned int commit;
unsigned int read;
u64 save_timestamp;
- int ret = -1;
+ bool force_memcpy;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- goto out;
+ return -1;
/*
* If len is not big enough to hold the page header, then
* we can not copy anything.
*/
if (len <= BUF_PAGE_HDR_SIZE)
- goto out;
+ return -1;
len -= BUF_PAGE_HDR_SIZE;
if (!data_page || !data_page->data)
- goto out;
+ return -1;
+
if (data_page->order != buffer->subbuf_order)
- goto out;
+ return -1;
bpage = data_page->data;
if (!bpage)
- goto out;
+ return -1;
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
- goto out_unlock;
+ return -1;
event = rb_reader_event(cpu_buffer);
@@ -6506,6 +6939,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
+ force_memcpy = cpu_buffer->mapped || cpu_buffer->remote;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -6515,7 +6950,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
- cpu_buffer->mapped) {
+ force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -6530,7 +6965,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (full &&
(!read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page))
- goto out_unlock;
+ return -1;
if (len > (commit - read))
len = (commit - read);
@@ -6539,7 +6974,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
size = rb_event_ts_length(event);
if (len < size)
- goto out_unlock;
+ return -1;
/* save the current timestamp, since the user will need it */
save_timestamp = cpu_buffer->read_stamp;
@@ -6597,7 +7032,6 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (reader->real_end)
local_set(&bpage->commit, reader->real_end);
}
- ret = read;
cpu_buffer->lost_events = 0;
@@ -6624,11 +7058,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
if (commit < buffer->subbuf_size)
memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
- out_unlock:
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
- out:
- return ret;
+ return read;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
@@ -6721,7 +7151,7 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
old_size = buffer->subbuf_size;
/* prevent another thread from changing buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
atomic_inc(&buffer->record_disabled);
/* Make sure all commits have finished */
@@ -6826,7 +7256,6 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
}
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
return 0;
@@ -6835,7 +7264,6 @@ error:
buffer->subbuf_size = old_size;
atomic_dec(&buffer->record_disabled);
- mutex_unlock(&buffer->mutex);
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -6878,29 +7306,35 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
}
static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long *subbuf_ids)
+ struct buffer_page **subbuf_ids)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
struct buffer_page *first_subbuf, *subbuf;
+ int cnt = 0;
int id = 0;
- subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
- cpu_buffer->reader_page->id = id++;
+ id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
+ subbuf_ids[id++] = cpu_buffer->reader_page;
+ cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
do {
+ id = rb_page_id(cpu_buffer, subbuf, id);
+
if (WARN_ON(id >= nr_subbufs))
break;
- subbuf_ids[id] = (unsigned long)subbuf->page;
- subbuf->id = id;
+ subbuf_ids[id] = subbuf;
rb_inc_page(&subbuf);
id++;
+ cnt++;
} while (subbuf != first_subbuf);
- /* install subbuf ID to kern VA translation */
+ WARN_ON(cnt != nr_subbufs);
+
+ /* install subbuf ID to bpage translation */
cpu_buffer->subbuf_ids = subbuf_ids;
meta->meta_struct_len = sizeof(*meta);
@@ -6991,7 +7425,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
{
unsigned long nr_subbufs, nr_pages, nr_vma_pages, pgoff = vma->vm_pgoff;
unsigned int subbuf_pages, subbuf_order;
- struct page **pages;
+ struct page **pages __free(kfree) = NULL;
int p = 0, s = 0;
int err;
@@ -7028,7 +7462,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
nr_pages = nr_vma_pages;
- pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+ pages = kzalloc_objs(*pages, nr_pages);
if (!pages)
return -ENOMEM;
@@ -7056,15 +7490,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
+ struct buffer_page *subbuf;
struct page *page;
int off = 0;
- if (WARN_ON_ONCE(s >= nr_subbufs)) {
- err = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(s >= nr_subbufs))
+ return -EINVAL;
- page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ subbuf = cpu_buffer->subbuf_ids[s];
+ page = virt_to_page((void *)subbuf->page);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
@@ -7077,9 +7511,6 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
-out:
- kfree(pages);
-
return err;
}
#else
@@ -7094,37 +7525,36 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags, *subbuf_ids;
- int err = 0;
+ struct buffer_page **subbuf_ids;
+ unsigned long flags;
+ int err;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote)
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
- mutex_unlock(&cpu_buffer->mapping_lock);
return err;
}
/* prevent another thread from changing buffer/sub-buffer sizes */
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
err = rb_alloc_meta_page(cpu_buffer);
if (err)
- goto unlock;
+ return err;
- /* subbuf_ids include the reader while nr_pages does not */
+ /* subbuf_ids includes the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
- err = -ENOMEM;
- goto unlock;
+ return -ENOMEM;
}
atomic_inc(&cpu_buffer->resize_disabled);
@@ -7152,35 +7582,50 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
atomic_dec(&cpu_buffer->resize_disabled);
}
-unlock:
- mutex_unlock(&buffer->mutex);
- mutex_unlock(&cpu_buffer->mapping_lock);
-
return err;
}
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask)))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(mutex)(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->user_mapped)
+ __rb_inc_dec_mapped(cpu_buffer, true);
+ else
+ WARN(1, "Unexpected buffer stat, it should be mapped");
+}
+
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
- int err = 0;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
- mutex_lock(&cpu_buffer->mapping_lock);
+ guard(mutex)(&cpu_buffer->mapping_lock);
if (!cpu_buffer->user_mapped) {
- err = -ENODEV;
- goto out;
+ return -ENODEV;
} else if (cpu_buffer->user_mapped > 1) {
__rb_inc_dec_mapped(cpu_buffer, false);
- goto out;
+ return 0;
}
- mutex_lock(&buffer->mutex);
+ guard(mutex)(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
/* This is the last user space mapping */
@@ -7195,12 +7640,7 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
rb_free_meta_page(cpu_buffer);
atomic_dec(&cpu_buffer->resize_disabled);
- mutex_unlock(&buffer->mutex);
-
-out:
- mutex_unlock(&cpu_buffer->mapping_lock);
-
- return err;
+ return 0;
}
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
@@ -7234,6 +7674,10 @@ consume:
goto out;
}
+ /* Did the reader catch up with the writer? */
+ if (cpu_buffer->reader_page == cpu_buffer->commit_page)
+ goto out;
+
reader = rb_get_reader_page(cpu_buffer);
if (WARN_ON(!reader))
goto out;
@@ -7241,8 +7685,8 @@ consume:
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
- if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
- if (missed_events) {
+ if (missed_events) {
+ if (cpu_buffer->reader_page != cpu_buffer->commit_page) {
struct buffer_data_page *bpage = reader->page;
unsigned int commit;
/*
@@ -7263,13 +7707,23 @@ consume:
local_add(RB_MISSED_STORED, &bpage->commit);
}
local_add(RB_MISSED_EVENTS, &bpage->commit);
+ } else if (!WARN_ONCE(cpu_buffer->reader_page == cpu_buffer->tail_page,
+ "Reader on commit with %ld missed events",
+ missed_events)) {
+ /*
+ * There shouldn't be any missed events if the tail_page
+ * is on the reader page. But if the tail page is not on the
+ * reader page and the commit_page is, that would mean that
+ * there's a commit_overrun (an interrupt preempted an
+ * addition of an event and then filled the buffer
+ * with new events). In this case it's not an
+ * error, but it should still be reported.
+ *
+ * TODO: Add missed events to the page for user space to know.
+ */
+ pr_info("Ring buffer [%d] commit overrun lost %ld events at timestamp:%lld\n",
+ cpu, missed_events, cpu_buffer->reader_page->page->time_stamp);
}
- } else {
- /*
- * There really shouldn't be any missed events if the commit
- * is on the reader page.
- */
- WARN_ON_ONCE(missed_events);
}
cpu_buffer->lost_events = 0;
@@ -7278,7 +7732,8 @@ consume:
out:
/* Some archs do not have data cache coherency between kernel and user-space */
- flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+ flush_kernel_vmap_range(cpu_buffer->reader_page->page,
+ buffer->subbuf_size + BUF_PAGE_HDR_SIZE);
rb_update_meta_page(cpu_buffer);
@@ -7288,6 +7743,12 @@ out:
return 0;
}
+static void rb_cpu_sync(void *data)
+{
+ /* Not really needed, but documents what is happening */
+ smp_rmb();
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -7326,7 +7787,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
cpu);
return -ENOMEM;
}
- smp_wmb();
+
+ /*
+ * Ensure trace_buffer readers observe the newly allocated
+ * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+ * read barrier for all readers, send an IPI.
+ */
+ if (unlikely(system_state == SYSTEM_RUNNING)) {
+ on_each_cpu(rb_cpu_sync, NULL, 1);
+ /* Not really needed, but documents what is happening */
+ smp_wmb();
+ }
+
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
@@ -7411,9 +7883,9 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested)
/* Ignore dropped events before test starts. */
if (started) {
if (nested)
- data->bytes_dropped += len;
- else
data->bytes_dropped_nested += len;
+ else
+ data->bytes_dropped += len;
}
return len;
}
@@ -7535,7 +8007,7 @@ static __init int test_ringbuffer(void)
/*
* Show buffer is enabled before setting rb_test_started.
* Yes there's a small race window where events could be
- * dropped and the thread wont catch it. But when a ring
+ * dropped and the thread won't catch it. But when a ring
* buffer gets enabled, there will always be some kind of
* delay before other CPUs see it. Thus, we don't care about
* those dropped events. We care about events dropped after
@@ -7545,7 +8017,7 @@ static __init int test_ringbuffer(void)
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
- /* Just run for 10 seconds */;
+ /* Just run for 10 seconds */
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);