Diffstat (limited to 'kernel/trace/ring_buffer.c')
 -rw-r--r--  kernel/trace/ring_buffer.c | 410
 1 file changed, 351 insertions(+), 59 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f16f053ef77d..cef49f8871d2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
*/
+#include <linux/ring_buffer_types.h>
#include <linux/sched/isolation.h>
#include <linux/trace_recursion.h>
#include <linux/trace_events.h>
@@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
/* Used for individual buffers (after the counter) */
#define RB_BUFFER_OFF (1 << 20)
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
-#define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
-
-#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS
-# define RB_FORCE_8BYTE_ALIGNMENT 0
-# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
-#else
-# define RB_FORCE_8BYTE_ALIGNMENT 1
-# define RB_ARCH_ALIGNMENT 8U
-#endif
-
-#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT)
-
/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_online_buffer_cpu(buffer, cpu) \
for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
-#define TS_SHIFT 27
-#define TS_MASK ((1ULL << TS_SHIFT) - 1)
-#define TS_DELTA_TEST (~TS_MASK)
-
static u64 rb_event_time_stamp(struct ring_buffer_event *event)
{
u64 ts;
@@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event)
#define RB_MISSED_MASK (3 << 30)
-struct buffer_data_page {
- u64 time_stamp; /* page time stamp */
- local_t commit; /* write committed index */
- unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
-};
-
struct buffer_data_read_page {
unsigned order; /* order of the page */
struct buffer_data_page *data; /* actual data, stored in this page */
@@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order)
return dpage;
}
-/*
- * We need to fit the time_stamp delta into 27 bits.
- */
-static inline bool test_time_stamp(u64 delta)
-{
- return !!(delta & TS_DELTA_TEST);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -555,10 +521,12 @@ struct ring_buffer_per_cpu {
unsigned int mapped;
unsigned int user_mapped; /* user space mapping */
struct mutex mapping_lock;
- unsigned long *subbuf_ids; /* ID to subbuf VA */
+ struct buffer_page **subbuf_ids; /* ID to subbuf VA */
struct trace_buffer_meta *meta_page;
struct ring_buffer_cpu_meta *ring_meta;
+ struct ring_buffer_remote *remote;
+
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
@@ -581,6 +549,8 @@ struct trace_buffer {
struct ring_buffer_per_cpu **buffers;
+ struct ring_buffer_remote *remote;
+
struct hlist_node node;
u64 (*clock)(void);
@@ -627,16 +597,17 @@ int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq
(unsigned int)sizeof(field.commit),
(unsigned int)is_signed_type(long));
- trace_seq_printf(s, "\tfield: int overwrite;\t"
+ trace_seq_printf(s, "\tfield: char overwrite;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), commit),
1,
- (unsigned int)is_signed_type(long));
+ (unsigned int)is_signed_type(char));
trace_seq_printf(s, "\tfield: char data;\t"
"offset:%u;\tsize:%u;\tsigned:%u;\n",
(unsigned int)offsetof(typeof(field), data),
- (unsigned int)buffer->subbuf_size,
+ (unsigned int)(buffer ? buffer->subbuf_size :
+ PAGE_SIZE - BUF_PAGE_HDR_SIZE),
(unsigned int)is_signed_type(char));
return !trace_seq_has_overflowed(s);
@@ -2053,7 +2024,7 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
- local_set(&cpu_buffer->head_page->entries, ret);
+ local_set(&head_page->entries, ret);
if (head_page == cpu_buffer->commit_page)
break;
@@ -2238,6 +2209,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
+static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu)
+{
+ struct ring_buffer_desc *desc, *end;
+ size_t len;
+ int i;
+
+ if (!trace_desc)
+ return NULL;
+
+ if (cpu >= trace_desc->nr_cpus)
+ return NULL;
+
+ end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len);
+ desc = __first_ring_buffer_desc(trace_desc);
+ len = struct_size(desc, page_va, desc->nr_page_va);
+ desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu));
+
+ if (desc < end && desc->cpu == cpu)
+ return desc;
+
+ /* Missing CPUs, need to linear search */
+ for_each_ring_buffer_desc(desc, i, trace_desc) {
+ if (desc->cpu == cpu)
+ return desc;
+ }
+
+ return NULL;
+}
+
+static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, unsigned int page_id)
+{
+ return page_id >= desc->nr_page_va ? NULL : (void *)desc->page_va[page_id];
+}
+
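The two lookup helpers above walk a writer-provided descriptor table. A rough sketch of the layout they assume follows; the authoritative definitions belong to <linux/ring_buffer_types.h> (included at the top of this patch), and the field names here are only inferred from how they are used in this file.

/* Sketch only: inferred from the accesses above, not copied from the real header. */
struct ring_buffer_desc {
	int		cpu;		/* CPU this descriptor covers */
	unsigned int	nr_page_va;	/* entries in page_va[], reader page included */
	unsigned long	meta_va;	/* VA of the writer's meta page */
	unsigned long	page_va[];	/* VAs of the sub-buffers */
};

struct trace_buffer_desc {
	int	nr_cpus;	/* number of per-CPU descriptors that follow */
	size_t	struct_len;	/* total length, used as the end marker above */
	/*
	 * nr_cpus struct ring_buffer_desc entries follow; __first_ring_buffer_desc()
	 * and for_each_ring_buffer_desc() from the same header iterate over them.
	 */
};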
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
{
@@ -2245,6 +2250,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_cpu_meta *meta = NULL;
struct buffer_page *bpage, *tmp;
bool user_thread = current->mm != NULL;
+ struct ring_buffer_desc *desc = NULL;
long i;
/*
@@ -2273,6 +2279,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
if (buffer->range_addr_start)
meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu);
+ if (buffer->remote) {
+ desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu);
+ if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1)))
+ return -EINVAL;
+ }
+
for (i = 0; i < nr_pages; i++) {
bpage = alloc_cpu_page(cpu_buffer->cpu);
@@ -2297,6 +2309,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
bpage->id = i + 1;
+ } else if (desc) {
+ void *p = ring_buffer_desc_page(desc, i + 1);
+
+ if (WARN_ON(!p))
+ goto free_pages;
+
+ bpage->page = p;
+ bpage->range = 1; /* bpage->page can't be freed */
+ bpage->id = i + 1;
+ cpu_buffer->subbuf_ids[i + 1] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu_buffer->cpu, order);
@@ -2394,6 +2416,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
if (cpu_buffer->ring_meta->head_buffer)
rb_meta_buffer_update(cpu_buffer, bpage);
bpage->range = 1;
+ } else if (buffer->remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu);
+
+ if (!desc)
+ goto fail_free_reader;
+
+ cpu_buffer->remote = buffer->remote;
+ cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va;
+ cpu_buffer->nr_pages = nr_pages;
+ cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1,
+ sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL);
+ if (!cpu_buffer->subbuf_ids)
+ goto fail_free_reader;
+
+ /* Remote buffers are read-only and immutable */
+ atomic_inc(&cpu_buffer->record_disabled);
+ atomic_inc(&cpu_buffer->resize_disabled);
+
+ bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id);
+ if (!bpage->page)
+ goto fail_free_reader;
+
+ bpage->range = 1;
+ cpu_buffer->subbuf_ids[0] = bpage;
} else {
int order = cpu_buffer->buffer->subbuf_order;
bpage->page = alloc_cpu_data(cpu, order);
@@ -2453,6 +2499,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
irq_work_sync(&cpu_buffer->irq_work.work);
+ if (cpu_buffer->remote)
+ kfree(cpu_buffer->subbuf_ids);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -2475,7 +2524,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
int order, unsigned long start,
unsigned long end,
unsigned long scratch_size,
- struct lock_class_key *key)
+ struct lock_class_key *key,
+ struct ring_buffer_remote *remote)
{
struct trace_buffer *buffer __free(kfree) = NULL;
long nr_pages;
@@ -2515,6 +2565,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
if (!buffer->buffers)
goto fail_free_cpumask;
+ cpu = raw_smp_processor_id();
+
/* If start/end are specified, then that overrides size */
if (start && end) {
unsigned long buffers_start;
@@ -2570,6 +2622,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
buffer->range_addr_end = end;
rb_range_meta_init(buffer, nr_pages, scratch_size);
+ } else if (remote) {
+ struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu);
+
+ buffer->remote = remote;
+ /* The writer is remote. This ring-buffer is read-only */
+ atomic_inc(&buffer->record_disabled);
+ nr_pages = desc->nr_page_va - 1;
+ if (nr_pages < 2)
+ goto fail_free_buffers;
} else {
/* need at least two pages */
@@ -2578,7 +2639,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags,
nr_pages = 2;
}
- cpu = raw_smp_processor_id();
cpumask_set_cpu(cpu, buffer->cpumask);
buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu);
if (!buffer->buffers[cpu])
@@ -2620,7 +2680,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
/* Default buffer page size - one system page */
- return alloc_buffer(size, flags, 0, 0, 0, 0, key);
+ return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL);
}
EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
@@ -2647,7 +2707,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag
struct lock_class_key *key)
{
return alloc_buffer(size, flags, order, start, start + range_size,
- scratch_size, key);
+ scratch_size, key, NULL);
+}
+
+/**
+ * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote
+ * @remote: Contains a description of the ring-buffer pages and remote callbacks.
+ * @key: ring buffer reader_lock_key.
+ */
+struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote,
+ struct lock_class_key *key)
+{
+ return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote);
}
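A minimal sketch of how a reader-side driver might attach to a remote writer through this entry point. The names, the callback signature and the use of a static lock class key are assumptions for illustration, not part of the patch:

#include <linux/ring_buffer.h>

static int example_swap_reader_page(int cpu, void *priv)
{
	/* Ask the remote writer (e.g. a hypervisor) to swap in a fresh reader page for @cpu. */
	return 0;
}

static struct ring_buffer_remote example_remote = {
	.desc			= NULL,	/* set to the writer-provided trace_buffer_desc */
	.swap_reader_page	= example_swap_reader_page,
	.priv			= NULL,
};

static struct trace_buffer *example_attach_remote(void)
{
	static struct lock_class_key key;

	return __ring_buffer_alloc_remote(&example_remote, &key);
}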
void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size)
@@ -4435,18 +4506,20 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
ret = rb_read_data_buffer(bpage, tail, cpu_buffer->cpu, &ts, &delta);
if (ret < 0) {
if (delta < ts) {
- buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
- cpu_buffer->cpu, ts, delta);
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld clock:%pS\n",
+ cpu_buffer->cpu, ts, delta,
+ cpu_buffer->buffer->clock);
goto out;
}
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\ntrace clock:%pS",
cpu_buffer->cpu,
ts + info->delta, info->ts, info->delta,
info->before, info->after,
- full ? " (full)" : "", show_interrupt_level());
+ full ? " (full)" : "", show_interrupt_level(),
+ cpu_buffer->buffer->clock);
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -5274,10 +5347,61 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
}
EXPORT_SYMBOL_GPL(ring_buffer_overruns);
+static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries));
+ local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun));
+ local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched));
+ local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost));
+
+ return rb_num_of_entries(cpu_buffer);
+}
+
+static void rb_update_remote_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *next, *orig;
+ int retry = 3;
+
+ orig = next = cpu_buffer->head_page;
+ rb_inc_page(&next);
+
+ /* Run after the writer */
+ while (cpu_buffer->head_page->page->time_stamp > next->page->time_stamp) {
+ rb_inc_page(&next);
+
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+ rb_inc_page(&cpu_buffer->head_page);
+ rb_set_list_to_head(cpu_buffer->head_page->list.prev);
+
+ if (cpu_buffer->head_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+
+ orig = cpu_buffer->commit_page = cpu_buffer->head_page;
+ retry = 3;
+
+ while (cpu_buffer->commit_page->page->time_stamp < next->page->time_stamp) {
+ rb_inc_page(&next);
+ rb_inc_page(&cpu_buffer->commit_page);
+
+ if (cpu_buffer->commit_page == orig) {
+ if (WARN_ON_ONCE(!(--retry)))
+ return;
+ }
+ }
+}
+
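To make the two loops above concrete, here is a small stand-alone model of the walk (plain C, assumptions: four sub-buffers, retry guard omitted). It shows where the head and commit pages settle for timestamps laid out in ring order:

#include <stdio.h>

int main(void)
{
	unsigned long long ts[] = { 40, 10, 20, 30 };	/* page time stamps, ring order */
	int n = 4, head = 0, next = 1, commit;

	while (ts[head] > ts[next]) {		/* run after the writer: find the oldest page */
		next = (next + 1) % n;
		head = (head + 1) % n;
	}
	commit = head;
	while (ts[commit] < ts[next]) {		/* then catch up to the newest page */
		next = (next + 1) % n;
		commit = (commit + 1) % n;
	}
	printf("head=%d (ts=%llu) commit=%d (ts=%llu)\n", head, ts[head], commit, ts[commit]);
	return 0;	/* prints: head=1 (ts=10) commit=0 (ts=40) */
}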
static void rb_iter_reset(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ if (cpu_buffer->remote) {
+ rb_read_remote_meta_page(cpu_buffer);
+ rb_update_remote_head(cpu_buffer);
+ }
+
/* Iterator usage is expected to have record disabled */
iter->head_page = cpu_buffer->reader_page;
iter->head = cpu_buffer->reader_page->read;
@@ -5428,7 +5552,65 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
}
static struct buffer_page *
-rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *new_reader, *prev_reader, *prev_head, *new_head, *last;
+
+ if (!rb_read_remote_meta_page(cpu_buffer))
+ return NULL;
+
+ /* More to read on the reader page */
+ if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) {
+ if (!cpu_buffer->reader_page->read)
+ cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
+ return cpu_buffer->reader_page;
+ }
+
+ prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu,
+ cpu_buffer->remote->priv));
+ /* nr_pages doesn't include the reader page */
+ if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages))
+ return NULL;
+
+ new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id];
+
+ WARN_ON_ONCE(prev_reader == new_reader);
+
+ prev_head = new_reader; /* New reader was also the previous head */
+ new_head = prev_head;
+ rb_inc_page(&new_head);
+ last = prev_head;
+ rb_dec_page(&last);
+
+ /* Clear the old HEAD flag */
+ rb_list_head_clear(cpu_buffer->head_page->list.prev);
+
+ prev_reader->list.next = prev_head->list.next;
+ prev_reader->list.prev = prev_head->list.prev;
+
+ /* Swap prev_reader with new_reader */
+ last->list.next = &prev_reader->list;
+ new_head->list.prev = &prev_reader->list;
+
+ new_reader->list.prev = &new_reader->list;
+ new_reader->list.next = &new_head->list;
+
+ /* Reactivate the HEAD flag */
+ rb_set_list_to_head(&last->list);
+
+ cpu_buffer->head_page = new_head;
+ cpu_buffer->reader_page = new_reader;
+ cpu_buffer->pages = &new_head->list;
+ cpu_buffer->read_stamp = new_reader->page->time_stamp;
+ cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events;
+
+ return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL;
+}
+
+static struct buffer_page *
+__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
@@ -5598,6 +5780,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
return reader;
}
+static struct buffer_page *
+rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) :
+ __rb_get_reader_page(cpu_buffer);
+}
+
static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_event *event;
@@ -6154,6 +6343,8 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
meta->entries = local_read(&cpu_buffer->entries);
meta->overrun = local_read(&cpu_buffer->overrun);
meta->read = cpu_buffer->read;
+ meta->pages_lost = local_read(&cpu_buffer->pages_lost);
+ meta->pages_touched = local_read(&cpu_buffer->pages_touched);
/* Some archs do not have data cache coherency between kernel and user-space */
flush_kernel_vmap_range(cpu_buffer->meta_page, PAGE_SIZE);
@@ -6164,6 +6355,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *page;
+ if (cpu_buffer->remote) {
+ if (!cpu_buffer->remote->reset)
+ return;
+
+ cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv);
+ rb_read_remote_meta_page(cpu_buffer);
+
+ /* Read related values, not covered by the meta-page */
+ local_set(&cpu_buffer->pages_read, 0);
+ cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
+ cpu_buffer->last_overrun = 0;
+ cpu_buffer->reader_page->read = 0;
+
+ return;
+ }
+
rb_head_page_deactivate(cpu_buffer);
cpu_buffer->head_page
@@ -6394,6 +6602,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (cpu != RING_BUFFER_ALL_CPUS) {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ if (rb_read_remote_meta_page(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+
+ return 0;
+ }
+
+ guard(cpus_read_lock)();
+
+ /*
+ * Make sure all the ring buffers are up to date before we start reading
+ * them.
+ */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(raw_spinlock)(&cpu_buffer->reader_lock);
+ rb_read_remote_meta_page(cpu_buffer);
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (rb_num_of_entries(cpu_buffer))
+ rb_wakeups(buffer, cpu_buffer);
+ }
+
+ return 0;
+}
+
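Because the remote writer cannot trigger this kernel's irq_work wakeups itself, a reader-side driver is expected to call ring_buffer_poll_remote() periodically. Below is a minimal sketch of such a poller; the workqueue plumbing and the 100 ms period are assumptions, not part of the patch:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/ring_buffer.h>

static struct trace_buffer *example_buffer;	/* from __ring_buffer_alloc_remote() */
static struct delayed_work example_poll_work;

static void example_poll(struct work_struct *work)
{
	/* Refresh every per-CPU meta page and wake up any waiting readers. */
	ring_buffer_poll_remote(example_buffer, RING_BUFFER_ALL_CPUS);
	schedule_delayed_work(&example_poll_work, msecs_to_jiffies(100));
}

static void example_start_polling(void)
{
	INIT_DELAYED_WORK(&example_poll_work, example_poll);
	schedule_delayed_work(&example_poll_work, msecs_to_jiffies(100));
}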
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
/**
* ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -6632,6 +6880,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
unsigned int commit;
unsigned int read;
u64 save_timestamp;
+ bool force_memcpy;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return -1;
@@ -6669,6 +6918,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* Check if any events were dropped */
missed_events = cpu_buffer->lost_events;
+ force_memcpy = cpu_buffer->mapped || cpu_buffer->remote;
+
/*
* If this page has been partially read or
* if len is not big enough to read the rest of the page or
@@ -6678,7 +6929,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
*/
if (read || (len < (commit - read)) ||
cpu_buffer->reader_page == cpu_buffer->commit_page ||
- cpu_buffer->mapped) {
+ force_memcpy) {
struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
unsigned int rpos = read;
unsigned int pos = 0;
@@ -7034,7 +7285,7 @@ static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
}
static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long *subbuf_ids)
+ struct buffer_page **subbuf_ids)
{
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
@@ -7043,7 +7294,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
int id = 0;
id = rb_page_id(cpu_buffer, cpu_buffer->reader_page, id);
- subbuf_ids[id++] = (unsigned long)cpu_buffer->reader_page->page;
+ subbuf_ids[id++] = cpu_buffer->reader_page;
cnt++;
first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
@@ -7053,7 +7304,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
if (WARN_ON(id >= nr_subbufs))
break;
- subbuf_ids[id] = (unsigned long)subbuf->page;
+ subbuf_ids[id] = subbuf;
rb_inc_page(&subbuf);
id++;
@@ -7062,7 +7313,7 @@ static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
WARN_ON(cnt != nr_subbufs);
- /* install subbuf ID to kern VA translation */
+ /* install subbuf ID to bpage translation */
cpu_buffer->subbuf_ids = subbuf_ids;
meta->meta_struct_len = sizeof(*meta);
@@ -7218,13 +7469,15 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
+ struct buffer_page *subbuf;
struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs))
return -EINVAL;
- page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ subbuf = cpu_buffer->subbuf_ids[s];
+ page = virt_to_page((void *)subbuf->page);
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
@@ -7251,10 +7504,11 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
struct vm_area_struct *vma)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags, *subbuf_ids;
+ struct buffer_page **subbuf_ids;
+ unsigned long flags;
int err;
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote)
return -EINVAL;
cpu_buffer = buffer->buffers[cpu];
@@ -7275,7 +7529,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
if (err)
return err;
- /* subbuf_ids include the reader while nr_pages does not */
+ /* subbuf_ids includes the reader while nr_pages does not */
subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
if (!subbuf_ids) {
rb_free_meta_page(cpu_buffer);
@@ -7310,6 +7564,27 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
return err;
}
+/*
+ * This is called when a VMA is duplicated (e.g., on fork()) to increment
+ * the user_mapped counter without remapping pages.
+ */
+void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask)))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ guard(mutex)(&cpu_buffer->mapping_lock);
+
+ if (cpu_buffer->user_mapped)
+ __rb_inc_dec_mapped(cpu_buffer, true);
+ else
+ WARN(1, "Unexpected buffer stat, it should be mapped");
+}
+
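The intended caller is the tracing mmap path: a vm_operations_struct ->open() handler, which the mm core invokes when a mapped VMA is duplicated. A sketch with a hypothetical per-VMA cookie:

#include <linux/mm.h>
#include <linux/ring_buffer.h>

struct example_mmap_cookie {		/* hypothetical vm_private_data */
	struct trace_buffer	*buffer;
	int			cpu;
};

static void example_mmap_open(struct vm_area_struct *vma)
{
	struct example_mmap_cookie *cookie = vma->vm_private_data;

	/* The child inherits the mapping on fork(); account for the extra user. */
	ring_buffer_map_dup(cookie->buffer, cookie->cpu);
}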
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
@@ -7447,6 +7722,12 @@ out:
return 0;
}
+static void rb_cpu_sync(void *data)
+{
+ /* Not really needed, but documents what is happening */
+ smp_rmb();
+}
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
@@ -7485,7 +7766,18 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
cpu);
return -ENOMEM;
}
- smp_wmb();
+
+ /*
+ * Ensure trace_buffer readers observe the newly allocated
+ * ring_buffer_per_cpu before they check the cpumask. Instead of using a
+ * read barrier for all readers, send an IPI.
+ */
+ if (unlikely(system_state == SYSTEM_RUNNING)) {
+ on_each_cpu(rb_cpu_sync, NULL, 1);
+ /* Not really needed, but documents what is happening */
+ smp_wmb();
+ }
+
cpumask_set_cpu(cpu, buffer->cpumask);
return 0;
}
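For context, the ordering provided by the IPI matters because lock-less readers test the cpumask and then immediately dereference the per-CPU buffer. A minimal sketch of how such a reader looks inside this file (not an existing function):

static bool example_cpu_has_data(struct trace_buffer *buffer, int cpu)
{
	if (!cpumask_test_cpu(cpu, buffer->cpumask))
		return false;

	/*
	 * No smp_rmb() needed: per the hotplug change above, every CPU ran
	 * rb_cpu_sync() before the cpumask bit was set, so the newly
	 * allocated buffer->buffers[cpu] is already visible here.
	 */
	return !ring_buffer_empty_cpu(buffer, cpu);
}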