From b7facbaec75a20f34c2065121dc423971682f922 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 16 Jul 2014 15:55:29 +0300 Subject: amdkfd: Add IOCTL set definitions of amdkfd - KFD_IOC_GET_VERSION: Retrieves the interface version of amdkfd - KFD_IOC_CREATE_QUEUE: Creates a usermode queue that runs on a specific GPU device - KFD_IOC_DESTROY_QUEUE: Destroys an existing usermode queue - KFD_IOC_SET_MEMORY_POLICY: Sets the memory policy of the default and alternate aperture of the calling process - KFD_IOC_GET_CLOCK_COUNTERS: Retrieves counters (timestamps) of CPU and GPU - KFD_IOC_GET_PROCESS_APERTURES: Retrieves information about process apertures that were initialized during the open() call of the amdkfd device - KFD_IOC_UPDATE_QUEUE: Updates configuration of an existing usermode queue v3: Remove pragma pack and pmc ioctls. Added parameter for doorbell offset and a comment on counters v5: Add define for AQL queues. Fix arguments of Get Version IOCTL Make IOCTL's structures to be the same size on 32/64 bit v6: Change the version of the amdkfd-thunk interface Signed-off-by: Oded Gabbay --- include/uapi/linux/kfd_ioctl.h | 154 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 include/uapi/linux/kfd_ioctl.h (limited to 'include/uapi') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h new file mode 100644 index 000000000000..7acef41fc209 --- /dev/null +++ b/include/uapi/linux/kfd_ioctl.h @@ -0,0 +1,154 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_IOCTL_H_INCLUDED +#define KFD_IOCTL_H_INCLUDED + +#include +#include + +#define KFD_IOCTL_MAJOR_VERSION 1 +#define KFD_IOCTL_MINOR_VERSION 0 + +struct kfd_ioctl_get_version_args { + uint32_t major_version; /* from KFD */ + uint32_t minor_version; /* from KFD */ +}; + +/* For kfd_ioctl_create_queue_args.queue_type. */ +#define KFD_IOC_QUEUE_TYPE_COMPUTE 0 +#define KFD_IOC_QUEUE_TYPE_SDMA 1 +#define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL 2 + +#define KFD_MAX_QUEUE_PERCENTAGE 100 +#define KFD_MAX_QUEUE_PRIORITY 15 + +struct kfd_ioctl_create_queue_args { + uint64_t ring_base_address; /* to KFD */ + uint64_t write_pointer_address; /* from KFD */ + uint64_t read_pointer_address; /* from KFD */ + uint64_t doorbell_offset; /* from KFD */ + + uint32_t ring_size; /* to KFD */ + uint32_t gpu_id; /* to KFD */ + uint32_t queue_type; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ + uint32_t queue_id; /* from KFD */ + + uint64_t eop_buffer_address; /* to KFD */ + uint64_t eop_buffer_size; /* to KFD */ + uint64_t ctx_save_restore_address; /* to KFD */ + uint64_t ctx_save_restore_size; /* to KFD */ +}; + +struct kfd_ioctl_destroy_queue_args { + uint32_t queue_id; /* to KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_update_queue_args { + uint64_t ring_base_address; /* to KFD */ + + uint32_t queue_id; /* to KFD */ + uint32_t ring_size; /* to KFD */ + uint32_t queue_percentage; /* to KFD */ + uint32_t queue_priority; /* to KFD */ +}; + +/* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */ +#define KFD_IOC_CACHE_POLICY_COHERENT 0 +#define KFD_IOC_CACHE_POLICY_NONCOHERENT 1 + +struct kfd_ioctl_set_memory_policy_args { + uint64_t alternate_aperture_base; /* to KFD */ + uint64_t alternate_aperture_size; /* to KFD */ + + uint32_t gpu_id; /* to KFD */ + uint32_t default_policy; /* to KFD */ + uint32_t alternate_policy; /* to KFD */ + uint32_t pad; +}; + +/* + * All counters are monotonic. They are used for profiling of compute jobs. + * The profiling is done by userspace. + * + * In case of GPU reset, the counter should not be affected. + */ + +struct kfd_ioctl_get_clock_counters_args { + uint64_t gpu_clock_counter; /* from KFD */ + uint64_t cpu_clock_counter; /* from KFD */ + uint64_t system_clock_counter; /* from KFD */ + uint64_t system_clock_freq; /* from KFD */ + + uint32_t gpu_id; /* to KFD */ + uint32_t pad; +}; + +#define NUM_OF_SUPPORTED_GPUS 7 + +struct kfd_process_device_apertures { + uint64_t lds_base; /* from KFD */ + uint64_t lds_limit; /* from KFD */ + uint64_t scratch_base; /* from KFD */ + uint64_t scratch_limit; /* from KFD */ + uint64_t gpuvm_base; /* from KFD */ + uint64_t gpuvm_limit; /* from KFD */ + uint32_t gpu_id; /* from KFD */ + uint32_t pad; +}; + +struct kfd_ioctl_get_process_apertures_args { + struct kfd_process_device_apertures + process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */ + + /* from KFD, should be in the range [1 - NUM_OF_SUPPORTED_GPUS] */ + uint32_t num_of_nodes; + uint32_t pad; +}; + +#define KFD_IOC_MAGIC 'K' + +#define KFD_IOC_GET_VERSION \ + _IOR(KFD_IOC_MAGIC, 1, struct kfd_ioctl_get_version_args) + +#define KFD_IOC_CREATE_QUEUE \ + _IOWR(KFD_IOC_MAGIC, 2, struct kfd_ioctl_create_queue_args) + +#define KFD_IOC_DESTROY_QUEUE \ + _IOWR(KFD_IOC_MAGIC, 3, struct kfd_ioctl_destroy_queue_args) + +#define KFD_IOC_SET_MEMORY_POLICY \ + _IOW(KFD_IOC_MAGIC, 4, struct kfd_ioctl_set_memory_policy_args) + +#define KFD_IOC_GET_CLOCK_COUNTERS \ + _IOWR(KFD_IOC_MAGIC, 5, struct kfd_ioctl_get_clock_counters_args) + +#define KFD_IOC_GET_PROCESS_APERTURES \ + _IOR(KFD_IOC_MAGIC, 6, struct kfd_ioctl_get_process_apertures_args) + +#define KFD_IOC_UPDATE_QUEUE \ + _IOW(KFD_IOC_MAGIC, 7, struct kfd_ioctl_update_queue_args) + +#endif -- cgit v1.2.3 From f72a113a71ab08c4df8a5f80ab2f8a140feb81f6 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 7 Aug 2014 09:36:00 +0200 Subject: drm/radeon: add userptr support v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds an IOCTL for turning a pointer supplied by userspace into a buffer object. It imposes several restrictions upon the memory being mapped: 1. It must be page aligned (both start/end addresses, i.e ptr and size). 2. It must be normal system memory, not a pointer into another map of IO space (e.g. it must not be a GTT mmapping of another object). 3. The BO is mapped into GTT, so the maximum amount of memory mapped at all times is still the GTT limit. 4. The BO is only mapped readonly for now, so no write support. 5. List of backing pages is only acquired once, so they represent a snapshot of the first use. Exporting and sharing as well as mapping of buffer objects created by this function is forbidden and results in an -EPERM. v2: squash all previous changes into first public version v3: fix tabs, map readonly, don't use MM callback any more v4: set TTM_PAGE_FLAG_SG so that TTM never messes with the pages, pin/unpin pages on bind/unbind instead of populate/unpopulate v5: rebased on 3.17-wip, IOCTL renamed to userptr, reject any unknown flags, better handle READONLY flag, improve permission check v6: fix ptr cast warning, use set_page_dirty/mark_page_accessed on unpin v7: add warning about it's availability in the API definition v8: drop access_ok check, fix VM mapping bits Signed-off-by: Christian König Reviewed-by: Alex Deucher (v4) Reviewed-by: Jérôme Glisse (v4) Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon.h | 6 ++ drivers/gpu/drm/radeon/radeon_cs.c | 25 +++++- drivers/gpu/drm/radeon/radeon_drv.c | 5 +- drivers/gpu/drm/radeon/radeon_gem.c | 68 ++++++++++++++++ drivers/gpu/drm/radeon/radeon_kms.c | 1 + drivers/gpu/drm/radeon/radeon_object.c | 3 + drivers/gpu/drm/radeon/radeon_prime.c | 10 +++ drivers/gpu/drm/radeon/radeon_ttm.c | 145 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_vm.c | 3 + include/uapi/drm/radeon_drm.h | 16 ++++ 10 files changed, 279 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 9e1732eb402c..6f38a23a5810 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -2138,6 +2138,8 @@ int radeon_gem_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); int radeon_gem_create_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp); int radeon_gem_pin_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); int radeon_gem_unpin_ioctl(struct drm_device *dev, void *data, @@ -2871,6 +2873,10 @@ extern void radeon_legacy_set_clock_gating(struct radeon_device *rdev, int enabl extern void radeon_atom_set_clock_gating(struct radeon_device *rdev, int enable); extern void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain); extern bool radeon_ttm_bo_is_radeon_bo(struct ttm_buffer_object *bo); +extern int radeon_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, + uint32_t flags); +extern bool radeon_ttm_tt_has_userptr(struct ttm_tt *ttm); +extern bool radeon_ttm_tt_is_readonly(struct ttm_tt *ttm); extern void radeon_vram_location(struct radeon_device *rdev, struct radeon_mc *mc, u64 base); extern void radeon_gtt_location(struct radeon_device *rdev, struct radeon_mc *mc); extern int radeon_resume_kms(struct drm_device *dev, bool resume, bool fbcon); diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index ee712c199b25..1321491cf499 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -78,7 +78,8 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) struct radeon_cs_chunk *chunk; struct radeon_cs_buckets buckets; unsigned i, j; - bool duplicate; + bool duplicate, need_mmap_lock = false; + int r; if (p->chunk_relocs_idx == -1) { return 0; @@ -164,6 +165,19 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) p->relocs[i].allowed_domains = domain; } + if (radeon_ttm_tt_has_userptr(p->relocs[i].robj->tbo.ttm)) { + uint32_t domain = p->relocs[i].prefered_domains; + if (!(domain & RADEON_GEM_DOMAIN_GTT)) { + DRM_ERROR("Only RADEON_GEM_DOMAIN_GTT is " + "allowed for userptr BOs\n"); + return -EINVAL; + } + need_mmap_lock = true; + domain = RADEON_GEM_DOMAIN_GTT; + p->relocs[i].prefered_domains = domain; + p->relocs[i].allowed_domains = domain; + } + p->relocs[i].tv.bo = &p->relocs[i].robj->tbo; p->relocs[i].handle = r->handle; @@ -176,8 +190,15 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) if (p->cs_flags & RADEON_CS_USE_VM) p->vm_bos = radeon_vm_get_bos(p->rdev, p->ib.vm, &p->validated); + if (need_mmap_lock) + down_read(¤t->mm->mmap_sem); + + r = radeon_bo_list_validate(p->rdev, &p->ticket, &p->validated, p->ring); - return radeon_bo_list_validate(p->rdev, &p->ticket, &p->validated, p->ring); + if (need_mmap_lock) + up_read(¤t->mm->mmap_sem); + + return r; } static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority) diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index a773830c6c40..5b18af926527 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -114,6 +114,9 @@ int radeon_gem_object_open(struct drm_gem_object *obj, struct drm_file *file_priv); void radeon_gem_object_close(struct drm_gem_object *obj, struct drm_file *file_priv); +struct dma_buf *radeon_gem_prime_export(struct drm_device *dev, + struct drm_gem_object *gobj, + int flags); extern int radeon_get_crtc_scanoutpos(struct drm_device *dev, int crtc, unsigned int flags, int *vpos, int *hpos, ktime_t *stime, @@ -568,7 +571,7 @@ static struct drm_driver kms_driver = { .prime_handle_to_fd = drm_gem_prime_handle_to_fd, .prime_fd_to_handle = drm_gem_prime_fd_to_handle, - .gem_prime_export = drm_gem_prime_export, + .gem_prime_export = radeon_gem_prime_export, .gem_prime_import = drm_gem_prime_import, .gem_prime_pin = radeon_gem_prime_pin, .gem_prime_unpin = radeon_gem_prime_unpin, diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index bfd7e1b0ff3f..993ab223b503 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -272,6 +272,65 @@ int radeon_gem_create_ioctl(struct drm_device *dev, void *data, return 0; } +int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + struct radeon_device *rdev = dev->dev_private; + struct drm_radeon_gem_userptr *args = data; + struct drm_gem_object *gobj; + struct radeon_bo *bo; + uint32_t handle; + int r; + + if (offset_in_page(args->addr | args->size)) + return -EINVAL; + + /* we only support read only mappings for now */ + if (!(args->flags & RADEON_GEM_USERPTR_READONLY)) + return -EACCES; + + /* reject unknown flag values */ + if (args->flags & ~RADEON_GEM_USERPTR_READONLY) + return -EINVAL; + + /* readonly pages not tested on older hardware */ + if (rdev->family < CHIP_R600) + return -EINVAL; + + down_read(&rdev->exclusive_lock); + + /* create a gem object to contain this object in */ + r = radeon_gem_object_create(rdev, args->size, 0, + RADEON_GEM_DOMAIN_CPU, 0, + false, &gobj); + if (r) + goto handle_lockup; + + bo = gem_to_radeon_bo(gobj); + r = radeon_ttm_tt_set_userptr(bo->tbo.ttm, args->addr, args->flags); + if (r) + goto release_object; + + r = drm_gem_handle_create(filp, gobj, &handle); + /* drop reference from allocate - handle holds it now */ + drm_gem_object_unreference_unlocked(gobj); + if (r) + goto handle_lockup; + + args->handle = handle; + up_read(&rdev->exclusive_lock); + return 0; + +release_object: + drm_gem_object_unreference_unlocked(gobj); + +handle_lockup: + up_read(&rdev->exclusive_lock); + r = radeon_gem_handle_lockup(rdev, r); + + return r; +} + int radeon_gem_set_domain_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) { @@ -315,6 +374,10 @@ int radeon_mode_dumb_mmap(struct drm_file *filp, return -ENOENT; } robj = gem_to_radeon_bo(gobj); + if (radeon_ttm_tt_has_userptr(robj->tbo.ttm)) { + drm_gem_object_unreference_unlocked(gobj); + return -EPERM; + } *offset_p = radeon_bo_mmap_offset(robj); drm_gem_object_unreference_unlocked(gobj); return 0; @@ -532,6 +595,11 @@ int radeon_gem_op_ioctl(struct drm_device *dev, void *data, return -ENOENT; } robj = gem_to_radeon_bo(gobj); + + r = -EPERM; + if (radeon_ttm_tt_has_userptr(robj->tbo.ttm)) + goto out; + r = radeon_bo_reserve(robj, false); if (unlikely(r)) goto out; diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index eb7164d07985..8309b11e674d 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -885,5 +885,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(RADEON_GEM_USERPTR, radeon_gem_userptr_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), }; int radeon_max_kms_ioctl = ARRAY_SIZE(radeon_ioctls_kms); diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 480c87d8edc5..c73c1e320585 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -264,6 +264,9 @@ int radeon_bo_pin_restricted(struct radeon_bo *bo, u32 domain, u64 max_offset, { int r, i; + if (radeon_ttm_tt_has_userptr(bo->tbo.ttm)) + return -EPERM; + if (bo->pin_count) { bo->pin_count++; if (gpu_addr) diff --git a/drivers/gpu/drm/radeon/radeon_prime.c b/drivers/gpu/drm/radeon/radeon_prime.c index f7e48d329db3..bb18bc74b7d7 100644 --- a/drivers/gpu/drm/radeon/radeon_prime.c +++ b/drivers/gpu/drm/radeon/radeon_prime.c @@ -103,3 +103,13 @@ void radeon_gem_prime_unpin(struct drm_gem_object *obj) radeon_bo_unpin(bo); radeon_bo_unreserve(bo); } + +struct dma_buf *radeon_gem_prime_export(struct drm_device *dev, + struct drm_gem_object *gobj, + int flags) +{ + struct radeon_bo *bo = gem_to_radeon_bo(gobj); + if (radeon_ttm_tt_has_userptr(bo->tbo.ttm)) + return ERR_PTR(-EPERM); + return drm_gem_prime_export(dev, gobj, flags); +} diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 72afe82a95c9..b20933fa35c6 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include "radeon_reg.h" #include "radeon.h" @@ -515,8 +517,92 @@ struct radeon_ttm_tt { struct ttm_dma_tt ttm; struct radeon_device *rdev; u64 offset; + + uint64_t userptr; + struct mm_struct *usermm; + uint32_t userflags; }; +/* prepare the sg table with the user pages */ +static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) +{ + struct radeon_device *rdev = radeon_get_rdev(ttm->bdev); + struct radeon_ttm_tt *gtt = (void *)ttm; + unsigned pinned = 0, nents; + int r; + + int write = !(gtt->userflags & RADEON_GEM_USERPTR_READONLY); + enum dma_data_direction direction = write ? + DMA_BIDIRECTIONAL : DMA_TO_DEVICE; + + if (current->mm != gtt->usermm) + return -EPERM; + + do { + unsigned num_pages = ttm->num_pages - pinned; + uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; + struct page **pages = ttm->pages + pinned; + + r = get_user_pages(current, current->mm, userptr, num_pages, + write, 0, pages, NULL); + if (r < 0) + goto release_pages; + + pinned += r; + + } while (pinned < ttm->num_pages); + + r = sg_alloc_table_from_pages(ttm->sg, ttm->pages, ttm->num_pages, 0, + ttm->num_pages << PAGE_SHIFT, + GFP_KERNEL); + if (r) + goto release_sg; + + r = -ENOMEM; + nents = dma_map_sg(rdev->dev, ttm->sg->sgl, ttm->sg->nents, direction); + if (nents != ttm->sg->nents) + goto release_sg; + + drm_prime_sg_to_page_addr_arrays(ttm->sg, ttm->pages, + gtt->ttm.dma_address, ttm->num_pages); + + return 0; + +release_sg: + kfree(ttm->sg); + +release_pages: + release_pages(ttm->pages, pinned, 0); + return r; +} + +static void radeon_ttm_tt_unpin_userptr(struct ttm_tt *ttm) +{ + struct radeon_device *rdev = radeon_get_rdev(ttm->bdev); + struct radeon_ttm_tt *gtt = (void *)ttm; + struct scatterlist *sg; + int i; + + int write = !(gtt->userflags & RADEON_GEM_USERPTR_READONLY); + enum dma_data_direction direction = write ? + DMA_BIDIRECTIONAL : DMA_TO_DEVICE; + + /* free the sg table and pages again */ + dma_unmap_sg(rdev->dev, ttm->sg->sgl, ttm->sg->nents, direction); + + for_each_sg(ttm->sg->sgl, sg, ttm->sg->nents, i) { + struct page *page = sg_page(sg); + + if (!(gtt->userflags & RADEON_GEM_USERPTR_READONLY)) + set_page_dirty(page); + + mark_page_accessed(page); + page_cache_release(page); + } + + sg_free_table(ttm->sg); +} + static int radeon_ttm_backend_bind(struct ttm_tt *ttm, struct ttm_mem_reg *bo_mem) { @@ -525,6 +611,11 @@ static int radeon_ttm_backend_bind(struct ttm_tt *ttm, RADEON_GART_PAGE_WRITE; int r; + if (gtt->userptr) { + radeon_ttm_tt_pin_userptr(ttm); + flags &= ~RADEON_GART_PAGE_WRITE; + } + gtt->offset = (unsigned long)(bo_mem->start << PAGE_SHIFT); if (!ttm->num_pages) { WARN(1, "nothing to bind %lu pages for mreg %p back %p!\n", @@ -547,6 +638,10 @@ static int radeon_ttm_backend_unbind(struct ttm_tt *ttm) struct radeon_ttm_tt *gtt = (void *)ttm; radeon_gart_unbind(gtt->rdev, gtt->offset, ttm->num_pages); + + if (gtt->userptr) + radeon_ttm_tt_unpin_userptr(ttm); + return 0; } @@ -603,6 +698,16 @@ static int radeon_ttm_tt_populate(struct ttm_tt *ttm) if (ttm->state != tt_unpopulated) return 0; + if (gtt->userptr) { + ttm->sg = kcalloc(1, sizeof(struct sg_table), GFP_KERNEL); + if (!ttm->sg) + return -ENOMEM; + + ttm->page_flags |= TTM_PAGE_FLAG_SG; + ttm->state = tt_unbound; + return 0; + } + if (slave && ttm->sg) { drm_prime_sg_to_page_addr_arrays(ttm->sg, ttm->pages, gtt->ttm.dma_address, ttm->num_pages); @@ -652,6 +757,12 @@ static void radeon_ttm_tt_unpopulate(struct ttm_tt *ttm) unsigned i; bool slave = !!(ttm->page_flags & TTM_PAGE_FLAG_SG); + if (gtt->userptr) { + kfree(ttm->sg); + ttm->page_flags &= ~TTM_PAGE_FLAG_SG; + return; + } + if (slave) return; @@ -680,6 +791,40 @@ static void radeon_ttm_tt_unpopulate(struct ttm_tt *ttm) ttm_pool_unpopulate(ttm); } +int radeon_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, + uint32_t flags) +{ + struct radeon_ttm_tt *gtt = (void *)ttm; + + if (gtt == NULL) + return -EINVAL; + + gtt->userptr = addr; + gtt->usermm = current->mm; + gtt->userflags = flags; + return 0; +} + +bool radeon_ttm_tt_has_userptr(struct ttm_tt *ttm) +{ + struct radeon_ttm_tt *gtt = (void *)ttm; + + if (gtt == NULL) + return false; + + return !!gtt->userptr; +} + +bool radeon_ttm_tt_is_readonly(struct ttm_tt *ttm) +{ + struct radeon_ttm_tt *gtt = (void *)ttm; + + if (gtt == NULL) + return false; + + return !!(gtt->userflags & RADEON_GEM_USERPTR_READONLY); +} + static struct ttm_bo_driver radeon_bo_driver = { .ttm_tt_create = &radeon_ttm_tt_create, .ttm_tt_populate = &radeon_ttm_tt_populate, diff --git a/drivers/gpu/drm/radeon/radeon_vm.c b/drivers/gpu/drm/radeon/radeon_vm.c index ccae4d9dc3de..0e107c5650bf 100644 --- a/drivers/gpu/drm/radeon/radeon_vm.c +++ b/drivers/gpu/drm/radeon/radeon_vm.c @@ -888,6 +888,9 @@ int radeon_vm_bo_update(struct radeon_device *rdev, bo_va->flags &= ~RADEON_VM_PAGE_VALID; bo_va->flags &= ~RADEON_VM_PAGE_SYSTEM; bo_va->flags &= ~RADEON_VM_PAGE_SNOOPED; + if (bo_va->bo && radeon_ttm_tt_is_readonly(bo_va->bo->tbo.ttm)) + bo_va->flags &= ~RADEON_VM_PAGE_WRITEABLE; + if (mem) { addr = mem->start << PAGE_SHIFT; if (mem->mem_type != TTM_PL_SYSTEM) { diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 509b2d7a41b7..3a9f20930372 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -511,6 +511,7 @@ typedef struct { #define DRM_RADEON_GEM_BUSY 0x2a #define DRM_RADEON_GEM_VA 0x2b #define DRM_RADEON_GEM_OP 0x2c +#define DRM_RADEON_GEM_USERPTR 0x2d #define DRM_IOCTL_RADEON_CP_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t) #define DRM_IOCTL_RADEON_CP_START DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_START) @@ -554,6 +555,7 @@ typedef struct { #define DRM_IOCTL_RADEON_GEM_BUSY DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy) #define DRM_IOCTL_RADEON_GEM_VA DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va) #define DRM_IOCTL_RADEON_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op) +#define DRM_IOCTL_RADEON_GEM_USERPTR DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_USERPTR, struct drm_radeon_gem_userptr) typedef struct drm_radeon_init { enum { @@ -808,6 +810,20 @@ struct drm_radeon_gem_create { uint32_t flags; }; +/* + * This is not a reliable API and you should expect it to fail for any + * number of reasons and have fallback path that do not use userptr to + * perform any operation. + */ +#define RADEON_GEM_USERPTR_READONLY (1 << 0) + +struct drm_radeon_gem_userptr { + uint64_t addr; + uint64_t size; + uint32_t flags; + uint32_t handle; +}; + #define RADEON_TILING_MACRO 0x1 #define RADEON_TILING_MICRO 0x2 #define RADEON_TILING_SWAP_16BIT 0x4 -- cgit v1.2.3 From ddd00e33e17a62c5f44377ab42e7562ccfae7bd1 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 7 Aug 2014 09:36:01 +0200 Subject: drm/radeon: add userptr flag to limit it to anonymous memory v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid problems with writeback by limiting userptr to anonymous memory. v2: add commit and code comments Signed-off-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_gem.c | 3 ++- drivers/gpu/drm/radeon/radeon_ttm.c | 10 ++++++++++ include/uapi/drm/radeon_drm.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 993ab223b503..032736b429bf 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -290,7 +290,8 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, return -EACCES; /* reject unknown flag values */ - if (args->flags & ~RADEON_GEM_USERPTR_READONLY) + if (args->flags & ~(RADEON_GEM_USERPTR_READONLY | + RADEON_GEM_USERPTR_ANONONLY)) return -EINVAL; /* readonly pages not tested on older hardware */ diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index b20933fa35c6..12e37b1ddc40 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -538,6 +538,16 @@ static int radeon_ttm_tt_pin_userptr(struct ttm_tt *ttm) if (current->mm != gtt->usermm) return -EPERM; + if (gtt->userflags & RADEON_GEM_USERPTR_ANONONLY) { + /* check that we only pin down anonymous memory + to prevent problems with writeback */ + unsigned long end = gtt->userptr + ttm->num_pages * PAGE_SIZE; + struct vm_area_struct *vma; + vma = find_vma(gtt->usermm, gtt->userptr); + if (!vma || vma->vm_file || vma->vm_end < end) + return -EPERM; + } + do { unsigned num_pages = ttm->num_pages - pinned; uint64_t userptr = gtt->userptr + pinned * PAGE_SIZE; diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 3a9f20930372..9720e1a36848 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -816,6 +816,7 @@ struct drm_radeon_gem_create { * perform any operation. */ #define RADEON_GEM_USERPTR_READONLY (1 << 0) +#define RADEON_GEM_USERPTR_ANONONLY (1 << 1) struct drm_radeon_gem_userptr { uint64_t addr; -- cgit v1.2.3 From 2a84a4476d6e13de72472f6ca4338aed0a8269b8 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 7 Aug 2014 09:36:02 +0200 Subject: drm/radeon: add userptr flag to directly validate the BO to GTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This way we test userptr availability at BO creation time instead of first use. Signed-off-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_gem.c | 18 +++++++++++++++++- include/uapi/drm/radeon_drm.h | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 032736b429bf..450656027aba 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -291,7 +291,7 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, /* reject unknown flag values */ if (args->flags & ~(RADEON_GEM_USERPTR_READONLY | - RADEON_GEM_USERPTR_ANONONLY)) + RADEON_GEM_USERPTR_ANONONLY | RADEON_GEM_USERPTR_VALIDATE)) return -EINVAL; /* readonly pages not tested on older hardware */ @@ -312,6 +312,22 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, if (r) goto release_object; + if (args->flags & RADEON_GEM_USERPTR_VALIDATE) { + down_read(¤t->mm->mmap_sem); + r = radeon_bo_reserve(bo, true); + if (r) { + up_read(¤t->mm->mmap_sem); + goto release_object; + } + + radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_GTT); + r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false); + radeon_bo_unreserve(bo); + up_read(¤t->mm->mmap_sem); + if (r) + goto release_object; + } + r = drm_gem_handle_create(filp, gobj, &handle); /* drop reference from allocate - handle holds it now */ drm_gem_object_unreference_unlocked(gobj); diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 9720e1a36848..5dc61c2d4c73 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -817,6 +817,7 @@ struct drm_radeon_gem_create { */ #define RADEON_GEM_USERPTR_READONLY (1 << 0) #define RADEON_GEM_USERPTR_ANONONLY (1 << 1) +#define RADEON_GEM_USERPTR_VALIDATE (1 << 2) struct drm_radeon_gem_userptr { uint64_t addr; -- cgit v1.2.3 From 341cb9e426fac32523427c80c67543a16be46605 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 7 Aug 2014 09:36:03 +0200 Subject: drm/radeon: add userptr flag to register MMU notifier v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Whenever userspace mapping related to our userptr change we wait for it to become idle and unmap it from GTT. v2: rebased, fix mutex unlock in error path v3: improve commit message Signed-off-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/Kconfig | 1 + drivers/gpu/drm/radeon/Makefile | 2 +- drivers/gpu/drm/radeon/radeon.h | 12 ++ drivers/gpu/drm/radeon/radeon_device.c | 2 + drivers/gpu/drm/radeon/radeon_gem.c | 9 +- drivers/gpu/drm/radeon/radeon_mn.c | 272 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_object.c | 1 + include/uapi/drm/radeon_drm.h | 1 + 8 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/radeon/radeon_mn.c (limited to 'include/uapi') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index b066bb3ca01a..358b6e8697e9 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -115,6 +115,7 @@ config DRM_RADEON select HWMON select BACKLIGHT_CLASS_DEVICE select INTERVAL_TREE + select MMU_NOTIFIER help Choose this option if you have an ATI Radeon graphics card. There are both PCI and AGP versions. You don't need to choose this to diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile index 0013ad0db9ef..c7fa1aeb8c3f 100644 --- a/drivers/gpu/drm/radeon/Makefile +++ b/drivers/gpu/drm/radeon/Makefile @@ -80,7 +80,7 @@ radeon-y += radeon_device.o radeon_asic.o radeon_kms.o \ r600_dpm.o rs780_dpm.o rv6xx_dpm.o rv770_dpm.o rv730_dpm.o rv740_dpm.o \ rv770_smc.o cypress_dpm.o btc_dpm.o sumo_dpm.o sumo_smc.o trinity_dpm.o \ trinity_smc.o ni_dpm.o si_smc.o si_dpm.o kv_smc.o kv_dpm.o ci_smc.o \ - ci_dpm.o dce6_afmt.o radeon_vm.o radeon_ucode.o radeon_ib.o + ci_dpm.o dce6_afmt.o radeon_vm.o radeon_ucode.o radeon_ib.o radeon_mn.o # add async DMA block radeon-y += \ diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 6f38a23a5810..542da8208674 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -487,6 +488,9 @@ struct radeon_bo { struct ttm_bo_kmap_obj dma_buf_vmap; pid_t pid; + + struct radeon_mn *mn; + struct interval_tree_node mn_it; }; #define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, gem_base) @@ -1725,6 +1729,11 @@ void radeon_test_ring_sync(struct radeon_device *rdev, struct radeon_ring *cpB); void radeon_test_syncing(struct radeon_device *rdev); +/* + * MMU Notifier + */ +int radeon_mn_register(struct radeon_bo *bo, unsigned long addr); +void radeon_mn_unregister(struct radeon_bo *bo); /* * Debugfs @@ -2372,6 +2381,9 @@ struct radeon_device { /* tracking pinned memory */ u64 vram_pin_size; u64 gart_pin_size; + + struct mutex mn_lock; + DECLARE_HASHTABLE(mn_hash, 7); }; bool radeon_is_px(struct drm_device *dev); diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index c8ea050c8fa4..c58f84f3c6a5 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1270,6 +1270,8 @@ int radeon_device_init(struct radeon_device *rdev, init_rwsem(&rdev->pm.mclk_lock); init_rwsem(&rdev->exclusive_lock); init_waitqueue_head(&rdev->irq.vblank_queue); + mutex_init(&rdev->mn_lock); + hash_init(rdev->mn_hash); r = radeon_gem_init(rdev); if (r) return r; diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 450656027aba..2a6fbf101cf0 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -291,7 +291,8 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, /* reject unknown flag values */ if (args->flags & ~(RADEON_GEM_USERPTR_READONLY | - RADEON_GEM_USERPTR_ANONONLY | RADEON_GEM_USERPTR_VALIDATE)) + RADEON_GEM_USERPTR_ANONONLY | RADEON_GEM_USERPTR_VALIDATE | + RADEON_GEM_USERPTR_REGISTER)) return -EINVAL; /* readonly pages not tested on older hardware */ @@ -312,6 +313,12 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, if (r) goto release_object; + if (args->flags & RADEON_GEM_USERPTR_REGISTER) { + r = radeon_mn_register(bo, args->addr); + if (r) + goto release_object; + } + if (args->flags & RADEON_GEM_USERPTR_VALIDATE) { down_read(¤t->mm->mmap_sem); r = radeon_bo_reserve(bo, true); diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c new file mode 100644 index 000000000000..0157bc2f11f8 --- /dev/null +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -0,0 +1,272 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +/* + * Authors: + * Christian König + */ + +#include +#include +#include +#include +#include + +#include "radeon.h" + +struct radeon_mn { + /* constant after initialisation */ + struct radeon_device *rdev; + struct mm_struct *mm; + struct mmu_notifier mn; + + /* only used on destruction */ + struct work_struct work; + + /* protected by rdev->mn_lock */ + struct hlist_node node; + + /* objects protected by lock */ + struct mutex lock; + struct rb_root objects; +}; + +/** + * radeon_mn_destroy - destroy the rmn + * + * @work: previously sheduled work item + * + * Lazy destroys the notifier from a work item + */ +static void radeon_mn_destroy(struct work_struct *work) +{ + struct radeon_mn *rmn = container_of(work, struct radeon_mn, work); + struct radeon_device *rdev = rmn->rdev; + struct radeon_bo *bo, *next; + + mutex_lock(&rdev->mn_lock); + mutex_lock(&rmn->lock); + hash_del(&rmn->node); + rbtree_postorder_for_each_entry_safe(bo, next, &rmn->objects, mn_it.rb) { + interval_tree_remove(&bo->mn_it, &rmn->objects); + bo->mn = NULL; + } + mutex_unlock(&rmn->lock); + mutex_unlock(&rdev->mn_lock); + mmu_notifier_unregister(&rmn->mn, rmn->mm); + kfree(rmn); +} + +/** + * radeon_mn_release - callback to notify about mm destruction + * + * @mn: our notifier + * @mn: the mm this callback is about + * + * Shedule a work item to lazy destroy our notifier. + */ +static void radeon_mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); + INIT_WORK(&rmn->work, radeon_mn_destroy); + schedule_work(&rmn->work); +} + +/** + * radeon_mn_invalidate_range_start - callback to notify about mm change + * + * @mn: our notifier + * @mn: the mm this callback is about + * @start: start of updated range + * @end: end of updated range + * + * We block for all BOs between start and end to be idle and + * unmap them by move them into system domain again. + */ +static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); + struct interval_tree_node *it; + + /* notification is exclusive, but interval is inclusive */ + end -= 1; + + mutex_lock(&rmn->lock); + + it = interval_tree_iter_first(&rmn->objects, start, end); + while (it) { + struct radeon_bo *bo; + int r; + + bo = container_of(it, struct radeon_bo, mn_it); + it = interval_tree_iter_next(it, start, end); + + r = radeon_bo_reserve(bo, true); + if (r) { + DRM_ERROR("(%d) failed to reserve user bo\n", r); + continue; + } + + if (bo->tbo.sync_obj) { + r = radeon_fence_wait(bo->tbo.sync_obj, false); + if (r) + DRM_ERROR("(%d) failed to wait for user bo\n", r); + } + + radeon_ttm_placement_from_domain(bo, RADEON_GEM_DOMAIN_CPU); + r = ttm_bo_validate(&bo->tbo, &bo->placement, false, false); + if (r) + DRM_ERROR("(%d) failed to validate user bo\n", r); + + radeon_bo_unreserve(bo); + } + + mutex_unlock(&rmn->lock); +} + +static const struct mmu_notifier_ops radeon_mn_ops = { + .release = radeon_mn_release, + .invalidate_range_start = radeon_mn_invalidate_range_start, +}; + +/** + * radeon_mn_get - create notifier context + * + * @rdev: radeon device pointer + * + * Creates a notifier context for current->mm. + */ +static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev) +{ + struct mm_struct *mm = current->mm; + struct radeon_mn *rmn; + int r; + + down_write(&mm->mmap_sem); + mutex_lock(&rdev->mn_lock); + + hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm) + if (rmn->mm == mm) + goto release_locks; + + rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); + if (!rmn) { + rmn = ERR_PTR(-ENOMEM); + goto release_locks; + } + + rmn->rdev = rdev; + rmn->mm = mm; + rmn->mn.ops = &radeon_mn_ops; + mutex_init(&rmn->lock); + rmn->objects = RB_ROOT; + + r = __mmu_notifier_register(&rmn->mn, mm); + if (r) + goto free_rmn; + + hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm); + +release_locks: + mutex_unlock(&rdev->mn_lock); + up_write(&mm->mmap_sem); + + return rmn; + +free_rmn: + mutex_unlock(&rdev->mn_lock); + up_write(&mm->mmap_sem); + kfree(rmn); + + return ERR_PTR(r); +} + +/** + * radeon_mn_register - register a BO for notifier updates + * + * @bo: radeon buffer object + * @addr: userptr addr we should monitor + * + * Registers an MMU notifier for the given BO at the specified address. + * Returns 0 on success, -ERRNO if anything goes wrong. + */ +int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) +{ + unsigned long end = addr + radeon_bo_size(bo) - 1; + struct radeon_device *rdev = bo->rdev; + struct radeon_mn *rmn; + struct interval_tree_node *it; + + rmn = radeon_mn_get(rdev); + if (IS_ERR(rmn)) + return PTR_ERR(rmn); + + mutex_lock(&rmn->lock); + + it = interval_tree_iter_first(&rmn->objects, addr, end); + if (it) { + mutex_unlock(&rmn->lock); + return -EEXIST; + } + + bo->mn = rmn; + bo->mn_it.start = addr; + bo->mn_it.last = end; + interval_tree_insert(&bo->mn_it, &rmn->objects); + + mutex_unlock(&rmn->lock); + + return 0; +} + +/** + * radeon_mn_unregister - unregister a BO for notifier updates + * + * @bo: radeon buffer object + * + * Remove any registration of MMU notifier updates from the buffer object. + */ +void radeon_mn_unregister(struct radeon_bo *bo) +{ + struct radeon_device *rdev = bo->rdev; + struct radeon_mn *rmn; + + mutex_lock(&rdev->mn_lock); + rmn = bo->mn; + if (rmn == NULL) { + mutex_unlock(&rdev->mn_lock); + return; + } + + mutex_lock(&rmn->lock); + interval_tree_remove(&bo->mn_it, &rmn->objects); + bo->mn = NULL; + mutex_unlock(&rmn->lock); + mutex_unlock(&rdev->mn_lock); +} diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index c73c1e320585..287523807989 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -75,6 +75,7 @@ static void radeon_ttm_bo_destroy(struct ttm_buffer_object *tbo) bo = container_of(tbo, struct radeon_bo, tbo); radeon_update_memory_usage(bo, bo->tbo.mem.mem_type, -1); + radeon_mn_unregister(bo); mutex_lock(&bo->rdev->gem.mutex); list_del_init(&bo->list); diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 5dc61c2d4c73..c77495ffc44f 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -818,6 +818,7 @@ struct drm_radeon_gem_create { #define RADEON_GEM_USERPTR_READONLY (1 << 0) #define RADEON_GEM_USERPTR_ANONONLY (1 << 1) #define RADEON_GEM_USERPTR_VALIDATE (1 << 2) +#define RADEON_GEM_USERPTR_REGISTER (1 << 3) struct drm_radeon_gem_userptr { uint64_t addr; -- cgit v1.2.3 From 18c01ab30288d9d0a7d80b08b659531f37ed379d Mon Sep 17 00:00:00 2001 From: Rajesh Ghanekar Date: Fri, 1 Aug 2014 22:17:30 -0400 Subject: nfsd: allow turning off nfsv3 readdir_plus One of our customer's application only needs file names, not file attributes. With directories having 10K+ inodes (assuming buffer cache has directory blocks cached having file names, but inode cache is limited and hence need eviction of older cached inodes), older inodes are evicted periodically. So if they keep on doing readdir(2) from NSF client on multiple directories, some directory's files are periodically removed from inode cache and hence new readdir(2) on same directory requires disk access to bring back inodes again to inode cache. As READDIRPLUS request fetches attributes also, doing getattr on each file on server, it causes unnecessary disk accesses. If READDIRPLUS on NFS client is returned with -ENOTSUPP, NFS client uses READDIR request which just gets the names of the files in a directory, not attributes, hence avoiding disk accesses on server. There's already a corresponding client-side mount option, but an export option reduces the need for configuration across multiple clients. This flag affects NFSv3 only. If it turns out it's needed for NFSv4 as well then we may have to figure out how to extend the behavior to NFSv4, but it's not currently obvious how to do that. Signed-off-by: Rajesh Ghanekar Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 1 + fs/nfsd/nfs3proc.c | 8 ++++++++ include/uapi/linux/nfsd/export.h | 5 +++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 72ffd7cce3c3..30a739d896ff 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1145,6 +1145,7 @@ static struct flags { { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index fc51f7f6b36d..12f2aab4f614 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -466,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, resp->buflen = resp->count; resp->rqstp = rqstp; offset = argp->cookie; + + nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) + RETURN_STATUS(nfserr_notsupp); + nfserr = nfsd_readdir(rqstp, &resp->fh, &offset, &resp->common, diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index cf47c313794e..584b6ef3a5e8 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -28,7 +28,8 @@ #define NFSEXP_ALLSQUASH 0x0008 #define NFSEXP_ASYNC 0x0010 #define NFSEXP_GATHERED_WRITES 0x0020 -/* 40 80 100 currently unused */ +#define NFSEXP_NOREADDIRPLUS 0x0040 +/* 80 100 currently unused */ #define NFSEXP_NOHIDE 0x0200 #define NFSEXP_NOSUBTREECHECK 0x0400 #define NFSEXP_NOAUTHNLM 0x0800 /* Don't authenticate NLM requests - just trust */ @@ -47,7 +48,7 @@ */ #define NFSEXP_V4ROOT 0x10000 /* All flags that we claim to support. (Note we don't support NOACL.) */ -#define NFSEXP_ALLFLAGS 0x17E3F +#define NFSEXP_ALLFLAGS 0x1FE7F /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From e91ded8db57472c20b59b2242b100764cc152a10 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 4 Aug 2014 04:50:41 -0400 Subject: uapi: netfilter_arp: use __u8 instead of u_int8_t Similarly, the u_int8_t type is non-standard and not defined. Change it to use __u8 like the rest of the netfilter headers. Signed-off-by: Mike Frysinger Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_arp/arpt_mangle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_arp/arpt_mangle.h b/include/uapi/linux/netfilter_arp/arpt_mangle.h index 250f502902bb..8c2b16a1f5a0 100644 --- a/include/uapi/linux/netfilter_arp/arpt_mangle.h +++ b/include/uapi/linux/netfilter_arp/arpt_mangle.h @@ -13,7 +13,7 @@ struct arpt_mangle union { struct in_addr tgt_ip; } u_t; - u_int8_t flags; + __u8 flags; int target; }; -- cgit v1.2.3 From 0fc87864879c46afe145e20ec09c9dba2328e3be Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 28 May 2014 09:38:21 -0300 Subject: [media] v4l: Add test pattern colour component controls In many cases the test pattern has selectable values for each colour component. Implement controls for raw bayer components. Additional controls should be defined for colour components that are not covered by these controls. Signed-off-by: Sakari Ailus Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/controls.xml | 34 ++++++++++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls.c | 4 ++++ include/uapi/linux/v4l2-controls.h | 4 ++++ 3 files changed, 42 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml index 9f5ffd85560b..a7eb1bde8b92 100644 --- a/Documentation/DocBook/media/v4l/controls.xml +++ b/Documentation/DocBook/media/v4l/controls.xml @@ -4790,6 +4790,40 @@ interface and may change in the future. conversion. + + V4L2_CID_TEST_PATTERN_RED + integer + + + Test pattern red colour component. + + + + V4L2_CID_TEST_PATTERN_GREENR + integer + + + Test pattern green (next to red) + colour component. + + + + V4L2_CID_TEST_PATTERN_BLUE + integer + + + Test pattern blue colour component. + + + + V4L2_CID_TEST_PATTERN_GREENB + integer + + + Test pattern green (next to blue) + colour component. + + diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index f030d6a9e044..35d1f3d5045b 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -859,6 +859,10 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_VBLANK: return "Vertical Blanking"; case V4L2_CID_HBLANK: return "Horizontal Blanking"; case V4L2_CID_ANALOGUE_GAIN: return "Analogue Gain"; + case V4L2_CID_TEST_PATTERN_RED: return "Red Pixel Value"; + case V4L2_CID_TEST_PATTERN_GREENR: return "Green (Red) Pixel Value"; + case V4L2_CID_TEST_PATTERN_BLUE: return "Blue Pixel Value"; + case V4L2_CID_TEST_PATTERN_GREENB: return "Green (Blue) Pixel Value"; /* Image processing controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index e946e43fb8d5..8b930210a4b9 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -865,6 +865,10 @@ enum v4l2_jpeg_chroma_subsampling { #define V4L2_CID_VBLANK (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 1) #define V4L2_CID_HBLANK (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 2) #define V4L2_CID_ANALOGUE_GAIN (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 3) +#define V4L2_CID_TEST_PATTERN_RED (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 4) +#define V4L2_CID_TEST_PATTERN_GREENR (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 5) +#define V4L2_CID_TEST_PATTERN_BLUE (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 6) +#define V4L2_CID_TEST_PATTERN_GREENB (V4L2_CID_IMAGE_SOURCE_CLASS_BASE + 7) /* Image processing controls */ -- cgit v1.2.3 From a913d8742e275dd2d80726afac02311a0f49d161 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 26 May 2014 09:46:18 -0300 Subject: [media] smiapp: Add driver-specific test pattern menu item definitions Add numeric definitions for menu items used in the smiapp driver's test pattern menu. Signed-off-by: Sakari Ailus Acked-by: Laurent Pinchart Acked-by: Hans Verkuil Acked-by: Lad, Prabhakar Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/smiapp.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 include/uapi/linux/smiapp.h (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 24e9033f8b3f..4ec377d103c7 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -353,6 +353,7 @@ header-y += serio.h header-y += shm.h header-y += signal.h header-y += signalfd.h +header-y += smiapp.h header-y += snmp.h header-y += sock_diag.h header-y += socket.h diff --git a/include/uapi/linux/smiapp.h b/include/uapi/linux/smiapp.h new file mode 100644 index 000000000000..53938f4412ee --- /dev/null +++ b/include/uapi/linux/smiapp.h @@ -0,0 +1,29 @@ +/* + * include/uapi/linux/smiapp.h + * + * Generic driver for SMIA/SMIA++ compliant camera modules + * + * Copyright (C) 2014 Intel Corporation + * Contact: Sakari Ailus + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#ifndef __UAPI_LINUX_SMIAPP_H_ +#define __UAPI_LINUX_SMIAPP_H_ + +#define V4L2_SMIAPP_TEST_PATTERN_MODE_DISABLED 0 +#define V4L2_SMIAPP_TEST_PATTERN_MODE_SOLID_COLOUR 1 +#define V4L2_SMIAPP_TEST_PATTERN_MODE_COLOUR_BARS 2 +#define V4L2_SMIAPP_TEST_PATTERN_MODE_COLOUR_BARS_GREY 3 +#define V4L2_SMIAPP_TEST_PATTERN_MODE_PN9 4 + +#endif /* __UAPI_LINUX_SMIAPP_H_ */ -- cgit v1.2.3 From e2a093ff0dbfa4c5d99f25241cf33325e9691d91 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Wed, 6 Aug 2014 13:52:49 +0200 Subject: netfilter: nft_meta: add pkttype support Add pkttype support for ip, ipv6 and inet families of tables. This allows you to fetch the meta packet type based on the link layer information. The loopback traffic is a special case, the packet type is guessed from the network layer header. No special handling for bridge and arp since we're not going to see such traffic in the loopback interface. Joint work with Alvaro Neira Ayuso Signed-off-by: Alvaro Neira Ayuso Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 801bdd1e56e3..98144cdd8986 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -571,6 +571,7 @@ enum nft_exthdr_attributes { * @NFT_META_L4PROTO: layer 4 protocol number * @NFT_META_BRI_IIFNAME: packet input bridge interface name * @NFT_META_BRI_OIFNAME: packet output bridge interface name + * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback */ enum nft_meta_keys { NFT_META_LEN, @@ -592,6 +593,7 @@ enum nft_meta_keys { NFT_META_L4PROTO, NFT_META_BRI_IIFNAME, NFT_META_BRI_OIFNAME, + NFT_META_PKTTYPE, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 852b178c6ae7..4f2862fc12c2 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include #include /* for TCP_TIME_WAIT */ @@ -124,6 +127,30 @@ void nft_meta_get_eval(const struct nft_expr *expr, dest->data[0] = skb->secmark; break; #endif + case NFT_META_PKTTYPE: + if (skb->pkt_type != PACKET_LOOPBACK) { + dest->data[0] = skb->pkt_type; + break; + } + + switch (pkt->ops->pf) { + case NFPROTO_IPV4: + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + case NFPROTO_IPV6: + if (ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) + dest->data[0] = PACKET_MULTICAST; + else + dest->data[0] = PACKET_BROADCAST; + break; + default: + WARN_ON(1); + goto err; + } + break; default: WARN_ON(1); goto err; @@ -195,6 +222,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #ifdef CONFIG_NETWORK_SECMARK case NFT_META_SECMARK: #endif + case NFT_META_PKTTYPE: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From afc5be3079796b024823bad42dc5ebf716453575 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Sun, 24 Aug 2014 14:08:36 +0200 Subject: netfilter: nft_meta: Add cpu attribute support Add cpu support to meta expresion. This allows you to match packets with cpu number. Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 98144cdd8986..c9b6f00a3fb7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -572,6 +572,7 @@ enum nft_exthdr_attributes { * @NFT_META_BRI_IIFNAME: packet input bridge interface name * @NFT_META_BRI_OIFNAME: packet output bridge interface name * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback + * @NFT_META_CPU: cpu id through smp_processor_id() */ enum nft_meta_keys { NFT_META_LEN, @@ -594,6 +595,7 @@ enum nft_meta_keys { NFT_META_BRI_IIFNAME, NFT_META_BRI_OIFNAME, NFT_META_PKTTYPE, + NFT_META_CPU, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 4f2862fc12c2..843e099a962d 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include /* for TCP_TIME_WAIT */ @@ -151,6 +152,9 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; } break; + case NFT_META_CPU: + dest->data[0] = smp_processor_id(); + break; default: WARN_ON(1); goto err; @@ -223,6 +227,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_SECMARK: #endif case NFT_META_PKTTYPE: + case NFT_META_CPU: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From fa71f32b5de2be1644ee671ddbe211d79be7847f Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:21 +0200 Subject: HID: uhid: add ABI compatible UHID_GET_REPORT replacing UHID_FEATURE The old hdev->hid_get_raw_report() was broken by design. It was never clear what kind of HW request it should trigger. Benjamin fixed that with the core HID cleanup, though we never really adjusted uhid. Unfortunately, our old UHID_FEATURE command was modelled around the broken hid_get_raw_report(). We converted it silently to the new GET_REPORT and nothing broke. Make this explicit by renaming UHID_FEATURE to UHID_GET_REPORT and UHID_FEATURE_ANSWER to UHID_GET_REPORT_REPLY. Note that this is 100% ABI compatible to UHID_FEATURE. This is just a rename. But we have to keep the old definitions around to not break API. >From now on, UHID_GET_REPORT must trigger a GET_REPORT request on the user-space hardware layer. All the ambiguity due to the weird "feature" name should be gone now. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 28 ++++++++++++++-------------- include/uapi/linux/uhid.h | 23 +++++++++++++++++++++-- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 2d2025a027fe..8f5e46b1bb46 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -124,8 +124,8 @@ static int uhid_hid_parse(struct hid_device *hid) return hid_parse_report(hid, uhid->rd_data, uhid->rd_size); } -static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, - __u8 *buf, size_t count, unsigned char rtype) +static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, + __u8 *buf, size_t count, unsigned char rtype) { struct uhid_device *uhid = hid->driver_data; __u8 report_type; @@ -133,7 +133,7 @@ static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, unsigned long flags; int ret; size_t uninitialized_var(len); - struct uhid_feature_answer_req *req; + struct uhid_get_report_reply_req *req; if (!uhid->running) return -EIO; @@ -163,10 +163,10 @@ static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, } spin_lock_irqsave(&uhid->qlock, flags); - ev->type = UHID_FEATURE; - ev->u.feature.id = ++uhid->report_id; - ev->u.feature.rnum = rnum; - ev->u.feature.rtype = report_type; + ev->type = UHID_GET_REPORT; + ev->u.get_report.id = ++uhid->report_id; + ev->u.get_report.rnum = rnum; + ev->u.get_report.rtype = report_type; uhid->report_running = true; uhid_queue(uhid, ev); @@ -182,7 +182,7 @@ static int uhid_hid_get_raw(struct hid_device *hid, unsigned char rnum, ret = -ERESTARTSYS; } else { spin_lock_irqsave(&uhid->qlock, flags); - req = &uhid->report_buf.u.feature_answer; + req = &uhid->report_buf.u.get_report_reply; if (req->err) { ret = -EIO; @@ -253,7 +253,7 @@ static int uhid_raw_request(struct hid_device *hid, unsigned char reportnum, { switch (reqtype) { case HID_REQ_GET_REPORT: - return uhid_hid_get_raw(hid, reportnum, buf, len, rtype); + return uhid_hid_get_report(hid, reportnum, buf, len, rtype); case HID_REQ_SET_REPORT: /* TODO: implement proper SET_REPORT functionality */ return -ENOSYS; @@ -487,8 +487,8 @@ static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) return 0; } -static int uhid_dev_feature_answer(struct uhid_device *uhid, - struct uhid_event *ev) +static int uhid_dev_get_report_reply(struct uhid_device *uhid, + struct uhid_event *ev) { unsigned long flags; @@ -498,7 +498,7 @@ static int uhid_dev_feature_answer(struct uhid_device *uhid, spin_lock_irqsave(&uhid->qlock, flags); /* id for old report; drop it silently */ - if (uhid->report_id != ev->u.feature_answer.id) + if (uhid->report_id != ev->u.get_report_reply.id) goto unlock; if (!uhid->report_running) goto unlock; @@ -634,8 +634,8 @@ static ssize_t uhid_char_write(struct file *file, const char __user *buffer, case UHID_INPUT2: ret = uhid_dev_input2(uhid, &uhid->input_buf); break; - case UHID_FEATURE_ANSWER: - ret = uhid_dev_feature_answer(uhid, &uhid->input_buf); + case UHID_GET_REPORT_REPLY: + ret = uhid_dev_get_report_reply(uhid, &uhid->input_buf); break; default: ret = -EOPNOTSUPP; diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index 1e3b09c191cd..0a08f2bbe642 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -33,8 +33,10 @@ enum uhid_event_type { UHID_OUTPUT, UHID_OUTPUT_EV, /* obsolete! */ UHID_INPUT, - UHID_FEATURE, - UHID_FEATURE_ANSWER, + UHID_FEATURE, /* obsolete! use UHID_GET_REPORT */ + UHID_GET_REPORT = UHID_FEATURE, + UHID_FEATURE_ANSWER, /* obsolete! use UHID_GET_REPORT_REPLY */ + UHID_GET_REPORT_REPLY = UHID_FEATURE_ANSWER, UHID_CREATE2, UHID_INPUT2, }; @@ -98,12 +100,20 @@ struct uhid_output_ev_req { __s32 value; } __attribute__((__packed__)); +/* Obsolete! Kernel uses ABI compatible UHID_GET_REPORT. */ struct uhid_feature_req { __u32 id; __u8 rnum; __u8 rtype; } __attribute__((__packed__)); +struct uhid_get_report_req { + __u32 id; + __u8 rnum; + __u8 rtype; +} __attribute__((__packed__)); + +/* Obsolete! Use ABI compatible UHID_GET_REPORT_REPLY. */ struct uhid_feature_answer_req { __u32 id; __u16 err; @@ -111,6 +121,13 @@ struct uhid_feature_answer_req { __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); +struct uhid_get_report_reply_req { + __u32 id; + __u16 err; + __u16 size; + __u8 data[UHID_DATA_MAX]; +} __attribute__((__packed__)); + struct uhid_event { __u32 type; @@ -120,7 +137,9 @@ struct uhid_event { struct uhid_output_req output; struct uhid_output_ev_req output_ev; struct uhid_feature_req feature; + struct uhid_get_report_req get_report; struct uhid_feature_answer_req feature_answer; + struct uhid_get_report_reply_req get_report_reply; struct uhid_create2_req create2; struct uhid_input2_req input2; } u; -- cgit v1.2.3 From 50598e7055d0d8610732e7eb2c84cbc3bc7db294 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:22 +0200 Subject: HID: uhid: keep legacy definitions at the bottom of uhid.h Instead of inlining the legacy definitions into the main part of uhid.h, keep them at the bottom now. This way, the API is much easier to read and legacy requests can be looked up at a separate place. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- include/uapi/linux/uhid.h | 101 ++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 39 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index 0a08f2bbe642..116536eeae62 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -24,37 +24,21 @@ #include enum uhid_event_type { - UHID_CREATE, + __UHID_LEGACY_CREATE, UHID_DESTROY, UHID_START, UHID_STOP, UHID_OPEN, UHID_CLOSE, UHID_OUTPUT, - UHID_OUTPUT_EV, /* obsolete! */ - UHID_INPUT, - UHID_FEATURE, /* obsolete! use UHID_GET_REPORT */ - UHID_GET_REPORT = UHID_FEATURE, - UHID_FEATURE_ANSWER, /* obsolete! use UHID_GET_REPORT_REPLY */ - UHID_GET_REPORT_REPLY = UHID_FEATURE_ANSWER, + __UHID_LEGACY_OUTPUT_EV, + __UHID_LEGACY_INPUT, + UHID_GET_REPORT, + UHID_GET_REPORT_REPLY, UHID_CREATE2, UHID_INPUT2, }; -struct uhid_create_req { - __u8 name[128]; - __u8 phys[64]; - __u8 uniq[64]; - __u8 __user *rd_data; - __u16 rd_size; - - __u16 bus; - __u32 vendor; - __u32 product; - __u32 version; - __u32 country; -} __attribute__((__packed__)); - struct uhid_create2_req { __u8 name[128]; __u8 phys[64]; @@ -76,24 +60,67 @@ enum uhid_report_type { UHID_INPUT_REPORT, }; -struct uhid_input_req { +struct uhid_input2_req { + __u16 size; + __u8 data[UHID_DATA_MAX]; +} __attribute__((__packed__)); + +struct uhid_output_req { __u8 data[UHID_DATA_MAX]; __u16 size; + __u8 rtype; } __attribute__((__packed__)); -struct uhid_input2_req { +struct uhid_get_report_req { + __u32 id; + __u8 rnum; + __u8 rtype; +} __attribute__((__packed__)); + +struct uhid_get_report_reply_req { + __u32 id; + __u16 err; __u16 size; __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); -struct uhid_output_req { +/* + * Compat Layer + * All these commands and requests are obsolete. You should avoid using them in + * new code. We support them for backwards-compatibility, but you might not get + * access to new feature in case you use them. + */ + +enum uhid_legacy_event_type { + UHID_CREATE = __UHID_LEGACY_CREATE, + UHID_OUTPUT_EV = __UHID_LEGACY_OUTPUT_EV, + UHID_INPUT = __UHID_LEGACY_INPUT, + UHID_FEATURE = UHID_GET_REPORT, + UHID_FEATURE_ANSWER = UHID_GET_REPORT_REPLY, +}; + +/* Obsolete! Use UHID_CREATE2. */ +struct uhid_create_req { + __u8 name[128]; + __u8 phys[64]; + __u8 uniq[64]; + __u8 __user *rd_data; + __u16 rd_size; + + __u16 bus; + __u32 vendor; + __u32 product; + __u32 version; + __u32 country; +} __attribute__((__packed__)); + +/* Obsolete! Use UHID_INPUT2. */ +struct uhid_input_req { __u8 data[UHID_DATA_MAX]; __u16 size; - __u8 rtype; } __attribute__((__packed__)); -/* Obsolete! Newer kernels will no longer send these events but instead convert - * it into raw output reports via UHID_OUTPUT. */ +/* Obsolete! Kernel uses UHID_OUTPUT exclusively now. */ struct uhid_output_ev_req { __u16 type; __u16 code; @@ -107,12 +134,6 @@ struct uhid_feature_req { __u8 rtype; } __attribute__((__packed__)); -struct uhid_get_report_req { - __u32 id; - __u8 rnum; - __u8 rtype; -} __attribute__((__packed__)); - /* Obsolete! Use ABI compatible UHID_GET_REPORT_REPLY. */ struct uhid_feature_answer_req { __u32 id; @@ -121,12 +142,14 @@ struct uhid_feature_answer_req { __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); -struct uhid_get_report_reply_req { - __u32 id; - __u16 err; - __u16 size; - __u8 data[UHID_DATA_MAX]; -} __attribute__((__packed__)); +/* + * UHID Events + * All UHID events from and to the kernel are encoded as "struct uhid_event". + * The "type" field contains a UHID_* type identifier. All payload depends on + * that type and can be accessed via ev->u.XYZ accordingly. + * If user-space writes short events, they're extended with 0s by the kernel. If + * the kernel writes short events, user-space shall extend them with 0s. + */ struct uhid_event { __u32 type; -- cgit v1.2.3 From 11c221553080408b203a00b91ad5f647dfb218d1 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:24 +0200 Subject: HID: uhid: implement SET_REPORT We so far lacked support for hid_hw_raw_request(..., HID_REQ_SET_REPORT); Add support for it and simply forward the request to user-space. Note that SET_REPORT is synchronous, just like GET_REPORT, even though it does not provide any data back besides an error code. If a transport layer does SET_REPORT asynchronously, they can just ACK it immediately by writing an uhid_set_report_reply to uhid. This patch re-uses the synchronous uhid-report infrastructure to query user-space. Note that this means you cannot run SET_REPORT and GET_REPORT in parallel. However, that has always been a restriction of HID and due to its blocking nature, this is just fine. Maybe some future transport layer supports parallel requests (very unlikely), however, until then lets not over-complicate things and avoid request-lookup-tables. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 206 +++++++++++++++++++++++++++++++--------------- include/uapi/linux/uhid.h | 17 ++++ 2 files changed, 155 insertions(+), 68 deletions(-) (limited to 'include/uapi') diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 8bf613e3783d..19511481a7d3 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -49,6 +49,7 @@ struct uhid_device { wait_queue_head_t report_wait; bool report_running; u32 report_id; + u32 report_type; struct uhid_event report_buf; }; @@ -124,95 +125,166 @@ static int uhid_hid_parse(struct hid_device *hid) return hid_parse_report(hid, uhid->rd_data, uhid->rd_size); } +/* must be called with report_lock held */ +static int __uhid_report_queue_and_wait(struct uhid_device *uhid, + struct uhid_event *ev, + __u32 *report_id) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&uhid->qlock, flags); + *report_id = ++uhid->report_id; + uhid->report_type = ev->type; + uhid->report_running = true; + uhid_queue(uhid, ev); + spin_unlock_irqrestore(&uhid->qlock, flags); + + ret = wait_event_interruptible_timeout(uhid->report_wait, + !uhid->report_running || !uhid->running, + 5 * HZ); + if (!ret || !uhid->running || uhid->report_running) + ret = -EIO; + else if (ret < 0) + ret = -ERESTARTSYS; + else + ret = 0; + + uhid->report_running = false; + + return ret; +} + +static void uhid_report_wake_up(struct uhid_device *uhid, u32 id, + const struct uhid_event *ev) +{ + unsigned long flags; + + spin_lock_irqsave(&uhid->qlock, flags); + + /* id for old report; drop it silently */ + if (uhid->report_type != ev->type || uhid->report_id != id) + goto unlock; + if (!uhid->report_running) + goto unlock; + + memcpy(&uhid->report_buf, ev, sizeof(*ev)); + uhid->report_running = false; + wake_up_interruptible(&uhid->report_wait); + +unlock: + spin_unlock_irqrestore(&uhid->qlock, flags); +} + static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, - __u8 *buf, size_t count, unsigned char rtype) + u8 *buf, size_t count, u8 rtype) { struct uhid_device *uhid = hid->driver_data; - __u8 report_type; + struct uhid_get_report_reply_req *req; struct uhid_event *ev; - unsigned long flags; int ret; - size_t uninitialized_var(len); - struct uhid_get_report_reply_req *req; if (!uhid->running) return -EIO; - switch (rtype) { - case HID_FEATURE_REPORT: - report_type = UHID_FEATURE_REPORT; - break; - case HID_OUTPUT_REPORT: - report_type = UHID_OUTPUT_REPORT; - break; - case HID_INPUT_REPORT: - report_type = UHID_INPUT_REPORT; - break; - default: - return -EINVAL; - } + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->type = UHID_GET_REPORT; + ev->u.get_report.rnum = rnum; + ev->u.get_report.rtype = rtype; ret = mutex_lock_interruptible(&uhid->report_lock); - if (ret) + if (ret) { + kfree(ev); return ret; + } - ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) { - ret = -ENOMEM; + /* this _always_ takes ownership of @ev */ + ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.get_report.id); + if (ret) goto unlock; + + req = &uhid->report_buf.u.get_report_reply; + if (req->err) { + ret = -EIO; + } else { + ret = min3(count, (size_t)req->size, (size_t)UHID_DATA_MAX); + memcpy(buf, req->data, ret); } - spin_lock_irqsave(&uhid->qlock, flags); - ev->type = UHID_GET_REPORT; - ev->u.get_report.id = ++uhid->report_id; - ev->u.get_report.rnum = rnum; - ev->u.get_report.rtype = report_type; +unlock: + mutex_unlock(&uhid->report_lock); + return ret; +} - uhid->report_running = true; - uhid_queue(uhid, ev); - spin_unlock_irqrestore(&uhid->qlock, flags); +static int uhid_hid_set_report(struct hid_device *hid, unsigned char rnum, + const u8 *buf, size_t count, u8 rtype) +{ + struct uhid_device *uhid = hid->driver_data; + struct uhid_event *ev; + int ret; - ret = wait_event_interruptible_timeout(uhid->report_wait, - !uhid->report_running || !uhid->running, - 5 * HZ); + if (!uhid->running || count > UHID_DATA_MAX) + return -EIO; - if (!ret || !uhid->running) { - ret = -EIO; - } else if (ret < 0) { - ret = -ERESTARTSYS; - } else { - spin_lock_irqsave(&uhid->qlock, flags); - req = &uhid->report_buf.u.get_report_reply; + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; - if (req->err) { - ret = -EIO; - } else { - ret = 0; - len = min(count, - min_t(size_t, req->size, UHID_DATA_MAX)); - memcpy(buf, req->data, len); - } + ev->type = UHID_SET_REPORT; + ev->u.set_report.rnum = rnum; + ev->u.set_report.rtype = rtype; + ev->u.set_report.size = count; + memcpy(ev->u.set_report.data, buf, count); - spin_unlock_irqrestore(&uhid->qlock, flags); + ret = mutex_lock_interruptible(&uhid->report_lock); + if (ret) { + kfree(ev); + return ret; } - uhid->report_running = false; + /* this _always_ takes ownership of @ev */ + ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.set_report.id); + if (ret) + goto unlock; + + if (uhid->report_buf.u.set_report_reply.err) + ret = -EIO; + else + ret = count; unlock: mutex_unlock(&uhid->report_lock); - return ret ? ret : len; + return ret; } static int uhid_hid_raw_request(struct hid_device *hid, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype) { + u8 u_rtype; + + switch (rtype) { + case HID_FEATURE_REPORT: + u_rtype = UHID_FEATURE_REPORT; + break; + case HID_OUTPUT_REPORT: + u_rtype = UHID_OUTPUT_REPORT; + break; + case HID_INPUT_REPORT: + u_rtype = UHID_INPUT_REPORT; + break; + default: + return -EINVAL; + } + switch (reqtype) { case HID_REQ_GET_REPORT: - return uhid_hid_get_report(hid, reportnum, buf, len, rtype); + return uhid_hid_get_report(hid, reportnum, buf, len, u_rtype); case HID_REQ_SET_REPORT: - /* TODO: implement proper SET_REPORT functionality */ - return -ENOSYS; + return uhid_hid_set_report(hid, reportnum, buf, len, u_rtype); default: return -EIO; } @@ -490,25 +562,20 @@ static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) static int uhid_dev_get_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { - unsigned long flags; - if (!uhid->running) return -EINVAL; - spin_lock_irqsave(&uhid->qlock, flags); - - /* id for old report; drop it silently */ - if (uhid->report_id != ev->u.get_report_reply.id) - goto unlock; - if (!uhid->report_running) - goto unlock; + uhid_report_wake_up(uhid, ev->u.get_report_reply.id, ev); + return 0; +} - memcpy(&uhid->report_buf, ev, sizeof(*ev)); - uhid->report_running = false; - wake_up_interruptible(&uhid->report_wait); +static int uhid_dev_set_report_reply(struct uhid_device *uhid, + struct uhid_event *ev) +{ + if (!uhid->running) + return -EINVAL; -unlock: - spin_unlock_irqrestore(&uhid->qlock, flags); + uhid_report_wake_up(uhid, ev->u.set_report_reply.id, ev); return 0; } @@ -637,6 +704,9 @@ static ssize_t uhid_char_write(struct file *file, const char __user *buffer, case UHID_GET_REPORT_REPLY: ret = uhid_dev_get_report_reply(uhid, &uhid->input_buf); break; + case UHID_SET_REPORT_REPLY: + ret = uhid_dev_set_report_reply(uhid, &uhid->input_buf); + break; default: ret = -EOPNOTSUPP; } diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index 116536eeae62..62aac0e4edf3 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -37,6 +37,8 @@ enum uhid_event_type { UHID_GET_REPORT_REPLY, UHID_CREATE2, UHID_INPUT2, + UHID_SET_REPORT, + UHID_SET_REPORT_REPLY, }; struct uhid_create2_req { @@ -84,6 +86,19 @@ struct uhid_get_report_reply_req { __u8 data[UHID_DATA_MAX]; } __attribute__((__packed__)); +struct uhid_set_report_req { + __u32 id; + __u8 rnum; + __u8 rtype; + __u16 size; + __u8 data[UHID_DATA_MAX]; +} __attribute__((__packed__)); + +struct uhid_set_report_reply_req { + __u32 id; + __u16 err; +} __attribute__((__packed__)); + /* * Compat Layer * All these commands and requests are obsolete. You should avoid using them in @@ -165,6 +180,8 @@ struct uhid_event { struct uhid_get_report_reply_req get_report_reply; struct uhid_create2_req create2; struct uhid_input2_req input2; + struct uhid_set_report_req set_report; + struct uhid_set_report_reply_req set_report_reply; } u; } __attribute__((__packed__)); -- cgit v1.2.3 From c2b2f16c5c62583d4f8904e44c4b30c94a01eaf1 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Tue, 29 Jul 2014 17:14:25 +0200 Subject: HID: uhid: report to user-space whether reports are numbered This makes UHID_START include a "dev_flags" field that describes details of the hid-device in the kernel. The first flags we introduce describe whether a given report-type uses numbered reports. This is useful for transport layers that force report-numbers and therefore might have to prefix kernel-provided HID-messages with the report-number. Currently, only HoG needs this and the spec only talks about "global report numbers". That is, it's a global boolean not a per-type boolean. However, given the quirks we already have in kernel-space, a per-type value seems much more appropriate. Signed-off-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 21 ++++++++++++++++++++- include/uapi/linux/uhid.h | 11 +++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 19511481a7d3..f6ec5eaf6b89 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -92,8 +92,27 @@ static int uhid_queue_event(struct uhid_device *uhid, __u32 event) static int uhid_hid_start(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; + struct uhid_event *ev; + unsigned long flags; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->type = UHID_START; - return uhid_queue_event(uhid, UHID_START); + if (hid->report_enum[HID_FEATURE_REPORT].numbered) + ev->u.start.dev_flags |= UHID_DEV_NUMBERED_FEATURE_REPORTS; + if (hid->report_enum[HID_OUTPUT_REPORT].numbered) + ev->u.start.dev_flags |= UHID_DEV_NUMBERED_OUTPUT_REPORTS; + if (hid->report_enum[HID_INPUT_REPORT].numbered) + ev->u.start.dev_flags |= UHID_DEV_NUMBERED_INPUT_REPORTS; + + spin_lock_irqsave(&uhid->qlock, flags); + uhid_queue(uhid, ev); + spin_unlock_irqrestore(&uhid->qlock, flags); + + return 0; } static void uhid_hid_stop(struct hid_device *hid) diff --git a/include/uapi/linux/uhid.h b/include/uapi/linux/uhid.h index 62aac0e4edf3..aaa86d6bd1dd 100644 --- a/include/uapi/linux/uhid.h +++ b/include/uapi/linux/uhid.h @@ -54,6 +54,16 @@ struct uhid_create2_req { __u8 rd_data[HID_MAX_DESCRIPTOR_SIZE]; } __attribute__((__packed__)); +enum uhid_dev_flag { + UHID_DEV_NUMBERED_FEATURE_REPORTS = (1ULL << 0), + UHID_DEV_NUMBERED_OUTPUT_REPORTS = (1ULL << 1), + UHID_DEV_NUMBERED_INPUT_REPORTS = (1ULL << 2), +}; + +struct uhid_start_req { + __u64 dev_flags; +}; + #define UHID_DATA_MAX 4096 enum uhid_report_type { @@ -182,6 +192,7 @@ struct uhid_event { struct uhid_input2_req input2; struct uhid_set_report_req set_report; struct uhid_set_report_reply_req set_report_reply; + struct uhid_start_req start; } u; } __attribute__((__packed__)); -- cgit v1.2.3 From 0e227084aee36b3ba27b4fc9cd9e425be6ce2ab8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 12 Aug 2014 20:34:30 +0200 Subject: cfg80211: clarify BSS probe response vs. beacon data There are a few possible cases of where BSS data came from: 1) only a beacon has been received 2) only a probe response has been received 3) the driver didn't report what it received (this happens when using cfg80211_inform_bss[_width]()) 4) both probe response and beacon data has been received Unfortunately, in the userspace API, a few things weren't there: a) there was no way to differentiate cases 1) and 4) above without comparing the data of the IEs b) the TSF was always from the last frame, instead of being exposed for beacon/probe response separately like IEs Fix this by i) exporting a new flag attribute that indicates whether or not probe response data has been received - this addresses (a) ii) exporting a BEACON_TSF attribute that holds the beacon's TSF if a beacon has been received iii) not exporting the beacon attributes in case (3) above as that would just lead userspace into thinking the data actually came from a beacon when that isn't clear To implement this, track inside the IEs struct whether or not it (definitely) came from a beacon. Reported-by: William Seto Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 16 ++++++++++++++-- net/wireless/nl80211.c | 16 ++++++++++++---- net/wireless/scan.c | 6 ++++-- 4 files changed, 32 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7b8dac3efe8f..77b85a89abca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1503,12 +1503,14 @@ enum cfg80211_signal_type { * @tsf: TSF contained in the frame that carried these IEs * @rcu_head: internal use, for freeing * @len: length of the IEs + * @from_beacon: these IEs are known to come from a beacon * @data: IE data */ struct cfg80211_bss_ies { u64 tsf; struct rcu_head rcu_head; int len; + bool from_beacon; u8 data[]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f1db15b9c041..d097568da690 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3055,14 +3055,20 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_BSSID: BSSID of the BSS (6 octets) * @NL80211_BSS_FREQUENCY: frequency in MHz (u32) * @NL80211_BSS_TSF: TSF of the received probe response/beacon (u64) + * (if @NL80211_BSS_PRESP_DATA is present then this is known to be + * from a probe response, otherwise it may be from the same beacon + * that the NL80211_BSS_BEACON_TSF will be from) * @NL80211_BSS_BEACON_INTERVAL: beacon interval of the (I)BSS (u16) * @NL80211_BSS_CAPABILITY: capability field (CPU order, u16) * @NL80211_BSS_INFORMATION_ELEMENTS: binary attribute containing the * raw information elements from the probe response/beacon (bin); - * if the %NL80211_BSS_BEACON_IES attribute is present, the IEs here are - * from a Probe Response frame; otherwise they are from a Beacon frame. + * if the %NL80211_BSS_BEACON_IES attribute is present and the data is + * different then the IEs here are from a Probe Response frame; otherwise + * they are from a Beacon frame. * However, if the driver does not indicate the source of the IEs, these * IEs may be from either frame subtype. + * If present, the @NL80211_BSS_PRESP_DATA attribute indicates that the + * data here is known to be from a probe response, without any heuristics. * @NL80211_BSS_SIGNAL_MBM: signal strength of probe response/beacon * in mBm (100 * dBm) (s32) * @NL80211_BSS_SIGNAL_UNSPEC: signal strength of the probe response/beacon @@ -3074,6 +3080,10 @@ enum nl80211_bss_scan_width { * yet been received * @NL80211_BSS_CHAN_WIDTH: channel width of the control channel * (u32, enum nl80211_bss_scan_width) + * @NL80211_BSS_BEACON_TSF: TSF of the last received beacon (u64) + * (not present if no beacon frame has been received yet) + * @NL80211_BSS_PRESP_DATA: the data in @NL80211_BSS_INFORMATION_ELEMENTS and + * @NL80211_BSS_TSF is known to be from a probe response (flag attribute) * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -3091,6 +3101,8 @@ enum nl80211_bss { NL80211_BSS_SEEN_MS_AGO, NL80211_BSS_BEACON_IES, NL80211_BSS_CHAN_WIDTH, + NL80211_BSS_BEACON_TSF, + NL80211_BSS_PRESP_DATA, /* keep last */ __NL80211_BSS_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index df7b1332a1ec..3011401f52c0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6033,7 +6033,6 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, const struct cfg80211_bss_ies *ies; void *hdr; struct nlattr *bss; - bool tsf = false; ASSERT_WDEV_LOCK(wdev); @@ -6060,18 +6059,27 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; rcu_read_lock(); + /* indicate whether we have probe response data or not */ + if (rcu_access_pointer(res->proberesp_ies) && + nla_put_flag(msg, NL80211_BSS_PRESP_DATA)) + goto fail_unlock_rcu; + + /* this pointer prefers to be pointed to probe response data + * but is always valid + */ ies = rcu_dereference(res->ies); if (ies) { if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) goto fail_unlock_rcu; - tsf = true; if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, ies->len, ies->data)) goto fail_unlock_rcu; } + + /* and this pointer is always (unless driver didn't know) beacon data */ ies = rcu_dereference(res->beacon_ies); - if (ies) { - if (!tsf && nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + if (ies && ies->from_beacon) { + if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf)) goto fail_unlock_rcu; if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, ies->len, ies->data)) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 0798c62e6085..ad1a1a2808d3 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -918,11 +918,12 @@ cfg80211_inform_bss_width(struct wiphy *wiphy, * override the IEs pointer should we have received an earlier * indication of Probe Response data. */ - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = tsf; + ies->from_beacon = false; memcpy(ies->data, ie, ielen); rcu_assign_pointer(tmp.pub.beacon_ies, ies); @@ -982,11 +983,12 @@ cfg80211_inform_bss_width_frame(struct wiphy *wiphy, if (!channel) return NULL; - ies = kmalloc(sizeof(*ies) + ielen, gfp); + ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); + ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control); memcpy(ies->data, mgmt->u.probe_resp.variable, ielen); if (ieee80211_is_probe_resp(mgmt->frame_control)) -- cgit v1.2.3 From f111f780ae1abf4cdc464f24293be90c010a04f6 Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Wed, 20 Aug 2014 22:03:18 +0400 Subject: netfilter: nfnetlink_acct: add filter support to nfacct counter list/reset You can use this to skip accounting objects when listing/resetting via NFNL_MSG_ACCT_GET/NFNL_MSG_ACCT_GET_CTRZERO messages with the NLM_F_DUMP netlink flag. The filtering covers the following cases: 1. No filter specified. In this case, the client will get old behaviour, 2. List/reset counter object only: In this case, you have to use NFACCT_F_QUOTA as mask and value 0. 3. List/reset quota objects only: You have to use NFACCT_F_QUOTA_PKTS as mask and value - the same, for byte based quota mask should be NFACCT_F_QUOTA_BYTES and value - the same. If you want to obtain the object with any quota type (ie. NFACCT_F_QUOTA_PKTS|NFACCT_F_QUOTA_BYTES), you need to perform two dump requests, one to obtain NFACCT_F_QUOTA_PKTS objects and another for NFACCT_F_QUOTA_BYTES. Signed-off-by: Alexey Perevalov Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_acct.h | 8 ++++ net/netfilter/nfnetlink_acct.c | 54 +++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nfnetlink_acct.h b/include/uapi/linux/netfilter/nfnetlink_acct.h index 51404ec19022..f3e34dbbf966 100644 --- a/include/uapi/linux/netfilter/nfnetlink_acct.h +++ b/include/uapi/linux/netfilter/nfnetlink_acct.h @@ -28,9 +28,17 @@ enum nfnl_acct_type { NFACCT_USE, NFACCT_FLAGS, NFACCT_QUOTA, + NFACCT_FILTER, __NFACCT_MAX }; #define NFACCT_MAX (__NFACCT_MAX - 1) +enum nfnl_attr_filter_type { + NFACCT_FILTER_UNSPEC, + NFACCT_FILTER_MASK, + NFACCT_FILTER_VALUE, + __NFACCT_FILTER_MAX +}; +#define NFACCT_FILTER_MAX (__NFACCT_FILTER_MAX - 1) #endif /* _UAPI_NFNL_ACCT_H_ */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 3ea0eacbd970..c18af2f63eef 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -40,6 +40,11 @@ struct nf_acct { char data[0]; }; +struct nfacct_filter { + u32 value; + u32 mask; +}; + #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ @@ -181,6 +186,7 @@ static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nf_acct *cur, *last; + const struct nfacct_filter *filter = cb->data; if (cb->args[2]) return 0; @@ -197,6 +203,10 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) last = NULL; } + + if (filter && (cur->flags & filter->mask) != filter->value) + continue; + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), @@ -211,6 +221,38 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } +static int nfnl_acct_done(struct netlink_callback *cb) +{ + kfree(cb->data); + return 0; +} + +static const struct nla_policy filter_policy[NFACCT_FILTER_MAX + 1] = { + [NFACCT_FILTER_MASK] = { .type = NLA_U32 }, + [NFACCT_FILTER_VALUE] = { .type = NLA_U32 }, +}; + +static struct nfacct_filter * +nfacct_filter_alloc(const struct nlattr * const attr) +{ + struct nfacct_filter *filter; + struct nlattr *tb[NFACCT_FILTER_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy); + if (err < 0) + return ERR_PTR(err); + + filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL); + if (!filter) + return ERR_PTR(-ENOMEM); + + filter->mask = ntohl(nla_get_be32(tb[NFACCT_FILTER_MASK])); + filter->value = ntohl(nla_get_be32(tb[NFACCT_FILTER_VALUE])); + + return filter; +} + static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) @@ -222,7 +264,18 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, + .done = nfnl_acct_done, }; + + if (tb[NFACCT_FILTER]) { + struct nfacct_filter *filter; + + filter = nfacct_filter_alloc(tb[NFACCT_FILTER]); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + c.data = filter; + } return netlink_dump_start(nfnl, skb, nlh, &c); } @@ -314,6 +367,7 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { [NFACCT_PKTS] = { .type = NLA_U64 }, [NFACCT_FLAGS] = { .type = NLA_U32 }, [NFACCT_QUOTA] = { .type = NLA_U64 }, + [NFACCT_FILTER] = {.type = NLA_NESTED }, }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { -- cgit v1.2.3 From 3e8a72d1dae374cf6fc1dba97cec663585845ff9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 27 Aug 2014 17:04:46 -0700 Subject: net: dsa: reduce number of protocol hooks DSA is currently registering one packet_type function per EtherType it needs to intercept in the receive path of a DSA-enabled Ethernet device. Right now we have three of them: trailer, DSA and eDSA, and there might be more in the future, this will not scale to the addition of new protocols. This patch proceeds with adding a new layer of abstraction and two new functions: dsa_switch_rcv() which will dispatch into the tag-protocol specific receive function implemented by net/dsa/tag_*.c dsa_slave_xmit() which will dispatch into the tag-protocol specific transmit function implemented by net/dsa/tag_*.c When we do create the per-port slave network devices, we iterate over the switch protocol to assign the DSA-specific receive and transmit operations. A new fake ethertype value is used: ETH_P_XDSA to illustrate the fact that this is no longer going to look like ETH_P_DSA or ETH_P_TRAILER like it used to be. This allows us to greatly simplify the check in eth_type_trans() and always override the skb->protocol with ETH_P_XDSA for Ethernet switches tagged protocol, while also reducing the number repetitive slave netdevice_ops assignments. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 28 ++++++++++++--------------- include/net/dsa.h | 20 +++---------------- include/uapi/linux/if_ether.h | 1 + net/dsa/dsa.c | 39 ++++++++++++++++++++----------------- net/dsa/dsa_priv.h | 9 +++------ net/dsa/slave.c | 45 ++++++++++++++----------------------------- net/dsa/tag_dsa.c | 8 ++++---- net/dsa/tag_edsa.c | 8 ++++---- net/dsa/tag_trailer.c | 8 ++++---- net/ethernet/eth.c | 7 ++----- 10 files changed, 68 insertions(+), 105 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 039b23786c22..1875dc71422a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1781,24 +1781,13 @@ void dev_net_set(struct net_device *dev, struct net *net) #endif } -static inline bool netdev_uses_dsa_tags(struct net_device *dev) +static inline bool netdev_uses_dsa(struct net_device *dev) { -#ifdef CONFIG_NET_DSA_TAG_DSA - if (dev->dsa_ptr != NULL) - return dsa_uses_dsa_tags(dev->dsa_ptr); -#endif - - return 0; -} - -static inline bool netdev_uses_trailer_tags(struct net_device *dev) -{ -#ifdef CONFIG_NET_DSA_TAG_TRAILER - if (dev->dsa_ptr != NULL) - return dsa_uses_trailer_tags(dev->dsa_ptr); +#ifdef CONFIG_NET_DSA + return dev->dsa_ptr != NULL; +#else + return false; #endif - - return 0; } /** @@ -1933,6 +1922,13 @@ struct udp_offload { struct offload_callbacks callbacks; }; +struct dsa_device_ops { + netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + int (*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev); +}; + + /* often modified stats are per cpu, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64 rx_packets; diff --git a/include/net/dsa.h b/include/net/dsa.h index 6efce384451e..6e26f1e4d8ce 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -59,6 +59,8 @@ struct dsa_platform_data { struct dsa_chip_data *chip; }; +struct dsa_device_ops; + struct dsa_switch_tree { /* * Configuration data for the platform device that owns @@ -71,6 +73,7 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; + const struct dsa_device_ops *ops; __be16 tag_protocol; /* @@ -186,21 +189,4 @@ static inline void *ds_to_priv(struct dsa_switch *ds) return (void *)(ds + 1); } -/* - * The original DSA tag format and some other tag formats have no - * ethertype, which means that we need to add a little hack to the - * networking receive path to make sure that received frames get - * the right ->protocol assigned to them when one of those tag - * formats is in use. - */ -static inline bool dsa_uses_dsa_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_DSA)); -} - -static inline bool dsa_uses_trailer_tags(struct dsa_switch_tree *dst) -{ - return !!(dst->tag_protocol == htons(ETH_P_TRAILER)); -} - #endif diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 0f8210b8e0bc..aa63ed023c2b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -128,6 +128,7 @@ #define ETH_P_PHONET 0x00F5 /* Nokia Phonet frames */ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ +#define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ /* * This is an Ethernet frame header. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0a49632fac47..92e71d2a2ccd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -608,6 +608,24 @@ static void dsa_shutdown(struct platform_device *pdev) { } +static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + + if (unlikely(dst == NULL)) { + kfree_skb(skb); + return 0; + } + + return dst->ops->rcv(skb, dev, pt, orig_dev); +} + +struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, +}; + static const struct of_device_id dsa_of_match_table[] = { { .compatible = "marvell,dsa", }, {} @@ -633,30 +651,15 @@ static int __init dsa_init_module(void) if (rc) return rc; -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_add_pack(&dsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_add_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_add_pack(&trailer_packet_type); -#endif + dev_add_pack(&dsa_pack_type); + return 0; } module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { -#ifdef CONFIG_NET_DSA_TAG_TRAILER - dev_remove_pack(&trailer_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA - dev_remove_pack(&edsa_packet_type); -#endif -#ifdef CONFIG_NET_DSA_TAG_DSA - dev_remove_pack(&dsa_packet_type); -#endif + dev_remove_pack(&dsa_pack_type); platform_driver_unregister(&dsa_driver); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d4cf5cc747e3..218d75d16f6f 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -45,16 +45,13 @@ struct net_device *dsa_slave_create(struct dsa_switch *ds, int port, char *name); /* tag_dsa.c */ -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type dsa_packet_type; +extern const struct dsa_device_ops dsa_netdev_ops; /* tag_edsa.c */ -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type edsa_packet_type; +extern const struct dsa_device_ops edsa_netdev_ops; /* tag_trailer.c */ -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev); -extern struct packet_type trailer_packet_type; +extern const struct dsa_device_ops trailer_netdev_ops; #endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 45a1e34c89e0..ad1a913533aa 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -171,6 +171,14 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } +static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch_tree *dst = p->parent->dst; + + return dst->ops->xmit(skb, dev); +} + /* ethtool operations *******************************************************/ static int @@ -293,42 +301,16 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_sset_count = dsa_slave_get_sset_count, }; -#ifdef CONFIG_NET_DSA_TAG_DSA -static const struct net_device_ops dsa_netdev_ops = { +static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_init = dsa_slave_init, .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, - .ndo_start_xmit = dsa_xmit, + .ndo_start_xmit = dsa_slave_xmit, .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_do_ioctl = dsa_slave_ioctl, }; -#endif -#ifdef CONFIG_NET_DSA_TAG_EDSA -static const struct net_device_ops edsa_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = edsa_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif -#ifdef CONFIG_NET_DSA_TAG_TRAILER -static const struct net_device_ops trailer_netdev_ops = { - .ndo_init = dsa_slave_init, - .ndo_open = dsa_slave_open, - .ndo_stop = dsa_slave_close, - .ndo_start_xmit = trailer_xmit, - .ndo_change_rx_flags = dsa_slave_change_rx_flags, - .ndo_set_rx_mode = dsa_slave_set_rx_mode, - .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_do_ioctl = dsa_slave_ioctl, -}; -#endif /* slave device setup *******************************************************/ struct net_device * @@ -349,21 +331,22 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); slave_dev->tx_queue_len = 0; + slave_dev->netdev_ops = &dsa_slave_netdev_ops; switch (ds->dst->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case htons(ETH_P_DSA): - slave_dev->netdev_ops = &dsa_netdev_ops; + ds->dst->ops = &dsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_EDSA case htons(ETH_P_EDSA): - slave_dev->netdev_ops = &edsa_netdev_ops; + ds->dst->ops = &edsa_netdev_ops; break; #endif #ifdef CONFIG_NET_DSA_TAG_TRAILER case htons(ETH_P_TRAILER): - slave_dev->netdev_ops = &trailer_netdev_ops; + ds->dst->ops = &trailer_netdev_ops; break; #endif default: diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index cacce1e22f9c..d7dbc5bda5c0 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -16,7 +16,7 @@ #define DSA_HLEN 4 -netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; @@ -186,7 +186,7 @@ out: return 0; } -struct packet_type dsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_DSA), - .func = dsa_rcv, +const struct dsa_device_ops dsa_netdev_ops = { + .xmit = dsa_xmit, + .rcv = dsa_rcv, }; diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index e70c43c25e64..6b30abe89183 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -17,7 +17,7 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 -netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; @@ -205,7 +205,7 @@ out: return 0; } -struct packet_type edsa_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_EDSA), - .func = edsa_rcv, +const struct dsa_device_ops edsa_netdev_ops = { + .xmit = edsa_xmit, + .rcv = edsa_rcv, }; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 94bc260d015d..5fe9444842c5 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -14,7 +14,7 @@ #include #include "dsa_priv.h" -netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; @@ -114,7 +114,7 @@ out: return 0; } -struct packet_type trailer_packet_type __read_mostly = { - .type = cpu_to_be16(ETH_P_TRAILER), - .func = trailer_rcv, +const struct dsa_device_ops trailer_netdev_ops = { + .xmit = trailer_xmit, + .rcv = trailer_rcv, }; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index f405e0592407..5cebca134585 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -181,11 +181,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. */ - if (unlikely(netdev_uses_dsa_tags(dev))) - return htons(ETH_P_DSA); - - if (unlikely(netdev_uses_trailer_tags(dev))) - return htons(ETH_P_TRAILER); + if (unlikely(netdev_uses_dsa(dev))) + return htons(ETH_P_XDSA); if (likely(ntohs(eth->h_proto) >= ETH_P_802_3_MIN)) return eth->h_proto; -- cgit v1.2.3 From 0f8a4de3e088797576ac76200b634b802e5c7781 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 26 Aug 2014 14:00:37 +0200 Subject: KVM: Unconditionally export KVM_CAP_READONLY_MEM The idea between capabilities and the KVM_CHECK_EXTENSION ioctl is that userspace can, at run-time, determine if a feature is supported or not. This allows KVM to being supporting a new feature with a new kernel version without any need to update user space. Unfortunately, since the definition of KVM_CAP_READONLY_MEM was guarded by #ifdef __KVM_HAVE_READONLY_MEM, such discovery still required a user space update. Therefore, unconditionally export KVM_CAP_READONLY_MEM and change the in-kernel conditional to rely on __KVM_HAVE_READONLY_MEM. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 2 -- virt/kvm/kvm_main.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cf3a2ff440e4..90d3edab839c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -738,9 +738,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 -#ifdef __KVM_HAVE_READONLY_MEM #define KVM_CAP_READONLY_MEM 81 -#endif #define KVM_CAP_IRQFD_RESAMPLE 82 #define KVM_CAP_PPC_BOOKE_WATCHDOG 83 #define KVM_CAP_PPC_HTAB_FD 84 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a0817ee996e..1d03967def40 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -708,7 +708,7 @@ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; -#ifdef KVM_CAP_READONLY_MEM +#ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif -- cgit v1.2.3 From 44b5ce73c99c389817be71b9161bceb197d40ecb Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 26 Aug 2014 14:00:38 +0200 Subject: KVM: Unconditionally export KVM_CAP_USER_NMI The idea between capabilities and the KVM_CHECK_EXTENSION ioctl is that userspace can, at run-time, determine if a feature is supported or not. This allows KVM to being supporting a new feature with a new kernel version without any need to update user space. Unfortunately, since the definition of KVM_CAP_USER_NMI was guarded by #ifdef __KVM_HAVE_USER_NMI, such discovery still required a user space update. Therefore, unconditionally export KVM_CAP_USER_NMI and change the the typo in the comment for the IOCTL number definition as well. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 90d3edab839c..0695a1e3e332 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -654,9 +654,7 @@ struct kvm_ppc_smmu_info { #endif /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 -#ifdef __KVM_HAVE_USER_NMI #define KVM_CAP_USER_NMI 22 -#endif #ifdef __KVM_HAVE_GUEST_DEBUG #define KVM_CAP_SET_GUEST_DEBUG 23 #endif @@ -1091,7 +1089,7 @@ struct kvm_s390_ucas_mapping { #define KVM_S390_INITIAL_RESET _IO(KVMIO, 0x97) #define KVM_GET_MP_STATE _IOR(KVMIO, 0x98, struct kvm_mp_state) #define KVM_SET_MP_STATE _IOW(KVMIO, 0x99, struct kvm_mp_state) -/* Available with KVM_CAP_NMI */ +/* Available with KVM_CAP_USER_NMI */ #define KVM_NMI _IO(KVMIO, 0x9a) /* Available with KVM_CAP_SET_GUEST_DEBUG */ #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) -- cgit v1.2.3 From 880a6fab8f6ba5b5abe59ea68533202ddea1012c Mon Sep 17 00:00:00 2001 From: Christophe Gouault Date: Fri, 29 Aug 2014 16:16:05 +0200 Subject: xfrm: configure policy hash table thresholds by netlink Enable to specify local and remote prefix length thresholds for the policy hash table via a netlink XFRM_MSG_NEWSPDINFO message. prefix length thresholds are specified by XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH optional attributes (struct xfrmu_spdhthresh). example: struct xfrmu_spdhthresh thresh4 = { .lbits = 0; .rbits = 24; }; struct xfrmu_spdhthresh thresh6 = { .lbits = 0; .rbits = 56; }; struct nlmsghdr *hdr; struct nl_msg *msg; msg = nlmsg_alloc(); hdr = nlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, XFRMA_SPD_IPV4_HTHRESH, sizeof(__u32), NLM_F_REQUEST); nla_put(msg, XFRMA_SPD_IPV4_HTHRESH, sizeof(thresh4), &thresh4); nla_put(msg, XFRMA_SPD_IPV6_HTHRESH, sizeof(thresh6), &thresh6); nla_send_auto(sk, msg); The numbers are the policy selector minimum prefix lengths to put a policy in the hash table. - lbits is the local threshold (source address for out policies, destination address for in and fwd policies). - rbits is the remote threshold (destination address for out policies, source address for in and fwd policies). The default values are: XFRMA_SPD_IPV4_HTHRESH: 32 32 XFRMA_SPD_IPV6_HTHRESH: 128 128 Dynamic re-building of the SPD is performed when the thresholds values are changed. The current thresholds can be read via a XFRM_MSG_GETSPDINFO request: the kernel replies to XFRM_MSG_GETSPDINFO requests by an XFRM_MSG_NEWSPDINFO message, with both attributes XFRMA_SPD_IPV4_HTHRESH and XFRMA_SPD_IPV6_HTHRESH. Signed-off-by: Christophe Gouault Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 10 ++++++ include/net/xfrm.h | 1 + include/uapi/linux/xfrm.h | 7 ++++ net/xfrm/xfrm_policy.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 80 +++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 182 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 41902a8103bd..9da798256f0e 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -19,6 +19,15 @@ struct xfrm_policy_hash { u8 sbits6; }; +struct xfrm_policy_hthresh { + struct work_struct work; + seqlock_t lock; + u8 lbits4; + u8 rbits4; + u8 lbits6; + u8 rbits6; +}; + struct netns_xfrm { struct list_head state_all; /* @@ -45,6 +54,7 @@ struct netns_xfrm { struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX * 2]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; + struct xfrm_policy_hthresh policy_hthresh; struct sock *nlsk; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 721e9c3b11bd..dc4865e90fe4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1591,6 +1591,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8, int dir, u32 id, int delete, int *err); int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); +void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); int verify_spi_info(u8 proto, u32 min, u32 max); int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 25e5dd916ba4..02d5125a5ee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -328,6 +328,8 @@ enum xfrm_spdattr_type_t { XFRMA_SPD_UNSPEC, XFRMA_SPD_INFO, XFRMA_SPD_HINFO, + XFRMA_SPD_IPV4_HTHRESH, + XFRMA_SPD_IPV6_HTHRESH, __XFRMA_SPD_MAX #define XFRMA_SPD_MAX (__XFRMA_SPD_MAX - 1) @@ -347,6 +349,11 @@ struct xfrmu_spdhinfo { __u32 spdhmcnt; }; +struct xfrmu_spdhthresh { + __u8 lbits; + __u8 rbits; +}; + struct xfrm_usersa_info { struct xfrm_selector sel; struct xfrm_id id; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e6ff7b4046ea..55bcb8604bc6 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -566,6 +566,86 @@ static void xfrm_hash_resize(struct work_struct *work) mutex_unlock(&hash_resize_mutex); } +static void xfrm_hash_rebuild(struct work_struct *work) +{ + struct net *net = container_of(work, struct net, + xfrm.policy_hthresh.work); + unsigned int hmask; + struct xfrm_policy *pol; + struct xfrm_policy *policy; + struct hlist_head *chain; + struct hlist_head *odst; + struct hlist_node *newpos; + int i; + int dir; + unsigned seq; + u8 lbits4, rbits4, lbits6, rbits6; + + mutex_lock(&hash_resize_mutex); + + /* read selector prefixlen thresholds */ + do { + seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + lbits4 = net->xfrm.policy_hthresh.lbits4; + rbits4 = net->xfrm.policy_hthresh.rbits4; + lbits6 = net->xfrm.policy_hthresh.lbits6; + rbits6 = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); + + write_lock_bh(&net->xfrm.xfrm_policy_lock); + + /* reset the bydst and inexact table in all directions */ + for (dir = 0; dir < XFRM_POLICY_MAX * 2; dir++) { + INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); + hmask = net->xfrm.policy_bydst[dir].hmask; + odst = net->xfrm.policy_bydst[dir].table; + for (i = hmask; i >= 0; i--) + INIT_HLIST_HEAD(odst + i); + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { + /* dir out => dst = remote, src = local */ + net->xfrm.policy_bydst[dir].dbits4 = rbits4; + net->xfrm.policy_bydst[dir].sbits4 = lbits4; + net->xfrm.policy_bydst[dir].dbits6 = rbits6; + net->xfrm.policy_bydst[dir].sbits6 = lbits6; + } else { + /* dir in/fwd => dst = local, src = remote */ + net->xfrm.policy_bydst[dir].dbits4 = lbits4; + net->xfrm.policy_bydst[dir].sbits4 = rbits4; + net->xfrm.policy_bydst[dir].dbits6 = lbits6; + net->xfrm.policy_bydst[dir].sbits6 = rbits6; + } + } + + /* re-insert all policies by order of creation */ + list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { + newpos = NULL; + chain = policy_hash_bysel(net, &policy->selector, + policy->family, + xfrm_policy_id2dir(policy->index)); + hlist_for_each_entry(pol, chain, bydst) { + if (policy->priority >= pol->priority) + newpos = &pol->bydst; + else + break; + } + if (newpos) + hlist_add_behind(&policy->bydst, newpos); + else + hlist_add_head(&policy->bydst, chain); + } + + write_unlock_bh(&net->xfrm.xfrm_policy_lock); + + mutex_unlock(&hash_resize_mutex); +} + +void xfrm_policy_hash_rebuild(struct net *net) +{ + schedule_work(&net->xfrm.policy_hthresh.work); +} +EXPORT_SYMBOL(xfrm_policy_hash_rebuild); + /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) @@ -2872,9 +2952,16 @@ static int __net_init xfrm_policy_init(struct net *net) htab->dbits6 = 128; htab->sbits6 = 128; } + net->xfrm.policy_hthresh.lbits4 = 32; + net->xfrm.policy_hthresh.rbits4 = 32; + net->xfrm.policy_hthresh.lbits6 = 128; + net->xfrm.policy_hthresh.rbits6 = 128; + + seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); if (net_eq(net, &init_net)) register_netdevice_notifier(&xfrm_dev_notifier); return 0; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d4db6ebb089d..eaf8a8f1cbe8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -964,7 +964,9 @@ static inline size_t xfrm_spdinfo_msgsize(void) { return NLMSG_ALIGN(4) + nla_total_size(sizeof(struct xfrmu_spdinfo)) - + nla_total_size(sizeof(struct xfrmu_spdhinfo)); + + nla_total_size(sizeof(struct xfrmu_spdhinfo)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)) + + nla_total_size(sizeof(struct xfrmu_spdhthresh)); } static int build_spdinfo(struct sk_buff *skb, struct net *net, @@ -973,9 +975,11 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, struct xfrmk_spdinfo si; struct xfrmu_spdinfo spc; struct xfrmu_spdhinfo sph; + struct xfrmu_spdhthresh spt4, spt6; struct nlmsghdr *nlh; int err; u32 *f; + unsigned lseq; nlh = nlmsg_put(skb, portid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); if (nlh == NULL) /* shouldn't really happen ... */ @@ -993,9 +997,22 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, sph.spdhcnt = si.spdhcnt; sph.spdhmcnt = si.spdhmcnt; + do { + lseq = read_seqbegin(&net->xfrm.policy_hthresh.lock); + + spt4.lbits = net->xfrm.policy_hthresh.lbits4; + spt4.rbits = net->xfrm.policy_hthresh.rbits4; + spt6.lbits = net->xfrm.policy_hthresh.lbits6; + spt6.rbits = net->xfrm.policy_hthresh.rbits6; + } while (read_seqretry(&net->xfrm.policy_hthresh.lock, lseq)); + err = nla_put(skb, XFRMA_SPD_INFO, sizeof(spc), &spc); if (!err) err = nla_put(skb, XFRMA_SPD_HINFO, sizeof(sph), &sph); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV4_HTHRESH, sizeof(spt4), &spt4); + if (!err) + err = nla_put(skb, XFRMA_SPD_IPV6_HTHRESH, sizeof(spt6), &spt6); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -1004,6 +1021,51 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, return nlmsg_end(skb, nlh); } +static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrmu_spdhthresh *thresh4 = NULL; + struct xfrmu_spdhthresh *thresh6 = NULL; + + /* selector prefixlen thresholds to hash policies */ + if (attrs[XFRMA_SPD_IPV4_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh4)) + return -EINVAL; + thresh4 = nla_data(rta); + if (thresh4->lbits > 32 || thresh4->rbits > 32) + return -EINVAL; + } + if (attrs[XFRMA_SPD_IPV6_HTHRESH]) { + struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH]; + + if (nla_len(rta) < sizeof(*thresh6)) + return -EINVAL; + thresh6 = nla_data(rta); + if (thresh6->lbits > 128 || thresh6->rbits > 128) + return -EINVAL; + } + + if (thresh4 || thresh6) { + write_seqlock(&net->xfrm.policy_hthresh.lock); + if (thresh4) { + net->xfrm.policy_hthresh.lbits4 = thresh4->lbits; + net->xfrm.policy_hthresh.rbits4 = thresh4->rbits; + } + if (thresh6) { + net->xfrm.policy_hthresh.lbits6 = thresh6->lbits; + net->xfrm.policy_hthresh.rbits6 = thresh6->rbits; + } + write_sequnlock(&net->xfrm.policy_hthresh.lock); + + xfrm_policy_hash_rebuild(net); + } + + return 0; +} + static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2274,6 +2336,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), }; @@ -2308,10 +2371,17 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, }; +static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { + [XFRMA_SPD_IPV4_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, + [XFRMA_SPD_IPV6_HTHRESH] = { .len = sizeof(struct xfrmu_spdhthresh) }, +}; + static const struct xfrm_link { int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); + const struct nla_policy *nla_pol; + int nla_max; } xfrm_dispatch[XFRM_NR_MSGTYPES] = { [XFRM_MSG_NEWSA - XFRM_MSG_BASE] = { .doit = xfrm_add_sa }, [XFRM_MSG_DELSA - XFRM_MSG_BASE] = { .doit = xfrm_del_sa }, @@ -2335,6 +2405,9 @@ static const struct xfrm_link { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_set_spdinfo, + .nla_pol = xfrma_spd_policy, + .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; @@ -2371,8 +2444,9 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } } - err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, XFRMA_MAX, - xfrma_policy); + err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, + link->nla_max ? : XFRMA_MAX, + link->nla_pol ? : xfrma_policy); if (err < 0) return err; -- cgit v1.2.3 From 1df22b4ea9d91b01267fb61c155c31fb65d6b8a0 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 27 Aug 2014 22:58:45 +0200 Subject: usb: gadget: f_fs: add usb_functionfs_descs_head_v2 structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The structure can be used with user space tools that use the new functionfs description format, for example as follows: static const struct { struct usb_functionfs_descs_head_v2 header; __le32 fs_count; __le32 hs_count; struct { … } fs_desc; struct { … } hs_desc; } descriptors = { .header = { .magic = cpu_to_le32(FUNCTIONFS_DESCRIPTORS_MAGIC_V2), .length = cpu_to_le32(sizeof(descriptors)), .flags = cpu_to_le32(FUNCTIONFS_HAS_FS_DESC | FUNCTIONFS_HAS_HS_DESC) }, .fs_count = cpu_to_le32(X), .fs_desc = { … }, .hs_count = cpu_to_le32(Y), .hs_desc = { … } }; Signed-off-by: Michal Nazarewicz Signed-off-by: Felipe Balbi --- include/uapi/linux/usb/functionfs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index 0154b2859fd7..3ca03de7c2a8 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -32,6 +32,16 @@ struct usb_endpoint_descriptor_no_audio { __u8 bInterval; } __attribute__((packed)); +struct usb_functionfs_descs_head_v2 { + __le32 magic; + __le32 length; + __le32 flags; + /* + * __le32 fs_count, hs_count, fs_count; must be included manually in + * the structure taking flags into consideration. + */ +} __attribute__((packed)); + /* Legacy format, deprecated as of 3.14. */ struct usb_functionfs_descs_head { __le32 magic; -- cgit v1.2.3 From 51c208c746e800dba37d1a54d3c5e601630266c4 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 27 Aug 2014 22:58:46 +0200 Subject: tools: ffs-test: convert to new descriptor format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit [ac8dde11: “Add flags to descriptors block”] functionfs supports a new, more powerful and extensible, descriptor format. Since ffs-test is probably the first thing users of the functionfs interface see when they start writing functionfs user space daemons, convert it to use the new format thus promoting it. Signed-off-by: Michal Nazarewicz Signed-off-by: Felipe Balbi --- include/uapi/linux/usb/functionfs.h | 2 +- tools/usb/ffs-test.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index 3ca03de7c2a8..6d2a16b834bf 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -102,7 +102,7 @@ struct usb_ext_prop_desc { * structure. Any flags that are not recognised cause the whole block to be * rejected with -ENOSYS. * - * Legacy descriptors format: + * Legacy descriptors format (deprecated as of 3.14): * * | off | name | type | description | * |-----+-----------+--------------+--------------------------------------| diff --git a/tools/usb/ffs-test.c b/tools/usb/ffs-test.c index a87e99f37c52..708d317b0f37 100644 --- a/tools/usb/ffs-test.c +++ b/tools/usb/ffs-test.c @@ -1,5 +1,5 @@ /* - * ffs-test.c.c -- user mode filesystem api for usb composite function + * ffs-test.c -- user mode filesystem api for usb composite function * * Copyright (C) 2010 Samsung Electronics * Author: Michal Nazarewicz @@ -106,7 +106,9 @@ static void _msg(unsigned level, const char *fmt, ...) /******************** Descriptors and Strings *******************************/ static const struct { - struct usb_functionfs_descs_head header; + struct usb_functionfs_descs_head_v2 header; + __le32 fs_count; + __le32 hs_count; struct { struct usb_interface_descriptor intf; struct usb_endpoint_descriptor_no_audio sink; @@ -114,11 +116,12 @@ static const struct { } __attribute__((packed)) fs_descs, hs_descs; } __attribute__((packed)) descriptors = { .header = { - .magic = cpu_to_le32(FUNCTIONFS_DESCRIPTORS_MAGIC), + .magic = cpu_to_le32(FUNCTIONFS_DESCRIPTORS_MAGIC_V2), + .flags = cpu_to_le32(FUNCTIONFS_HAS_FS_DESC | + FUNCTIONFS_HAS_HS_DESC), .length = cpu_to_le32(sizeof descriptors), - .fs_count = cpu_to_le32(3), - .hs_count = cpu_to_le32(3), }, + .fs_count = cpu_to_le32(3), .fs_descs = { .intf = { .bLength = sizeof descriptors.fs_descs.intf, @@ -142,6 +145,7 @@ static const struct { /* .wMaxPacketSize = autoconfiguration (kernel) */ }, }, + .hs_count = cpu_to_le32(3), .hs_descs = { .intf = { .bLength = sizeof descriptors.fs_descs.intf, -- cgit v1.2.3 From eadf9e26fab7f9841adcc36f3559dbce7604fcd5 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 21 Aug 2014 16:49:16 -0300 Subject: [media] videodev2.h: add __user to v4l2_ext_control pointers These are not copied to kernel space by video_usercopy, so mark them as __user. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 778a3298fb34..0b1ba5c6a8d2 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1285,11 +1285,11 @@ struct v4l2_ext_control { union { __s32 value; __s64 value64; - char *string; - __u8 *p_u8; - __u16 *p_u16; - __u32 *p_u32; - void *ptr; + char __user *string; + __u8 __user *p_u8; + __u16 __user *p_u16; + __u32 __user *p_u32; + void __user *ptr; }; } __attribute__ ((packed)); -- cgit v1.2.3 From 1c7e23bf50264a251de53ad9fb1604683b801258 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 3 Sep 2014 15:25:00 +0300 Subject: nl80211: Allow declaring RRM-related features Radio Resource Measurement (RRM) is a bundle of features which will require the entire stack to participate. In this patch, the driver is given the opportunity to advertise the device's support for these RRM-related features, using feature flags: 1. Support for Quiet IEs. 2. Support for adding DS Parameter Set IE to probe requests. 3. Support for adding WFA TPC Report IE to probe requests. 4. Support for inserting tx power value to tx-ed packets at a fixed offset. This is used in action frames, such as RRM's Link Measurement Report, where the actual tx power should be reported in the frame. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d097568da690..c166d3d23e55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3968,6 +3968,16 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE: This driver supports dynamic * channel bandwidth change (e.g., HT 20 <-> 40 MHz channel) during the * lifetime of a BSS. + * @NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES: This device adds a DS Parameter + * Set IE to probe requests. + * @NL80211_FEATURE_WFA_TPC_IE_IN_PROBES: This device adds a WFA TPC Report IE + * to probe requests. + * @NL80211_FEATURE_QUIET: This device, in client mode, supports Quiet Period + * requests sent to it by an AP. + * @NL80211_FEATURE_TX_POWER_INSERTION: This device is capable of inserting the + * current tx power value into the TPC Report IE in the spectrum + * management TPC Report action frame, and in the Radio Measurement Link + * Measurement Report action frame. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -3989,6 +3999,10 @@ enum nl80211_feature_flags { NL80211_FEATURE_USERSPACE_MPM = 1 << 16, NL80211_FEATURE_ACTIVE_MONITOR = 1 << 17, NL80211_FEATURE_AP_MODE_CHAN_WIDTH_CHANGE = 1 << 18, + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES = 1 << 19, + NL80211_FEATURE_WFA_TPC_IE_IN_PROBES = 1 << 20, + NL80211_FEATURE_QUIET = 1 << 21, + NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, }; /** -- cgit v1.2.3 From bab5ab7d2a5466406e8003d038cc7ce6b2d5d804 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 3 Sep 2014 15:25:01 +0300 Subject: nl80211: Add flag attribute for RRM connections Add a flag attribute to use in associations, for tagging the target connection as supporting RRM. It is the responsibility of upper layers to set this flag only if both the underlying device, and the target network indeed support RRM. To be used in ASSOCIATE and CONNECT commands. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 13 +++++++++++++ net/wireless/nl80211.c | 17 +++++++++++++++++ 3 files changed, 32 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6be68bb31918..a887eeb5b31e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1608,10 +1608,12 @@ struct cfg80211_auth_request { * * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n) * @ASSOC_REQ_DISABLE_VHT: Disable VHT + * @ASSOC_REQ_USE_RRM: Declare RRM capability in this association */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), ASSOC_REQ_DISABLE_VHT = BIT(1), + ASSOC_REQ_USE_RRM = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c166d3d23e55..de69d3df5e55 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1594,6 +1594,17 @@ enum nl80211_commands { * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. * + * @NL80211_ATTR_USE_RRM: flag for indicating whether the current connection + * shall support Radio Resource Measurements (11k). This attribute can be + * used with %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests. + * User space applications are expected to use this flag only if the + * underlying device supports these minimal RRM features: + * %NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES, + * %NL80211_FEATURE_QUIET, + * If this flag is used, driver must add the Power Capabilities IE to the + * association request. In addition, it must also set the RRM capability + * flag in the association request's Capability Info field. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1936,6 +1947,8 @@ enum nl80211_attrs { NL80211_ATTR_TDLS_INITIATOR, + NL80211_ATTR_USE_RRM, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d876146498dd..459dc2769d0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -389,6 +389,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, + [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -6584,6 +6585,14 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) sizeof(req.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + req.flags |= ASSOC_REQ_USE_RRM; + } + err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); if (!err) { wdev_lock(dev->ieee80211_ptr); @@ -7241,6 +7250,14 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.vht_capa)); } + if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) { + if (!(rdev->wiphy.features & + NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) || + !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) + return -EINVAL; + connect.flags |= ASSOC_REQ_USE_RRM; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); wdev_unlock(dev->ieee80211_ptr); -- cgit v1.2.3 From 3057dbfdab1b86a77ed6d512fc857b032f78663b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 4 Sep 2014 23:57:40 +0200 Subject: cfg80211: enable dynack through nl80211 Enable ACK timeout estimation algorithm (dynack) using mac80211 set_coverage_class API. Dynack is activated passing coverage class equals to -1 to lower drivers and it is automatically disabled setting valid value for coverage class. Define NL80211_ATTR_WIPHY_DYN_ACK flag attribute to enable dynack from userspace. In order to activate dynack NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower drivers to indicate dynack capability. Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 12 ++++++++++++ net/wireless/nl80211.c | 11 +++++++++++ 3 files changed, 25 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a887eeb5b31e..0d17ec9df692 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1805,6 +1805,7 @@ struct cfg80211_connect_params { * @WIPHY_PARAM_FRAG_THRESHOLD: wiphy->frag_threshold has changed * @WIPHY_PARAM_RTS_THRESHOLD: wiphy->rts_threshold has changed * @WIPHY_PARAM_COVERAGE_CLASS: coverage class changed + * @WIPHY_PARAM_DYN_ACK: dynack has been enabled */ enum wiphy_params_flags { WIPHY_PARAM_RETRY_SHORT = 1 << 0, @@ -1812,6 +1813,7 @@ enum wiphy_params_flags { WIPHY_PARAM_FRAG_THRESHOLD = 1 << 2, WIPHY_PARAM_RTS_THRESHOLD = 1 << 3, WIPHY_PARAM_COVERAGE_CLASS = 1 << 4, + WIPHY_PARAM_DYN_ACK = 1 << 5, }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index de69d3df5e55..29c4399e08b9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1605,6 +1605,12 @@ enum nl80211_commands { * association request. In addition, it must also set the RRM capability * flag in the association request's Capability Info field. * + * @NL80211_ATTR_WIPHY_DYN_ACK: flag attribute used to enable ACK timeout + * estimation algorithm (dynack). In order to activate dynack + * %NL80211_FEATURE_ACKTO_ESTIMATION feature flag must be set by lower + * drivers to indicate dynack capability. Dynack is automatically disabled + * setting valid value for coverage class. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1949,6 +1955,8 @@ enum nl80211_attrs { NL80211_ATTR_USE_RRM, + NL80211_ATTR_WIPHY_DYN_ACK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3991,6 +3999,9 @@ enum nl80211_ap_sme_features { * current tx power value into the TPC Report IE in the spectrum * management TPC Report action frame, and in the Radio Measurement Link * Measurement Report action frame. + * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout + * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used + * to enable dynack. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4016,6 +4027,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_WFA_TPC_IE_IN_PROBES = 1 << 20, NL80211_FEATURE_QUIET = 1 << 21, NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, + NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 459dc2769d0e..cf178d2b621d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -226,6 +226,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_WIPHY_FRAG_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RTS_THRESHOLD] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_COVERAGE_CLASS] = { .type = NLA_U8 }, + [NL80211_ATTR_WIPHY_DYN_ACK] = { .type = NLA_FLAG }, [NL80211_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, @@ -2239,11 +2240,21 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) { + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) + return -EINVAL; + coverage_class = nla_get_u8( info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]); changed |= WIPHY_PARAM_COVERAGE_CLASS; } + if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) { + if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) + return -EOPNOTSUPP; + + changed |= WIPHY_PARAM_DYN_ACK; + } + if (changed) { u8 old_retry_short, old_retry_long; u32 old_frag_threshold, old_rts_threshold; -- cgit v1.2.3 From f0db9b073415848709dd59a6394969882f517da9 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan <_govind@gmx.com> Date: Wed, 3 Sep 2014 03:17:20 +0530 Subject: ethtool: Add generic options for tunables This patch adds new ethtool cmd, ETHTOOL_GTUNABLE & ETHTOOL_STUNABLE for getting tunable values from driver. Add get_tunable and set_tunable to ethtool_ops. Driver implements these functions for getting/setting tunable value. Signed-off-by: Govindarajulu Varadarajan <_govind@gmx.com> Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 +++ include/uapi/linux/ethtool.h | 28 +++++++++++++++ net/core/ethtool.c | 81 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e658229fee39..c1a2d60dfb82 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -257,6 +257,10 @@ struct ethtool_ops { struct ethtool_eeprom *, u8 *); int (*get_eee)(struct net_device *, struct ethtool_eee *); int (*set_eee)(struct net_device *, struct ethtool_eee *); + int (*get_tunable)(struct net_device *, + const struct ethtool_tunable *, void *); + int (*set_tunable)(struct net_device *, + const struct ethtool_tunable *, const void *); }; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index e3c7a719c76b..7a364f2f3d3f 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -209,6 +209,32 @@ struct ethtool_value { __u32 data; }; +enum tunable_id { + ETHTOOL_ID_UNSPEC, + ETHTOOL_RX_COPYBREAK, +}; + +enum tunable_type_id { + ETHTOOL_TUNABLE_UNSPEC, + ETHTOOL_TUNABLE_U8, + ETHTOOL_TUNABLE_U16, + ETHTOOL_TUNABLE_U32, + ETHTOOL_TUNABLE_U64, + ETHTOOL_TUNABLE_STRING, + ETHTOOL_TUNABLE_S8, + ETHTOOL_TUNABLE_S16, + ETHTOOL_TUNABLE_S32, + ETHTOOL_TUNABLE_S64, +}; + +struct ethtool_tunable { + __u32 cmd; + __u32 id; + __u32 type_id; + __u32 len; + void *data[0]; +}; + /** * struct ethtool_regs - hardware register dump * @cmd: Command number = %ETHTOOL_GREGS @@ -1152,6 +1178,8 @@ enum ethtool_sfeatures_retval_bits { #define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */ #define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */ +#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */ +#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 17cb912793fa..27e61b886520 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1621,6 +1621,80 @@ static int ethtool_get_module_eeprom(struct net_device *dev, modinfo.eeprom_len); } +static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) +{ + switch (tuna->id) { + case ETHTOOL_RX_COPYBREAK: + if (tuna->len != sizeof(u32) || + tuna->type_id != ETHTOOL_TUNABLE_U32) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->get_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + ret = ops->get_tunable(dev, &tuna, data); + if (ret) + goto out; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_to_user(useraddr, data, tuna.len)) + goto out; + ret = 0; + +out: + kfree(data); + return ret; +} + +static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) +{ + int ret; + struct ethtool_tunable tuna; + const struct ethtool_ops *ops = dev->ethtool_ops; + void *data; + + if (!ops->set_tunable) + return -EOPNOTSUPP; + if (copy_from_user(&tuna, useraddr, sizeof(tuna))) + return -EFAULT; + ret = ethtool_tunable_valid(&tuna); + if (ret) + return ret; + data = kmalloc(tuna.len, GFP_USER); + if (!data) + return -ENOMEM; + useraddr += sizeof(tuna); + ret = -EFAULT; + if (copy_from_user(data, useraddr, tuna.len)) + goto out; + ret = ops->set_tunable(dev, &tuna, data); + +out: + kfree(data); + return ret; +} + /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -1670,6 +1744,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: + case ETHTOOL_GTUNABLE: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -1857,6 +1932,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; + case ETHTOOL_GTUNABLE: + rc = ethtool_get_tunable(dev, useraddr); + break; + case ETHTOOL_STUNABLE: + rc = ethtool_set_tunable(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From d4288d3fac18bbc31cb6d369679b1fa1d9321ae9 Mon Sep 17 00:00:00 2001 From: Jurgen Kramer Date: Fri, 5 Sep 2014 10:47:56 +0200 Subject: ALSA: pcm: add new DSD sampleformat for native DSD playback on XMOS based devices XMOS based USB DACs with native DSD support expose this feature via a USB alternate setting. The audio format is either 32-bit raw or a 32-bit PCM format. To utilize this feature on linux this patch introduces a new 32-bit DSD sampleformat DSD_U32_LE. A follow up patch will add a quirk for XMOS based devices to utilize the new format. Further patches will add support to alsa-lib. Signed-off-by: Jurgen Kramer Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 1 + include/uapi/sound/asound.h | 3 ++- sound/core/pcm.c | 1 + sound/core/pcm_misc.c | 4 ++++ 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 67e0bdb9f0fa..e862497f7556 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -183,6 +183,7 @@ struct snd_pcm_ops { #define SNDRV_PCM_FMTBIT_G723_40_1B _SNDRV_PCM_FMTBIT(G723_40_1B) #define SNDRV_PCM_FMTBIT_DSD_U8 _SNDRV_PCM_FMTBIT(DSD_U8) #define SNDRV_PCM_FMTBIT_DSD_U16_LE _SNDRV_PCM_FMTBIT(DSD_U16_LE) +#define SNDRV_PCM_FMTBIT_DSD_U32_LE _SNDRV_PCM_FMTBIT(DSD_U32_LE) #ifdef SNDRV_LITTLE_ENDIAN #define SNDRV_PCM_FMTBIT_S16 SNDRV_PCM_FMTBIT_S16_LE diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 32168f7ffce3..6ee586728df9 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -219,7 +219,8 @@ typedef int __bitwise snd_pcm_format_t; #define SNDRV_PCM_FORMAT_G723_40_1B ((__force snd_pcm_format_t) 47) /* 1 sample in 1 byte */ #define SNDRV_PCM_FORMAT_DSD_U8 ((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */ #define SNDRV_PCM_FORMAT_DSD_U16_LE ((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */ -#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U16_LE +#define SNDRV_PCM_FORMAT_DSD_U32_LE ((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */ +#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_LE #ifdef SNDRV_LITTLE_ENDIAN #define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE diff --git a/sound/core/pcm.c b/sound/core/pcm.c index afccdc553ef9..42ded997b223 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -215,6 +215,7 @@ static char *snd_pcm_format_names[] = { FORMAT(G723_40_1B), FORMAT(DSD_U8), FORMAT(DSD_U16_LE), + FORMAT(DSD_U32_LE), }; const char *snd_pcm_format_name(snd_pcm_format_t format) diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c index 4560ca0e5651..83f54e1dbac9 100644 --- a/sound/core/pcm_misc.c +++ b/sound/core/pcm_misc.c @@ -148,6 +148,10 @@ static struct pcm_format_data pcm_formats[(INT)SNDRV_PCM_FORMAT_LAST+1] = { .width = 16, .phys = 16, .le = 1, .signd = 0, .silence = {}, }, + [SNDRV_PCM_FORMAT_DSD_U32_LE] = { + .width = 32, .phys = 32, .le = 1, .signd = 0, + .silence = { 0x69, 0x69, 0x69, 0x69 }, + }, /* FIXME: the following three formats are not defined properly yet */ [SNDRV_PCM_FORMAT_MPEG] = { .le = -1, .signd = -1, -- cgit v1.2.3 From ff7693d079e58fb62d735b7b8085b53fcfb74528 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Sun, 17 Aug 2014 12:49:49 +0200 Subject: ARM: meson: serial: add MesonX SoC on-chip uart driver The SoC has four fully functional UARTs which use the same programming model. They are named UART_A, UART_B, UART_C and UART_AO (Always-On) which cannot be powered off. Signed-off-by: Carlo Caione Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 18 ++ drivers/tty/serial/Makefile | 1 + drivers/tty/serial/meson_uart.c | 634 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 4 files changed, 656 insertions(+) create mode 100644 drivers/tty/serial/meson_uart.c (limited to 'include/uapi') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 75cc59f5d253..636949d8144d 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -200,6 +200,24 @@ config SERIAL_KS8695_CONSOLE receives all kernel messages and warnings and which allows logins in single user mode). +config SERIAL_MESON + tristate "Meson serial port support" + depends on ARCH_MESON + select SERIAL_CORE + help + This enables the driver for the on-chip UARTs of the Amlogic + MesonX processors. + +config SERIAL_MESON_CONSOLE + bool "Support for console on meson" + depends on SERIAL_MESON=y + select SERIAL_CORE_CONSOLE + help + Say Y here if you wish to use a Amlogic MesonX UART as the + system console (the system console is the device which + receives all kernel messages and warnings and which allows + logins in single user mode) as /dev/ttyAMLx. + config SERIAL_CLPS711X tristate "CLPS711X serial port support" depends on ARCH_CLPS711X || COMPILE_TEST diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 0080cc362e09..9a548acf5fdc 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o obj-$(CONFIG_SERIAL_ICOM) += icom.o obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o obj-$(CONFIG_SERIAL_MPSC) += mpsc.o +obj-$(CONFIG_SERIAL_MESON) += meson_uart.o obj-$(CONFIG_SERIAL_SB1250_DUART) += sb1250-duart.o obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o obj-$(CONFIG_SERIAL_SCCNXP) += sccnxp.o diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c new file mode 100644 index 000000000000..15c749753317 --- /dev/null +++ b/drivers/tty/serial/meson_uart.c @@ -0,0 +1,634 @@ +/* + * Based on meson_uart.c, by AMLOGIC, INC. + * + * Copyright (C) 2014 Carlo Caione + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Register offsets */ +#define AML_UART_WFIFO 0x00 +#define AML_UART_RFIFO 0x04 +#define AML_UART_CONTROL 0x08 +#define AML_UART_STATUS 0x0c +#define AML_UART_MISC 0x10 +#define AML_UART_REG5 0x14 + +/* AML_UART_CONTROL bits */ +#define AML_UART_TX_EN BIT(12) +#define AML_UART_RX_EN BIT(13) +#define AML_UART_TX_RST BIT(22) +#define AML_UART_RX_RST BIT(23) +#define AML_UART_CLR_ERR BIT(24) +#define AML_UART_RX_INT_EN BIT(27) +#define AML_UART_TX_INT_EN BIT(28) +#define AML_UART_DATA_LEN_MASK (0x03 << 20) +#define AML_UART_DATA_LEN_8BIT (0x00 << 20) +#define AML_UART_DATA_LEN_7BIT (0x01 << 20) +#define AML_UART_DATA_LEN_6BIT (0x02 << 20) +#define AML_UART_DATA_LEN_5BIT (0x03 << 20) + +/* AML_UART_STATUS bits */ +#define AML_UART_PARITY_ERR BIT(16) +#define AML_UART_FRAME_ERR BIT(17) +#define AML_UART_TX_FIFO_WERR BIT(18) +#define AML_UART_RX_EMPTY BIT(20) +#define AML_UART_TX_FULL BIT(21) +#define AML_UART_TX_EMPTY BIT(22) +#define AML_UART_ERR (AML_UART_PARITY_ERR | \ + AML_UART_FRAME_ERR | \ + AML_UART_TX_FIFO_WERR) + +/* AML_UART_CONTROL bits */ +#define AML_UART_TWO_WIRE_EN BIT(15) +#define AML_UART_PARITY_TYPE BIT(18) +#define AML_UART_PARITY_EN BIT(19) +#define AML_UART_CLEAR_ERR BIT(24) +#define AML_UART_STOP_BIN_LEN_MASK (0x03 << 16) +#define AML_UART_STOP_BIN_1SB (0x00 << 16) +#define AML_UART_STOP_BIN_2SB (0x01 << 16) + +/* AML_UART_MISC bits */ +#define AML_UART_XMIT_IRQ(c) (((c) & 0xff) << 8) +#define AML_UART_RECV_IRQ(c) ((c) & 0xff) + +/* AML_UART_REG5 bits */ +#define AML_UART_BAUD_MASK 0x7fffff +#define AML_UART_BAUD_USE BIT(23) + +#define AML_UART_PORT_NUM 6 +#define AML_UART_DEV_NAME "ttyAML" + + +static struct uart_driver meson_uart_driver; + +static struct uart_port *meson_ports[AML_UART_PORT_NUM]; + +static void meson_uart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ +} + +static unsigned int meson_uart_get_mctrl(struct uart_port *port) +{ + return TIOCM_CTS; +} + +static unsigned int meson_uart_tx_empty(struct uart_port *port) +{ + u32 val; + + val = readl(port->membase + AML_UART_STATUS); + return (val & AML_UART_TX_EMPTY) ? TIOCSER_TEMT : 0; +} + +static void meson_uart_stop_tx(struct uart_port *port) +{ + u32 val; + + val = readl(port->membase + AML_UART_CONTROL); + val &= ~AML_UART_TX_EN; + writel(val, port->membase + AML_UART_CONTROL); +} + +static void meson_uart_stop_rx(struct uart_port *port) +{ + u32 val; + + val = readl(port->membase + AML_UART_CONTROL); + val &= ~AML_UART_RX_EN; + writel(val, port->membase + AML_UART_CONTROL); +} + +static void meson_uart_shutdown(struct uart_port *port) +{ + unsigned long flags; + u32 val; + + free_irq(port->irq, port); + + spin_lock_irqsave(&port->lock, flags); + + val = readl(port->membase + AML_UART_CONTROL); + val &= ~(AML_UART_RX_EN | AML_UART_TX_EN); + val &= ~(AML_UART_RX_INT_EN | AML_UART_TX_INT_EN); + writel(val, port->membase + AML_UART_CONTROL); + + spin_unlock_irqrestore(&port->lock, flags); +} + +static void meson_uart_start_tx(struct uart_port *port) +{ + struct circ_buf *xmit = &port->state->xmit; + unsigned int ch; + + if (uart_tx_stopped(port)) { + meson_uart_stop_tx(port); + return; + } + + while (!(readl(port->membase + AML_UART_STATUS) & AML_UART_TX_FULL)) { + if (port->x_char) { + writel(port->x_char, port->membase + AML_UART_WFIFO); + port->icount.tx++; + port->x_char = 0; + continue; + } + + if (uart_circ_empty(xmit)) + break; + + ch = xmit->buf[xmit->tail]; + writel(ch, port->membase + AML_UART_WFIFO); + xmit->tail = (xmit->tail+1) & (SERIAL_XMIT_SIZE - 1); + port->icount.tx++; + } + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); +} + +static void meson_receive_chars(struct uart_port *port) +{ + struct tty_port *tport = &port->state->port; + char flag; + u32 status, ch, mode; + + do { + flag = TTY_NORMAL; + port->icount.rx++; + status = readl(port->membase + AML_UART_STATUS); + + if (status & AML_UART_ERR) { + if (status & AML_UART_TX_FIFO_WERR) + port->icount.overrun++; + else if (status & AML_UART_FRAME_ERR) + port->icount.frame++; + else if (status & AML_UART_PARITY_ERR) + port->icount.frame++; + + mode = readl(port->membase + AML_UART_CONTROL); + mode |= AML_UART_CLEAR_ERR; + writel(mode, port->membase + AML_UART_CONTROL); + + /* It doesn't clear to 0 automatically */ + mode &= ~AML_UART_CLEAR_ERR; + writel(mode, port->membase + AML_UART_CONTROL); + + status &= port->read_status_mask; + if (status & AML_UART_FRAME_ERR) + flag = TTY_FRAME; + else if (status & AML_UART_PARITY_ERR) + flag = TTY_PARITY; + } + + ch = readl(port->membase + AML_UART_RFIFO); + ch &= 0xff; + + if ((status & port->ignore_status_mask) == 0) + tty_insert_flip_char(tport, ch, flag); + + if (status & AML_UART_TX_FIFO_WERR) + tty_insert_flip_char(tport, 0, TTY_OVERRUN); + + } while (!(readl(port->membase + AML_UART_STATUS) & AML_UART_RX_EMPTY)); + + spin_unlock(&port->lock); + tty_flip_buffer_push(tport); + spin_lock(&port->lock); +} + +static irqreturn_t meson_uart_interrupt(int irq, void *dev_id) +{ + struct uart_port *port = (struct uart_port *)dev_id; + + spin_lock(&port->lock); + + if (!(readl(port->membase + AML_UART_STATUS) & AML_UART_RX_EMPTY)) + meson_receive_chars(port); + + if (!(readl(port->membase + AML_UART_STATUS) & AML_UART_TX_FULL)) + meson_uart_start_tx(port); + + spin_unlock(&port->lock); + + return IRQ_HANDLED; +} + +static const char *meson_uart_type(struct uart_port *port) +{ + return (port->type == PORT_MESON) ? "meson_uart" : NULL; +} + +static int meson_uart_startup(struct uart_port *port) +{ + u32 val; + int ret = 0; + + val = readl(port->membase + AML_UART_CONTROL); + val |= (AML_UART_RX_RST | AML_UART_TX_RST | AML_UART_CLR_ERR); + writel(val, port->membase + AML_UART_CONTROL); + + val &= ~(AML_UART_RX_RST | AML_UART_TX_RST | AML_UART_CLR_ERR); + writel(val, port->membase + AML_UART_CONTROL); + + val |= (AML_UART_RX_EN | AML_UART_TX_EN); + writel(val, port->membase + AML_UART_CONTROL); + + val |= (AML_UART_RX_INT_EN | AML_UART_TX_INT_EN); + writel(val, port->membase + AML_UART_CONTROL); + + val = (AML_UART_RECV_IRQ(1) | AML_UART_XMIT_IRQ(port->fifosize / 2)); + writel(val, port->membase + AML_UART_MISC); + + ret = request_irq(port->irq, meson_uart_interrupt, 0, + meson_uart_type(port), port); + + return ret; +} + +static void meson_uart_change_speed(struct uart_port *port, unsigned long baud) +{ + u32 val; + + while (!(readl(port->membase + AML_UART_STATUS) & AML_UART_TX_EMPTY)) + cpu_relax(); + + val = readl(port->membase + AML_UART_REG5); + val &= ~AML_UART_BAUD_MASK; + val = ((port->uartclk * 10 / (baud * 4) + 5) / 10) - 1; + val |= AML_UART_BAUD_USE; + writel(val, port->membase + AML_UART_REG5); +} + +static void meson_uart_set_termios(struct uart_port *port, + struct ktermios *termios, + struct ktermios *old) +{ + unsigned int cflags, iflags, baud; + unsigned long flags; + u32 val; + + spin_lock_irqsave(&port->lock, flags); + + cflags = termios->c_cflag; + iflags = termios->c_iflag; + + val = readl(port->membase + AML_UART_CONTROL); + + val &= ~AML_UART_DATA_LEN_MASK; + switch (cflags & CSIZE) { + case CS8: + val |= AML_UART_DATA_LEN_8BIT; + break; + case CS7: + val |= AML_UART_DATA_LEN_7BIT; + break; + case CS6: + val |= AML_UART_DATA_LEN_6BIT; + break; + case CS5: + val |= AML_UART_DATA_LEN_5BIT; + break; + } + + if (cflags & PARENB) + val |= AML_UART_PARITY_EN; + else + val &= ~AML_UART_PARITY_EN; + + if (cflags & PARODD) + val |= AML_UART_PARITY_TYPE; + else + val &= ~AML_UART_PARITY_TYPE; + + val &= ~AML_UART_STOP_BIN_LEN_MASK; + if (cflags & CSTOPB) + val |= AML_UART_STOP_BIN_2SB; + else + val &= ~AML_UART_STOP_BIN_1SB; + + if (cflags & CRTSCTS) + val &= ~AML_UART_TWO_WIRE_EN; + else + val |= AML_UART_TWO_WIRE_EN; + + writel(val, port->membase + AML_UART_CONTROL); + + baud = uart_get_baud_rate(port, termios, old, 9600, 115200); + meson_uart_change_speed(port, baud); + + port->read_status_mask = AML_UART_TX_FIFO_WERR; + if (iflags & INPCK) + port->read_status_mask |= AML_UART_PARITY_ERR | + AML_UART_FRAME_ERR; + + port->ignore_status_mask = 0; + if (iflags & IGNPAR) + port->ignore_status_mask |= AML_UART_PARITY_ERR | + AML_UART_FRAME_ERR; + + uart_update_timeout(port, termios->c_cflag, baud); + spin_unlock_irqrestore(&port->lock, flags); +} + +static int meson_uart_verify_port(struct uart_port *port, + struct serial_struct *ser) +{ + int ret = 0; + + if (port->type != PORT_MESON) + ret = -EINVAL; + if (port->irq != ser->irq) + ret = -EINVAL; + if (ser->baud_base < 9600) + ret = -EINVAL; + return ret; +} + +static void meson_uart_release_port(struct uart_port *port) +{ + if (port->flags & UPF_IOREMAP) { + iounmap(port->membase); + port->membase = NULL; + } +} + +static int meson_uart_request_port(struct uart_port *port) +{ + struct platform_device *pdev = to_platform_device(port->dev); + struct resource *res; + int size; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&pdev->dev, "cannot obtain I/O memory region"); + return -ENODEV; + } + size = resource_size(res); + + if (!devm_request_mem_region(port->dev, port->mapbase, size, + dev_name(port->dev))) { + dev_err(port->dev, "Memory region busy\n"); + return -EBUSY; + } + + if (port->flags & UPF_IOREMAP) { + port->membase = devm_ioremap_nocache(port->dev, + port->mapbase, + size); + if (port->membase == NULL) + return -ENOMEM; + } + + return 0; +} + +static void meson_uart_config_port(struct uart_port *port, int flags) +{ + if (flags & UART_CONFIG_TYPE) { + port->type = PORT_MESON; + meson_uart_request_port(port); + } +} + +static struct uart_ops meson_uart_ops = { + .set_mctrl = meson_uart_set_mctrl, + .get_mctrl = meson_uart_get_mctrl, + .tx_empty = meson_uart_tx_empty, + .start_tx = meson_uart_start_tx, + .stop_tx = meson_uart_stop_tx, + .stop_rx = meson_uart_stop_rx, + .startup = meson_uart_startup, + .shutdown = meson_uart_shutdown, + .set_termios = meson_uart_set_termios, + .type = meson_uart_type, + .config_port = meson_uart_config_port, + .request_port = meson_uart_request_port, + .release_port = meson_uart_release_port, + .verify_port = meson_uart_verify_port, +}; + +#ifdef CONFIG_SERIAL_MESON_CONSOLE + +static void meson_console_putchar(struct uart_port *port, int ch) +{ + if (!port->membase) + return; + + while (readl(port->membase + AML_UART_STATUS) & AML_UART_TX_FULL) + cpu_relax(); + writel(ch, port->membase + AML_UART_WFIFO); +} + +static void meson_serial_console_write(struct console *co, const char *s, + u_int count) +{ + struct uart_port *port; + unsigned long flags; + int locked; + + port = meson_ports[co->index]; + if (!port) + return; + + local_irq_save(flags); + if (port->sysrq) { + locked = 0; + } else if (oops_in_progress) { + locked = spin_trylock(&port->lock); + } else { + spin_lock(&port->lock); + locked = 1; + } + + uart_console_write(port, s, count, meson_console_putchar); + + if (locked) + spin_unlock(&port->lock); + local_irq_restore(flags); +} + +static int meson_serial_console_setup(struct console *co, char *options) +{ + struct uart_port *port; + int baud = 115200; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + if (co->index < 0 || co->index >= AML_UART_PORT_NUM) + return -EINVAL; + + port = meson_ports[co->index]; + if (!port || !port->membase) + return -ENODEV; + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + + return uart_set_options(port, co, baud, parity, bits, flow); +} + +static struct console meson_serial_console = { + .name = AML_UART_DEV_NAME, + .write = meson_serial_console_write, + .device = uart_console_device, + .setup = meson_serial_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &meson_uart_driver, +}; + +static int __init meson_serial_console_init(void) +{ + register_console(&meson_serial_console); + return 0; +} +console_initcall(meson_serial_console_init); + +#define MESON_SERIAL_CONSOLE (&meson_serial_console) +#else +#define MESON_SERIAL_CONSOLE NULL +#endif + +static struct uart_driver meson_uart_driver = { + .owner = THIS_MODULE, + .driver_name = "meson_uart", + .dev_name = AML_UART_DEV_NAME, + .nr = AML_UART_PORT_NUM, + .cons = MESON_SERIAL_CONSOLE, +}; + +static int meson_uart_probe(struct platform_device *pdev) +{ + struct resource *res_mem, *res_irq; + struct uart_port *port; + struct clk *clk; + int ret = 0; + + if (pdev->dev.of_node) + pdev->id = of_alias_get_id(pdev->dev.of_node, "serial"); + + if (pdev->id < 0 || pdev->id >= AML_UART_PORT_NUM) + return -EINVAL; + + res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res_mem) + return -ENODEV; + + res_irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!res_irq) + return -ENODEV; + + if (meson_ports[pdev->id]) { + dev_err(&pdev->dev, "port %d already allocated\n", pdev->id); + return -EBUSY; + } + + port = devm_kzalloc(&pdev->dev, sizeof(struct uart_port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + clk = clk_get(&pdev->dev, NULL); + if (IS_ERR(clk)) + return PTR_ERR(clk); + + port->uartclk = clk_get_rate(clk); + port->iotype = UPIO_MEM; + port->mapbase = res_mem->start; + port->irq = res_irq->start; + port->flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP | UPF_LOW_LATENCY; + port->dev = &pdev->dev; + port->line = pdev->id; + port->type = PORT_MESON; + port->x_char = 0; + port->ops = &meson_uart_ops; + port->fifosize = 64; + + meson_ports[pdev->id] = port; + platform_set_drvdata(pdev, port); + + ret = uart_add_one_port(&meson_uart_driver, port); + if (ret) + meson_ports[pdev->id] = NULL; + + return ret; +} + +static int meson_uart_remove(struct platform_device *pdev) +{ + struct uart_port *port; + + port = platform_get_drvdata(pdev); + uart_remove_one_port(&meson_uart_driver, port); + meson_ports[pdev->id] = NULL; + + return 0; +} + + +static const struct of_device_id meson_uart_dt_match[] = { + { .compatible = "amlogic,meson-uart" }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, meson_uart_dt_match); + +static struct platform_driver meson_uart_platform_driver = { + .probe = meson_uart_probe, + .remove = meson_uart_remove, + .driver = { + .owner = THIS_MODULE, + .name = "meson_uart", + .of_match_table = meson_uart_dt_match, + }, +}; + +static int __init meson_uart_init(void) +{ + int ret; + + ret = uart_register_driver(&meson_uart_driver); + if (ret) + return ret; + + ret = platform_driver_register(&meson_uart_platform_driver); + if (ret) + uart_unregister_driver(&meson_uart_driver); + + return ret; +} + +static void __exit meson_uart_exit(void) +{ + platform_driver_unregister(&meson_uart_platform_driver); + uart_unregister_driver(&meson_uart_driver); +} + +module_init(meson_uart_init); +module_exit(meson_uart_exit); + +MODULE_AUTHOR("Carlo Caione "); +MODULE_DESCRIPTION("Amlogic Meson serial port driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 5820269aa132..16ad8521af6a 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -244,4 +244,7 @@ /* SC16IS74xx */ #define PORT_SC16IS7XX 108 +/* MESON */ +#define PORT_MESON 109 + #endif /* _UAPILINUX_SERIAL_CORE_H */ -- cgit v1.2.3 From f3dbd802b3caf8da92173870bc270dda6b3f84ba Mon Sep 17 00:00:00 2001 From: Rajat Jain Date: Tue, 2 Sep 2014 16:26:00 -0700 Subject: PCI: Enable CRS Software Visibility for root port if it is supported Per PCIe r3.0, sec 2.3.2, an endpoint may respond to a Configuration Request with a Completion with Configuration Request Retry Status (CRS). This terminates the Configuration Request. When the CRS Software Visibility feature is disabled (as it is by default), a Root Complex must handle a CRS Completion by re-issuing the Configuration Request. This is invisible to software. From the CPU's point of view, an endpoint that always responds with CRS causes a hang because the Root Complex never supplies data to complete the CPU read. When CRS Software Visibility is enabled, a Root Complex that receives a CRS Completion for a read of the Vendor ID must return data of 0x0001. The Vendor ID of 0x0001 indicates to software that the endpoint is not ready. We now have more devices that require CRS Software Visibility. For example, a PLX 8713 NT bridge may respond with CRS until it has been configured via I2C, and the I2C configuration is completely independent of PCI enumeration. Enable CRS Software Visibility if it is supported. This allows a system with such a device to work (though the PCI core times out waiting for it to become ready, and we have to rescan the bus after it is ready). This essentially reverts ad7edfe04908 ("[PCI] Do not enable CRS Software Visibility by default"). The failures that led to ad7edfe04908 should be addressed by 89665a6a7140 ("PCI: Check only the Vendor ID to identify Configuration Request Retry"). [bhelgaas: changelog] Link: http://lkml.kernel.org/r/20071029061532.5d10dfc6@snowcone Link: http://lkml.kernel.org/r/alpine.LFD.0.9999.0712271023090.21557@woody.linux-foundation.org Signed-off-by: Rajat Jain Signed-off-by: Rajat Jain Signed-off-by: Guenter Roeck Signed-off-by: Bjorn Helgaas --- drivers/pci/probe.c | 13 +++++++++++++ include/uapi/linux/pci_regs.h | 1 + 2 files changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 9c26663c91d2..e02cdaa5bf0c 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -740,6 +740,17 @@ struct pci_bus *pci_add_new_bus(struct pci_bus *parent, struct pci_dev *dev, } EXPORT_SYMBOL(pci_add_new_bus); +static void pci_enable_crs(struct pci_dev *pdev) +{ + u16 root_cap = 0; + + /* Enable CRS Software Visibility if supported */ + pcie_capability_read_word(pdev, PCI_EXP_RTCAP, &root_cap); + if (root_cap & PCI_EXP_RTCAP_CRSVIS) + pcie_capability_set_word(pdev, PCI_EXP_RTCTL, + PCI_EXP_RTCTL_CRSSVE); +} + /* * If it's a bridge, configure it and scan the bus behind it. * For CardBus bridges, we don't scan behind as the devices will @@ -787,6 +798,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass) pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT); + pci_enable_crs(dev); + if ((secondary || subordinate) && !pcibios_assign_all_busses() && !is_cardbus && !broken) { unsigned int cmax; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 30db069bce62..57d6e56a086a 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -552,6 +552,7 @@ #define PCI_EXP_RTCTL_PMEIE 0x0008 /* PME Interrupt Enable */ #define PCI_EXP_RTCTL_CRSSVE 0x0010 /* CRS Software Visibility Enable */ #define PCI_EXP_RTCAP 30 /* Root Capabilities */ +#define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ #define PCI_EXP_RTSTA 32 /* Root Status */ #define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ #define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ -- cgit v1.2.3 From 3045d76070abe725dbb7fd8ff39c27b820d5a7eb Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Tue, 2 Sep 2014 20:36:14 +0200 Subject: netfilter: nf_tables: add devgroup support in meta expresion Add devgroup support to let us match device group of a packets incoming or outgoing interface. Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ net/netfilter/nft_meta.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c9b6f00a3fb7..c000947d3f38 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -573,6 +573,8 @@ enum nft_exthdr_attributes { * @NFT_META_BRI_OIFNAME: packet output bridge interface name * @NFT_META_PKTTYPE: packet type (skb->pkt_type), special handling for loopback * @NFT_META_CPU: cpu id through smp_processor_id() + * @NFT_META_IIFGROUP: packet input interface group + * @NFT_META_OIFGROUP: packet output interface group */ enum nft_meta_keys { NFT_META_LEN, @@ -596,6 +598,8 @@ enum nft_meta_keys { NFT_META_BRI_OIFNAME, NFT_META_PKTTYPE, NFT_META_CPU, + NFT_META_IIFGROUP, + NFT_META_OIFGROUP, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 843e099a962d..1e7c076ca63a 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -155,6 +155,16 @@ void nft_meta_get_eval(const struct nft_expr *expr, case NFT_META_CPU: dest->data[0] = smp_processor_id(); break; + case NFT_META_IIFGROUP: + if (in == NULL) + goto err; + dest->data[0] = in->group; + break; + case NFT_META_OIFGROUP: + if (out == NULL) + goto err; + dest->data[0] = out->group; + break; default: WARN_ON(1); goto err; @@ -228,6 +238,8 @@ int nft_meta_get_init(const struct nft_ctx *ctx, #endif case NFT_META_PKTTYPE: case NFT_META_CPU: + case NFT_META_IIFGROUP: + case NFT_META_OIFGROUP: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From e42eff8a32f8b7bde88ea3c5a56391407cbe84f3 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Thu, 4 Sep 2014 14:06:14 +0200 Subject: netfilter: nft_nat: include a flag attribute Both SNAT and DNAT (and the upcoming masquerade) can have additional configuration parameters, such as port randomization and NAT addressing persistence. We can cover these scenarios by simply adding a flag attribute for userspace to fill when needed. The flags to use are defined in include/uapi/linux/netfilter/nf_nat.h: NF_NAT_RANGE_MAP_IPS NF_NAT_RANGE_PROTO_SPECIFIED NF_NAT_RANGE_PROTO_RANDOM NF_NAT_RANGE_PERSISTENT NF_NAT_RANGE_PROTO_RANDOM_FULLY NF_NAT_RANGE_PROTO_RANDOM_ALL The caller must take care of not messing up with the flags, as they are added unconditionally to the final resulting nf_nat_range. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_nat.h | 5 +++++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_nat.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_nat.h b/include/uapi/linux/netfilter/nf_nat.h index 1ad3659102b6..0880781ad7b6 100644 --- a/include/uapi/linux/netfilter/nf_nat.h +++ b/include/uapi/linux/netfilter/nf_nat.h @@ -13,6 +13,11 @@ #define NF_NAT_RANGE_PROTO_RANDOM_ALL \ (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY) +#define NF_NAT_RANGE_MASK \ + (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED | \ + NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT | \ + NF_NAT_RANGE_PROTO_RANDOM_FULLY) + struct nf_nat_ipv4_range { unsigned int flags; __be32 min_ip; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c000947d3f38..6022c6e6be18 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -785,6 +785,7 @@ enum nft_nat_types { * @NFTA_NAT_REG_ADDR_MAX: source register of address range end (NLA_U32: nft_registers) * @NFTA_NAT_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) * @NFTA_NAT_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_NAT_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) */ enum nft_nat_attributes { NFTA_NAT_UNSPEC, @@ -794,6 +795,7 @@ enum nft_nat_attributes { NFTA_NAT_REG_ADDR_MAX, NFTA_NAT_REG_PROTO_MIN, NFTA_NAT_REG_PROTO_MAX, + NFTA_NAT_FLAGS, __NFTA_NAT_MAX }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 79ff58cd36dc..799550b476fb 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -33,6 +33,7 @@ struct nft_nat { enum nft_registers sreg_proto_max:8; enum nf_nat_manip_type type:8; u8 family; + u16 flags; }; static void nft_nat_eval(const struct nft_expr *expr, @@ -71,6 +72,8 @@ static void nft_nat_eval(const struct nft_expr *expr, range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } + range.flags |= priv->flags; + data[NFT_REG_VERDICT].verdict = nf_nat_setup_info(ct, &range, priv->type); } @@ -82,6 +85,7 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_REG_ADDR_MAX] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MIN] = { .type = NLA_U32 }, [NFTA_NAT_REG_PROTO_MAX] = { .type = NLA_U32 }, + [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, }; static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, @@ -149,6 +153,12 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else priv->sreg_proto_max = priv->sreg_proto_min; + if (tb[NFTA_NAT_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + return 0; } @@ -183,6 +193,12 @@ static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) htonl(priv->sreg_proto_max))) goto nla_put_failure; } + + if (priv->flags != 0) { + if (nla_put_be32(skb, NFTA_NAT_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + } + return 0; nla_put_failure: -- cgit v1.2.3 From 9ba1f726bec090399eb9bb9157eb32dedc8e8c45 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Mon, 8 Sep 2014 13:45:00 +0200 Subject: netfilter: nf_tables: add new nft_masq expression The nft_masq expression is intended to perform NAT in the masquerade flavour. We decided to have the masquerade functionality in a separated expression other than nft_nat. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_masq.h | 16 ++++++ include/uapi/linux/netfilter/nf_tables.h | 11 ++++ net/ipv4/netfilter/Kconfig | 6 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_masq_ipv4.c | 89 ++++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 +++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_masq_ipv6.c | 89 ++++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 9 ++++ net/netfilter/Makefile | 1 + net/netfilter/nft_masq.c | 59 +++++++++++++++++++++ 11 files changed, 288 insertions(+) create mode 100644 include/net/netfilter/nft_masq.h create mode 100644 net/ipv4/netfilter/nft_masq_ipv4.c create mode 100644 net/ipv6/netfilter/nft_masq_ipv6.c create mode 100644 net/netfilter/nft_masq.c (limited to 'include/uapi') diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h new file mode 100644 index 000000000000..c72729f954f4 --- /dev/null +++ b/include/net/netfilter/nft_masq.h @@ -0,0 +1,16 @@ +#ifndef _NFT_MASQ_H_ +#define _NFT_MASQ_H_ + +struct nft_masq { + u32 flags; +}; + +extern const struct nla_policy nft_masq_policy[]; + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr); + +#endif /* _NFT_MASQ_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6022c6e6be18..eeec0ae845ef 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -800,4 +800,15 @@ enum nft_nat_attributes { }; #define NFTA_NAT_MAX (__NFTA_NAT_MAX - 1) +/** + * enum nft_masq_attributes - nf_tables masquerade expression attributes + * + * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_masq_attributes { + NFTA_MASQ_FLAGS, + __NFTA_MASQ_MAX +}; +#define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 4be3e541350e..8dd3d9f19d82 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -190,6 +190,12 @@ config NF_NAT_MASQUERADE_IPV4 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection). +config NFT_MASQ_IPV4 + tristate "IPv4 masquerading support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV4 + config IP_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV4 diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 42056b2fd0e3..7d019aefb0ed 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o +obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c new file mode 100644 index 000000000000..6ea1d207b6a5 --- /dev/null +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_masq_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum, + &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static int nft_masq_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + int err; + + err = nft_masq_init(ctx, expr, tb); + if (err < 0) + return err; + + nf_nat_masquerade_ipv4_register_notifier(); + return 0; +} + +static void nft_masq_ipv4_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nf_nat_masquerade_ipv4_unregister_notifier(); +} + +static struct nft_expr_type nft_masq_ipv4_type; +static const struct nft_expr_ops nft_masq_ipv4_ops = { + .type = &nft_masq_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv4_eval, + .init = nft_masq_ipv4_init, + .destroy = nft_masq_ipv4_destroy, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "masq", + .ops = &nft_masq_ipv4_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv4_module_init(void) +{ + return nft_register_expr(&nft_masq_ipv4_type); +} + +static void __exit nft_masq_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv4_type); +} + +module_init(nft_masq_ipv4_module_init); +module_exit(nft_masq_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 6c8cfec6836a..24c535f66df0 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -252,6 +252,12 @@ config NF_NAT_MASQUERADE_IPV6 This is the kernel functionality to provide NAT in the masquerade flavour (automatic source address selection) for IPv6. +config NFT_MASQ_IPV6 + tristate "IPv6 masquerade support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_MASQ + select NF_NAT_MASQUERADE_IPV6 + config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NF_NAT_MASQUERADE_IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 89a0bd751f82..482c4dff273f 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o +obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c new file mode 100644 index 000000000000..4e51334ef6b7 --- /dev/null +++ b/net/ipv6/netfilter/nft_masq_ipv6.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_masq_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_masq *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + range.flags = priv->flags; + + verdict = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out); + + data[NFT_REG_VERDICT].verdict = verdict; +} + +static int nft_masq_ipv6_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + int err; + + err = nft_masq_init(ctx, expr, tb); + if (err < 0) + return err; + + nf_nat_masquerade_ipv6_register_notifier(); + return 0; +} + +static void nft_masq_ipv6_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + nf_nat_masquerade_ipv6_unregister_notifier(); +} + +static struct nft_expr_type nft_masq_ipv6_type; +static const struct nft_expr_ops nft_masq_ipv6_ops = { + .type = &nft_masq_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)), + .eval = nft_masq_ipv6_eval, + .init = nft_masq_ipv6_init, + .destroy = nft_masq_ipv6_destroy, + .dump = nft_masq_dump, +}; + +static struct nft_expr_type nft_masq_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "masq", + .ops = &nft_masq_ipv6_ops, + .policy = nft_masq_policy, + .maxattr = NFTA_MASQ_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_masq_ipv6_module_init(void) +{ + return nft_register_expr(&nft_masq_ipv6_type); +} + +static void __exit nft_masq_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_masq_ipv6_type); +} + +module_init(nft_masq_ipv6_module_init); +module_exit(nft_masq_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ad751fe2e82b..37428723394f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -496,6 +496,15 @@ config NFT_LIMIT This option adds the "limit" expression that you can use to ratelimit rule matchings. +config NFT_MASQ + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables masquerade support" + help + This option adds the "masquerade" expression that you can use + to perform NAT in the masquerade flavour. + config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 8308624a406a..0637792f6faf 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_NFT_RBTREE) += nft_rbtree.o obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o +obj-$(CONFIG_NFT_MASQ) += nft_masq.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c new file mode 100644 index 000000000000..6637bab00567 --- /dev/null +++ b/net/netfilter/nft_masq.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { + [NFTA_MASQ_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_masq_policy); + +int nft_masq_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_masq *priv = nft_expr_priv(expr); + + if (tb[NFTA_MASQ_FLAGS] == NULL) + return 0; + + priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(nft_masq_init); + +int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_masq *priv = nft_expr_priv(expr); + + if (priv->flags == 0) + return 0; + + if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_masq_dump); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); -- cgit v1.2.3 From c559a353410939c0884e83bdb0e2420a986ac53b Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Tue, 9 Sep 2014 08:23:16 +0200 Subject: usb: gadget: f_fs: add ioctl returning ep descriptor This patch introduces ioctl named FUNCTIONFS_ENDPOINT_DESC, which returns endpoint descriptor to userspace. It works only if function is active. Signed-off-by: Robert Baldyga Acked-by: Michal Nazarewicz Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 23 +++++++++++++++++++++++ include/uapi/linux/usb/functionfs.h | 6 ++++++ 2 files changed, 29 insertions(+) (limited to 'include/uapi') diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 1aad353c1f11..a345385a5abe 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1026,6 +1026,29 @@ static long ffs_epfile_ioctl(struct file *file, unsigned code, case FUNCTIONFS_ENDPOINT_REVMAP: ret = epfile->ep->num; break; + case FUNCTIONFS_ENDPOINT_DESC: + { + int desc_idx; + struct usb_endpoint_descriptor *desc; + + switch (epfile->ffs->gadget->speed) { + case USB_SPEED_SUPER: + desc_idx = 2; + break; + case USB_SPEED_HIGH: + desc_idx = 1; + break; + default: + desc_idx = 0; + } + desc = epfile->ep->descs[desc_idx]; + + spin_unlock_irq(&epfile->ffs->eps_lock); + ret = copy_to_user((void *)value, desc, sizeof(*desc)); + if (ret) + ret = -EFAULT; + return ret; + } default: ret = -ENOTTY; } diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index 6d2a16b834bf..7c7a2feb0e6e 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -275,6 +275,12 @@ struct usb_functionfs_event { */ #define FUNCTIONFS_ENDPOINT_REVMAP _IO('g', 129) +/* + * Returns endpoint descriptor. If function is not active returns -ENODEV. + */ +#define FUNCTIONFS_ENDPOINT_DESC _IOR('g', 130, \ + struct usb_endpoint_descriptor) + #endif /* _UAPI__LINUX_FUNCTIONFS_H__ */ -- cgit v1.2.3 From daedfb22451dd02b35c0549566cbb7cc06bdd53b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Sep 2014 22:17:18 -0700 Subject: net: filter: split filter.h and expose eBPF to user space allow user space to generate eBPF programs uapi/linux/bpf.h: eBPF instruction set definition linux/filter.h: the rest This patch only moves macro definitions, but practically it freezes existing eBPF instruction set, though new instructions can still be added in the future. These eBPF definitions cannot go into uapi/linux/filter.h, since the names may conflict with existing applications. Full eBPF ISA description is in Documentation/networking/filter.txt Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 56 +--------------------------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 55 deletions(-) create mode 100644 include/uapi/linux/bpf.h (limited to 'include/uapi') diff --git a/include/linux/filter.h b/include/linux/filter.h index bf323da77950..8f82ef3f1cdd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -10,58 +10,12 @@ #include #include #include +#include struct sk_buff; struct sock; struct seccomp_data; -/* Internally used and optimized filter representation with extended - * instruction set based on top of classic BPF. - */ - -/* instruction classes */ -#define BPF_ALU64 0x07 /* alu mode in double word width */ - -/* ld/ldx fields */ -#define BPF_DW 0x18 /* double word */ -#define BPF_XADD 0xc0 /* exclusive add */ - -/* alu/jmp fields */ -#define BPF_MOV 0xb0 /* mov reg to reg */ -#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ - -/* change endianness of a register */ -#define BPF_END 0xd0 /* flags for endianness conversion: */ -#define BPF_TO_LE 0x00 /* convert to little-endian */ -#define BPF_TO_BE 0x08 /* convert to big-endian */ -#define BPF_FROM_LE BPF_TO_LE -#define BPF_FROM_BE BPF_TO_BE - -#define BPF_JNE 0x50 /* jump != */ -#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ -#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ -#define BPF_CALL 0x80 /* function call */ -#define BPF_EXIT 0x90 /* function return */ - -/* Register numbers */ -enum { - BPF_REG_0 = 0, - BPF_REG_1, - BPF_REG_2, - BPF_REG_3, - BPF_REG_4, - BPF_REG_5, - BPF_REG_6, - BPF_REG_7, - BPF_REG_8, - BPF_REG_9, - BPF_REG_10, - __MAX_BPF_REG, -}; - -/* BPF has 10 general purpose 64-bit registers and stack frame. */ -#define MAX_BPF_REG __MAX_BPF_REG - /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. @@ -322,14 +276,6 @@ enum { #define SK_RUN_FILTER(filter, ctx) \ (*filter->prog->bpf_func)(ctx, filter->prog->insnsi) -struct bpf_insn { - __u8 code; /* opcode */ - __u8 dst_reg:4; /* dest register */ - __u8 src_reg:4; /* source register */ - __s16 off; /* signed offset */ - __s32 imm; /* signed immediate constant */ -}; - #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 24e9033f8b3f..fb3f7b675229 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -67,6 +67,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += bpf.h header-y += bpqether.h header-y += bsg.h header-y += btrfs.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h new file mode 100644 index 000000000000..479ed0b6be16 --- /dev/null +++ b/include/uapi/linux/bpf.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word */ +#define BPF_XADD 0xc0 /* exclusive add */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +#endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From e5c3ea5c668033b303e7ac835d7d91da32d97958 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 5 Sep 2014 15:51:31 +0200 Subject: bridge: implement rtnl_link_ops->get_size and rtnl_link_ops->fill_info Allow rtnetlink users to get bridge master info in IFLA_INFO_DATA attr This initial part implements forward_delay, hello_time, max_age options. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 12 ++++++++++++ net/bridge/br_netlink.c | 25 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ff957604a721..c80f95f6ee78 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -215,6 +215,18 @@ enum in6_addr_gen_mode { IN6_ADDR_GEN_MODE_NONE, }; +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05aeea1222a7..bcac09fe2ac8 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -484,6 +484,29 @@ static size_t br_port_get_slave_size(const struct net_device *brdev, return br_port_info_size(); } +static size_t br_get_size(const struct net_device *brdev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_BR_FORWARD_DELAY */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_HELLO_TIME */ + nla_total_size(sizeof(u32)) + /* IFLA_BR_MAX_AGE */ + 0; +} + +static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) +{ + struct net_bridge *br = netdev_priv(brdev); + u32 forward_delay = jiffies_to_clock_t(br->forward_delay); + u32 hello_time = jiffies_to_clock_t(br->hello_time); + u32 age_time = jiffies_to_clock_t(br->max_age); + + if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || + nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || + nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time)) + return -EMSGSIZE; + + return 0; +} + static size_t br_get_link_af_size(const struct net_device *dev) { struct net_port_vlans *pv; @@ -514,6 +537,8 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .validate = br_validate, .newlink = br_dev_newlink, .dellink = br_dev_delete, + .get_size = br_get_size, + .fill_info = br_fill_info, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, -- cgit v1.2.3 From c858403943886a92eece9d0413aa65c48bbe6fa7 Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Thu, 28 Aug 2014 15:56:00 +0900 Subject: drm/radeon: Add RADEON_GEM_CPU_ACCESS BO creation flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This flag is a hint that userspace expects the BO to be accessed by the CPU. We can use that hint to prevent such BOs from ever being stored in the CPU inaccessible part of VRAM. Signed-off-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_object.c | 11 +++++++++-- include/uapi/drm/radeon_drm.h | 2 ++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index aadbd36e64b9..eef60aaf4e64 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -144,7 +144,12 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) for (i = 0; i < c; ++i) { rbo->placements[i].fpfn = 0; - rbo->placements[i].lpfn = 0; + if ((rbo->flags & RADEON_GEM_CPU_ACCESS) && + (rbo->placements[i].flags & TTM_PL_FLAG_VRAM)) + rbo->placements[i].lpfn = + rbo->rdev->mc.visible_vram_size >> PAGE_SHIFT; + else + rbo->placements[i].lpfn = 0; } /* @@ -152,7 +157,9 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) * improve fragmentation quality. * 512kb was measured as the most optimal number. */ - if (rbo->tbo.mem.size > 512 * 1024) { + if (!((rbo->flags & RADEON_GEM_CPU_ACCESS) && + (rbo->placements[i].flags & TTM_PL_FLAG_VRAM)) && + rbo->tbo.mem.size > 512 * 1024) { for (i = 0; i < c; i++) { rbo->placements[i].flags |= TTM_PL_FLAG_TOPDOWN; } diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 375b6e656c54..f755f20d2b5c 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -801,6 +801,8 @@ struct drm_radeon_gem_info { #define RADEON_GEM_NO_BACKING_STORE (1 << 0) #define RADEON_GEM_GTT_UC (1 << 1) #define RADEON_GEM_GTT_WC (1 << 2) +/* BO is expected to be accessed by the CPU */ +#define RADEON_GEM_CPU_ACCESS (1 << 3) struct drm_radeon_gem_create { uint64_t size; -- cgit v1.2.3 From f266f04d33e5265e2f61ffc9d2b2f97214804995 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 28 Aug 2014 10:59:05 -0400 Subject: drm/radeon: add RADEON_GEM_NO_CPU_ACCESS BO creation flag (v4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows pinning of buffers in the non-CPU visible portion of vram. v2: incorporate Michel's comments. v3: rebase on Michel's patch v4: rebase on Michel's v2 patch Signed-off-by: Alex Deucher Reviewed-by: Michel Dänzer --- drivers/gpu/drm/radeon/radeon_object.c | 1 + include/uapi/drm/radeon_drm.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 3dbbd65336d5..8abee5fa93bd 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -313,6 +313,7 @@ int radeon_bo_pin_restricted(struct radeon_bo *bo, u32 domain, u64 max_offset, for (i = 0; i < bo->placement.num_placement; i++) { /* force to pin into visible video ram */ if ((bo->placements[i].flags & TTM_PL_FLAG_VRAM) && + !(bo->flags & RADEON_GEM_NO_CPU_ACCESS) && (!max_offset || max_offset > bo->rdev->mc.visible_vram_size)) bo->placements[i].lpfn = bo->rdev->mc.visible_vram_size >> PAGE_SHIFT; diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index f755f20d2b5c..50d0fb41a3bf 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -803,6 +803,8 @@ struct drm_radeon_gem_info { #define RADEON_GEM_GTT_WC (1 << 2) /* BO is expected to be accessed by the CPU */ #define RADEON_GEM_CPU_ACCESS (1 << 3) +/* CPU access is not expected to work for this BO */ +#define RADEON_GEM_NO_CPU_ACCESS (1 << 4) struct drm_radeon_gem_create { uint64_t size; -- cgit v1.2.3 From 960d01acf62747d6518694f92be5b06f67473833 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 9 Sep 2014 22:55:35 +0300 Subject: cfg80211: add WMM traffic stream API Add nl80211 and driver API to validate, add and delete traffic streams with appropriate settings. The API calls for userspace doing the action frame handshake with the peer, and then allows only to set up the parameters in the driver. To avoid setting up a session only to tear it down again, the validate API is provided, but the real usage later can still fail so userspace must be prepared for that. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++ include/net/cfg80211.h | 23 ++++++++- include/uapi/linux/nl80211.h | 28 +++++++++++ net/wireless/nl80211.c | 109 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 31 ++++++++++++ net/wireless/trace.h | 45 ++++++++++++++++++ 6 files changed, 239 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 61a09f0644aa..b1be39c76931 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -166,8 +166,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_MESH_ID_LEN 32 +#define IEEE80211_FIRST_TSPEC_TSID 8 #define IEEE80211_NUM_TIDS 16 +/* number of user priorities 802.11 uses */ +#define IEEE80211_NUM_UPS 8 + #define IEEE80211_QOS_CTL_LEN 2 /* 1d tag mask */ #define IEEE80211_QOS_CTL_TAG1D_MASK 0x0007 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c2c710c14a50..a13beab123dc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2318,6 +2318,17 @@ struct cfg80211_qos_map { * @set_ap_chanwidth: Set the AP (including P2P GO) mode channel width for the * given interface This is used e.g. for dynamic HT 20/40 MHz channel width * changes during the lifetime of the BSS. + * + * @add_tx_ts: validate (if admitted_time is 0) or add a TX TS to the device + * with the given parameters; action frame exchange has been handled by + * userspace so this just has to modify the TX path to take the TS into + * account. + * If the admitted time is 0 just validate the parameters to make sure + * the session can be created at all; it is valid to just always return + * success for that but that may result in inefficient behaviour (handshake + * with the peer followed by immediate teardown when the addition is later + * rejected) + * @del_tx_ts: remove an existing TX TS */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2558,6 +2569,12 @@ struct cfg80211_ops { int (*set_ap_chanwidth)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_chan_def *chandef); + + int (*add_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer, u8 user_prio, + u16 admitted_time); + int (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev, + u8 tsid, const u8 *peer); }; /* @@ -2604,9 +2621,13 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). + * @WIPHY_FLAG_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM + * TSPEC sessions (TID aka TSID 0-7) with the NL80211_CMD_ADD_TX_TS + * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it + * needs to be able to handle Block-Ack agreements and other things. */ enum wiphy_flags { - /* use hole at 0 */ + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION = BIT(0), /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 29c4399e08b9..e5b8caf5e466 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -722,6 +722,22 @@ * QoS mapping is relevant for IP packets, it is only valid during an * association. This is cleared on disassociation and AP restart. * + * @NL80211_CMD_ADD_TX_TS: Ask the kernel to add a traffic stream for the given + * %NL80211_ATTR_TSID and %NL80211_ATTR_MAC with %NL80211_ATTR_USER_PRIO + * and %NL80211_ATTR_ADMITTED_TIME parameters. + * Note that the action frame handshake with the AP shall be handled by + * userspace via the normal management RX/TX framework, this only sets + * up the TX TS in the driver/device. + * If the admitted time attribute is not added then the request just checks + * if a subsequent setup could be successful, the intent is to use this to + * avoid setting up a session with the AP when local restrictions would + * make that impossible. However, the subsequent "real" setup may still + * fail even if the check was successful. + * @NL80211_CMD_DEL_TX_TS: Remove an existing TS with the %NL80211_ATTR_TSID + * and %NL80211_ATTR_MAC parameters. It isn't necessary to call this + * before removing a station entry entirely, or before disassociating + * or similar, cleanup will happen in the driver/device in this case. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -893,6 +909,9 @@ enum nl80211_commands { NL80211_CMD_SET_QOS_MAP, + NL80211_CMD_ADD_TX_TS, + NL80211_CMD_DEL_TX_TS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1611,6 +1630,11 @@ enum nl80211_commands { * drivers to indicate dynack capability. Dynack is automatically disabled * setting valid value for coverage class. * + * @NL80211_ATTR_TSID: a TSID value (u8 attribute) + * @NL80211_ATTR_USER_PRIO: user priority value (u8 attribute) + * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds + * (per second) (u16 attribute) + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1957,6 +1981,10 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_DYN_ACK, + NL80211_ATTR_TSID, + NL80211_ATTR_USER_PRIO, + NL80211_ATTR_ADMITTED_TIME, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e9fbd4f4ddb0..ab7ee4893e40 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -391,6 +391,9 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, + [NL80211_ATTR_TSID] = { .type = NLA_U8 }, + [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, + [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, }; /* policy for the key attributes */ @@ -1510,6 +1513,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); CMD(set_qos_map, SET_QOS_MAP); + if (rdev->wiphy.flags & + WIPHY_FLAG_SUPPORTS_WMM_ADMISSION) + CMD(add_tx_ts, ADD_TX_TS); } /* add into the if now */ #undef CMD @@ -9390,6 +9396,93 @@ static int nl80211_set_qos_map(struct sk_buff *skb, return ret; } +static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid, up; + u16 admitted_time = 0; + int err; + + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_USER_PRIO]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + if (tsid >= IEEE80211_NUM_TIDS) + return -EINVAL; + + up = nla_get_u8(info->attrs[NL80211_ATTR_USER_PRIO]); + if (up >= IEEE80211_NUM_UPS) + return -EINVAL; + + /* WMM uses TIDs 0-7 even for TSPEC */ + if (tsid < IEEE80211_FIRST_TSPEC_TSID) { + if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + return -EINVAL; + } else { + /* TODO: handle 802.11 TSPEC/admission control + * need more attributes for that (e.g. BA session requirement) + */ + return -EINVAL; + } + + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_ADMITTED_TIME]) { + admitted_time = + nla_get_u16(info->attrs[NL80211_ATTR_ADMITTED_TIME]); + if (!admitted_time) + return -EINVAL; + } + + wdev_lock(wdev); + switch (wdev->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (wdev->current_bss) + break; + err = -ENOTCONN; + goto out; + default: + err = -EOPNOTSUPP; + goto out; + } + + err = rdev_add_tx_ts(rdev, dev, tsid, peer, up, admitted_time); + + out: + wdev_unlock(wdev); + return err; +} + +static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *peer; + u8 tsid; + int err; + + if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + tsid = nla_get_u8(info->attrs[NL80211_ATTR_TSID]); + peer = nla_data(info->attrs[NL80211_ATTR_MAC]); + + wdev_lock(wdev); + err = rdev_del_tx_ts(rdev, dev, tsid, peer); + wdev_unlock(wdev); + + return err; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -10147,6 +10240,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_ADD_TX_TS, + .doit = nl80211_add_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_DEL_TX_TS, + .doit = nl80211_del_tx_ts, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 56c2240c30ce..f6d457d6a558 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -915,4 +915,35 @@ rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_add_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer, + u8 user_prio, u16 admitted_time) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + if (rdev->ops->add_tx_ts) + ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, + user_prio, admitted_time); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_del_tx_ts(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 tsid, const u8 *peer) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); + if (rdev->ops->del_tx_ts) + ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 0c524cd76c83..625a6e6d1168 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1896,6 +1896,51 @@ TRACE_EVENT(rdev_set_ap_chanwidth, WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(rdev_add_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time), + TP_ARGS(wiphy, netdev, tsid, peer, user_prio, admitted_time), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + __field(u8, user_prio) + __field(u16, admitted_time) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + __entry->user_prio = user_prio; + __entry->admitted_time = admitted_time; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d, UP %d, time %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), + __entry->tsid, __entry->user_prio, __entry->admitted_time) +); + +TRACE_EVENT(rdev_del_tx_ts, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 tsid, const u8 *peer), + TP_ARGS(wiphy, netdev, tsid, peer), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(peer) + __field(u8, tsid) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(peer, peer); + __entry->tsid = tsid; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ", TSID %d", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3 From 18998c381b19bfc3c285361ff6200ded7444aa2c Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Sep 2014 14:07:34 +0300 Subject: cfg80211: allow requesting SMPS mode on ap start Add feature bits to indicate device support for static-smps and dynamic-smps modes. Add a new NL80211_ATTR_SMPS_MODE attribue to allow configuring the smps mode to be used by the ap (e.g. configuring to ap to dynamic smps mode will reduce power consumption while having minor effect on throughput) Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 33 +++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 24 ++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2ed1f80f7eb6..a2ddcf2398fd 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -664,6 +664,7 @@ struct cfg80211_acl_data { * @crypto: crypto settings * @privacy: the BSS uses privacy * @auth_type: Authentication type (algorithm) + * @smps_mode: SMPS mode * @inactivity_timeout: time in seconds to determine station's inactivity. * @p2p_ctwindow: P2P CT Window * @p2p_opp_ps: P2P opportunistic PS @@ -682,6 +683,7 @@ struct cfg80211_ap_settings { struct cfg80211_crypto_settings crypto; bool privacy; enum nl80211_auth_type auth_type; + enum nl80211_smps_mode smps_mode; int inactivity_timeout; u8 p2p_ctwindow; bool p2p_opp_ps; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e5b8caf5e466..4b28dc07bcb1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1635,6 +1635,9 @@ enum nl80211_commands { * @NL80211_ATTR_ADMITTED_TIME: admitted time in units of 32 microseconds * (per second) (u16 attribute) * + * @NL80211_ATTR_SMPS_MODE: SMPS mode to use (ap mode). see + * &enum nl80211_smps_mode. + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -1985,6 +1988,8 @@ enum nl80211_attrs { NL80211_ATTR_USER_PRIO, NL80211_ATTR_ADMITTED_TIME, + NL80211_ATTR_SMPS_MODE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4030,6 +4035,13 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_ACKTO_ESTIMATION: This driver supports dynamic ACK timeout * estimation (dynack). %NL80211_ATTR_WIPHY_DYN_ACK flag attribute is used * to enable dynack. + * @NL80211_FEATURE_STATIC_SMPS: Device supports static spatial + * multiplexing powersave, ie. can turn off all but one chain + * even on HT connections that should be using more chains. + * @NL80211_FEATURE_DYNAMIC_SMPS: Device supports dynamic spatial + * multiplexing powersave, ie. can turn off all but one chain + * and then wake the rest up as required after, for example, + * rts/cts handshake. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4056,6 +4068,8 @@ enum nl80211_feature_flags { NL80211_FEATURE_QUIET = 1 << 21, NL80211_FEATURE_TX_POWER_INSERTION = 1 << 22, NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, + NL80211_FEATURE_STATIC_SMPS = 1 << 24, + NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, }; /** @@ -4129,6 +4143,25 @@ enum nl80211_acl_policy { NL80211_ACL_POLICY_DENY_UNLESS_LISTED, }; +/** + * enum nl80211_smps_mode - SMPS mode + * + * Requested SMPS mode (for AP mode) + * + * @NL80211_SMPS_OFF: SMPS off (use all antennas). + * @NL80211_SMPS_STATIC: static SMPS (use a single antenna) + * @NL80211_SMPS_DYNAMIC: dynamic smps (start with a single antenna and + * turn on other antennas after CTS/RTS). + */ +enum nl80211_smps_mode { + NL80211_SMPS_OFF, + NL80211_SMPS_STATIC, + NL80211_SMPS_DYNAMIC, + + __NL80211_SMPS_AFTER_LAST, + NL80211_SMPS_MAX = __NL80211_SMPS_AFTER_LAST - 1 +}; + /** * enum nl80211_radar_event - type of radar event for DFS operation * diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e8114c7a37e3..4cce3e17964d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -394,6 +394,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_TSID] = { .type = NLA_U8 }, [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, + [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -3345,6 +3346,29 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(params.acl); } + if (info->attrs[NL80211_ATTR_SMPS_MODE]) { + params.smps_mode = + nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); + switch (params.smps_mode) { + case NL80211_SMPS_OFF: + break; + case NL80211_SMPS_STATIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_STATIC_SMPS)) + return -EINVAL; + break; + case NL80211_SMPS_DYNAMIC: + if (!(rdev->wiphy.features & + NL80211_FEATURE_DYNAMIC_SMPS)) + return -EINVAL; + break; + default: + return -EINVAL; + } + } else { + params.smps_mode = NL80211_SMPS_OFF; + } + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { -- cgit v1.2.3 From 39e393bb4f653d38aea40190e1aa9a49062eed4d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 11 Sep 2014 11:02:39 +0200 Subject: netfilter: nf_tables: add NFTA_MASQ_UNSPEC to nft_masq_attributes To keep this consistent with other nft_*_attributes. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index eeec0ae845ef..66d66dd3ff79 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -806,6 +806,7 @@ enum nft_nat_attributes { * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) */ enum nft_masq_attributes { + NFTA_MASQ_UNSPEC, NFTA_MASQ_FLAGS, __NFTA_MASQ_MAX }; -- cgit v1.2.3 From e351943b081f4d9e6f692ce1a6117e8d2e71f478 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Fri, 5 Sep 2014 13:19:59 -0400 Subject: drm/vmwgfx: Fix drm.h include The userspace drm.h include doesn't prefix the drm directory. This can lead to compile failures as /usr/include/drm/ isn't in the standard gcc include paths. Fix it to be , which matches the rest of the driver drm header files that get installed into /usr/include/drm. Red Hat Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1138759 Fixes: 1d7a5cbf8f74e Reported-by: Jeffrey Bastian Signed-off-by: Josh Boyer Signed-off-by: Dave Airlie --- include/uapi/drm/vmwgfx_drm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/vmwgfx_drm.h b/include/uapi/drm/vmwgfx_drm.h index 4fc66f6b12ce..c472bedbe38e 100644 --- a/include/uapi/drm/vmwgfx_drm.h +++ b/include/uapi/drm/vmwgfx_drm.h @@ -29,7 +29,7 @@ #define __VMWGFX_DRM_H__ #ifndef __KERNEL__ -#include +#include #endif #define DRM_VMW_MAX_SURFACE_FACES 6 -- cgit v1.2.3 From 0e9871e3f79fd17c691b50a9669220c54ff084a2 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Thu, 28 Aug 2014 10:11:27 +0400 Subject: netfilter: ipset: Add skbinfo extension kernel support in the ipset core. Skbinfo extension provides mapping of metainformation with lookup in the ipset tables. This patch defines the flags, the constants, the functions and the structures for the data type independent support of the extension. Note the firewall mark stores in the kernel structures as two 32bit values, but transfered through netlink as one 64bit value. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 56 ++++++++++++++++++++++++++++- include/uapi/linux/netfilter/ipset/ip_set.h | 12 +++++++ net/netfilter/ipset/ip_set_core.c | 27 +++++++++++++- 3 files changed, 93 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 96afc29184be..b97aac5142ed 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -57,6 +57,8 @@ enum ip_set_extension { IPSET_EXT_COUNTER = (1 << IPSET_EXT_BIT_COUNTER), IPSET_EXT_BIT_COMMENT = 2, IPSET_EXT_COMMENT = (1 << IPSET_EXT_BIT_COMMENT), + IPSET_EXT_BIT_SKBINFO = 3, + IPSET_EXT_SKBINFO = (1 << IPSET_EXT_BIT_SKBINFO), /* Mark set with an extension which needs to call destroy */ IPSET_EXT_BIT_DESTROY = 7, IPSET_EXT_DESTROY = (1 << IPSET_EXT_BIT_DESTROY), @@ -65,12 +67,14 @@ enum ip_set_extension { #define SET_WITH_TIMEOUT(s) ((s)->extensions & IPSET_EXT_TIMEOUT) #define SET_WITH_COUNTER(s) ((s)->extensions & IPSET_EXT_COUNTER) #define SET_WITH_COMMENT(s) ((s)->extensions & IPSET_EXT_COMMENT) +#define SET_WITH_SKBINFO(s) ((s)->extensions & IPSET_EXT_SKBINFO) #define SET_WITH_FORCEADD(s) ((s)->flags & IPSET_CREATE_FLAG_FORCEADD) /* Extension id, in size order */ enum ip_set_ext_id { IPSET_EXT_ID_COUNTER = 0, IPSET_EXT_ID_TIMEOUT, + IPSET_EXT_ID_SKBINFO, IPSET_EXT_ID_COMMENT, IPSET_EXT_ID_MAX, }; @@ -92,6 +96,10 @@ struct ip_set_ext { u64 packets; u64 bytes; u32 timeout; + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; char *comment; }; @@ -104,6 +112,13 @@ struct ip_set_comment { char *str; }; +struct ip_set_skbinfo { + u32 skbmark; + u32 skbmarkmask; + u32 skbprio; + u16 skbqueue; +}; + struct ip_set; #define ext_timeout(e, s) \ @@ -112,7 +127,8 @@ struct ip_set; (struct ip_set_counter *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COUNTER]) #define ext_comment(e, s) \ (struct ip_set_comment *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_COMMENT]) - +#define ext_skbinfo(e, s) \ +(struct ip_set_skbinfo *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_SKBINFO]) typedef int (*ipset_adtfn)(struct ip_set *set, void *value, const struct ip_set_ext *ext, @@ -256,6 +272,8 @@ ip_set_put_flags(struct sk_buff *skb, struct ip_set *set) cadt_flags |= IPSET_FLAG_WITH_COUNTERS; if (SET_WITH_COMMENT(set)) cadt_flags |= IPSET_FLAG_WITH_COMMENT; + if (SET_WITH_SKBINFO(set)) + cadt_flags |= IPSET_FLAG_WITH_SKBINFO; if (SET_WITH_FORCEADD(set)) cadt_flags |= IPSET_FLAG_WITH_FORCEADD; @@ -304,6 +322,39 @@ ip_set_update_counter(struct ip_set_counter *counter, } } +static inline void +ip_set_get_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags) +{ + mext->skbmark = skbinfo->skbmark; + mext->skbmarkmask = skbinfo->skbmarkmask; + mext->skbprio = skbinfo->skbprio; + mext->skbqueue = skbinfo->skbqueue; +} +static inline bool +ip_set_put_skbinfo(struct sk_buff *skb, struct ip_set_skbinfo *skbinfo) +{ + return nla_put_net64(skb, IPSET_ATTR_SKBMARK, + cpu_to_be64((u64)skbinfo->skbmark << 32 | + skbinfo->skbmarkmask)) || + nla_put_net32(skb, IPSET_ATTR_SKBPRIO, + cpu_to_be32(skbinfo->skbprio)) || + nla_put_net16(skb, IPSET_ATTR_SKBQUEUE, + cpu_to_be16(skbinfo->skbqueue)); + +} + +static inline void +ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, + const struct ip_set_ext *ext) +{ + skbinfo->skbmark = ext->skbmark; + skbinfo->skbmarkmask = ext->skbmarkmask; + skbinfo->skbprio = ext->skbprio; + skbinfo->skbqueue = ext->skbqueue; +} + static inline bool ip_set_put_counter(struct sk_buff *skb, struct ip_set_counter *counter) { @@ -497,6 +548,9 @@ ip_set_put_extensions(struct sk_buff *skb, const struct ip_set *set, if (SET_WITH_COMMENT(set) && ip_set_put_comment(skb, ext_comment(e, set))) return -EMSGSIZE; + if (SET_WITH_SKBINFO(set) && + ip_set_put_skbinfo(skb, ext_skbinfo(e, set))) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 78c2f2e79920..ca03119111a2 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -115,6 +115,9 @@ enum { IPSET_ATTR_BYTES, IPSET_ATTR_PACKETS, IPSET_ATTR_COMMENT, + IPSET_ATTR_SKBMARK, + IPSET_ATTR_SKBPRIO, + IPSET_ATTR_SKBQUEUE, __IPSET_ATTR_ADT_MAX, }; #define IPSET_ATTR_ADT_MAX (__IPSET_ATTR_ADT_MAX - 1) @@ -147,6 +150,7 @@ enum ipset_errno { IPSET_ERR_COUNTER, IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, + IPSET_ERR_SKBINFO, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, @@ -170,6 +174,12 @@ enum ipset_cmd_flags { IPSET_FLAG_MATCH_COUNTERS = (1 << IPSET_FLAG_BIT_MATCH_COUNTERS), IPSET_FLAG_BIT_RETURN_NOMATCH = 7, IPSET_FLAG_RETURN_NOMATCH = (1 << IPSET_FLAG_BIT_RETURN_NOMATCH), + IPSET_FLAG_BIT_MAP_SKBMARK = 8, + IPSET_FLAG_MAP_SKBMARK = (1 << IPSET_FLAG_BIT_MAP_SKBMARK), + IPSET_FLAG_BIT_MAP_SKBPRIO = 9, + IPSET_FLAG_MAP_SKBPRIO = (1 << IPSET_FLAG_BIT_MAP_SKBPRIO), + IPSET_FLAG_BIT_MAP_SKBQUEUE = 10, + IPSET_FLAG_MAP_SKBQUEUE = (1 << IPSET_FLAG_BIT_MAP_SKBQUEUE), IPSET_FLAG_CMD_MAX = 15, }; @@ -187,6 +197,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_COMMENT = (1 << IPSET_FLAG_BIT_WITH_COMMENT), IPSET_FLAG_BIT_WITH_FORCEADD = 5, IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), + IPSET_FLAG_BIT_WITH_SKBINFO = 6, + IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), IPSET_FLAG_CADT_MAX = 15, }; diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 4ca4e5ca6f57..26c795e6b57f 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -337,6 +337,12 @@ const struct ip_set_ext_type ip_set_extensions[] = { .len = sizeof(unsigned long), .align = __alignof__(unsigned long), }, + [IPSET_EXT_ID_SKBINFO] = { + .type = IPSET_EXT_SKBINFO, + .flag = IPSET_FLAG_WITH_SKBINFO, + .len = sizeof(struct ip_set_skbinfo), + .align = __alignof__(struct ip_set_skbinfo), + }, [IPSET_EXT_ID_COMMENT] = { .type = IPSET_EXT_COMMENT | IPSET_EXT_DESTROY, .flag = IPSET_FLAG_WITH_COMMENT, @@ -382,6 +388,7 @@ int ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], struct ip_set_ext *ext) { + u64 fullmark; if (tb[IPSET_ATTR_TIMEOUT]) { if (!(set->extensions & IPSET_EXT_TIMEOUT)) return -IPSET_ERR_TIMEOUT; @@ -402,7 +409,25 @@ ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[], return -IPSET_ERR_COMMENT; ext->comment = ip_set_comment_uget(tb[IPSET_ATTR_COMMENT]); } - + if (tb[IPSET_ATTR_SKBMARK]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + fullmark = be64_to_cpu(nla_get_be64(tb[IPSET_ATTR_SKBMARK])); + ext->skbmark = fullmark >> 32; + ext->skbmarkmask = fullmark & 0xffffffff; + } + if (tb[IPSET_ATTR_SKBPRIO]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbprio = be32_to_cpu(nla_get_be32( + tb[IPSET_ATTR_SKBPRIO])); + } + if (tb[IPSET_ATTR_SKBQUEUE]) { + if (!(set->extensions & IPSET_EXT_SKBINFO)) + return -IPSET_ERR_SKBINFO; + ext->skbqueue = be16_to_cpu(nla_get_be16( + tb[IPSET_ATTR_SKBQUEUE])); + } return 0; } EXPORT_SYMBOL_GPL(ip_set_get_extensions); -- cgit v1.2.3 From 76cea4109ca89dea218fdc652d2e1535fd9b5fc7 Mon Sep 17 00:00:00 2001 From: Anton Danilov Date: Tue, 2 Sep 2014 14:21:20 +0400 Subject: netfilter: ipset: Add skbinfo extension support to SET target. Signed-off-by: Anton Danilov Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/xt_set.h | 10 +++ net/netfilter/xt_set.c | 155 ++++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_set.h b/include/uapi/linux/netfilter/xt_set.h index 964d3d42f874..d6a1df1f2947 100644 --- a/include/uapi/linux/netfilter/xt_set.h +++ b/include/uapi/linux/netfilter/xt_set.h @@ -71,4 +71,14 @@ struct xt_set_info_match_v3 { __u32 flags; }; +/* Revision 3 target */ + +struct xt_set_info_target_v3 { + struct xt_set_info add_set; + struct xt_set_info del_set; + struct xt_set_info map_set; + __u32 flags; + __u32 timeout; +}; + #endif /*_XT_SET_H*/ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index cb70f6ec5695..5732cd64acc0 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -366,6 +366,140 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par) #define set_target_v2_checkentry set_target_v1_checkentry #define set_target_v2_destroy set_target_v1_destroy +/* Revision 3 target */ + +static unsigned int +set_target_v3(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ADT_OPT(add_opt, par->family, info->add_set.dim, + info->add_set.flags, info->flags, info->timeout); + ADT_OPT(del_opt, par->family, info->del_set.dim, + info->del_set.flags, 0, UINT_MAX); + ADT_OPT(map_opt, par->family, info->map_set.dim, + info->map_set.flags, 0, UINT_MAX); + + int ret; + + /* Normalize to fit into jiffies */ + if (add_opt.ext.timeout != IPSET_NO_TIMEOUT && + add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC) + add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC; + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_add(info->add_set.index, skb, par, &add_opt); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_del(info->del_set.index, skb, par, &del_opt); + if (info->map_set.index != IPSET_INVALID_ID) { + map_opt.cmdflags |= info->flags & (IPSET_FLAG_MAP_SKBMARK | + IPSET_FLAG_MAP_SKBPRIO | + IPSET_FLAG_MAP_SKBQUEUE); + ret = match_set(info->map_set.index, skb, par, &map_opt, + info->map_set.flags & IPSET_INV_MATCH); + if (!ret) + return XT_CONTINUE; + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBMARK) + skb->mark = (skb->mark & ~(map_opt.ext.skbmarkmask)) + ^ (map_opt.ext.skbmark); + if (map_opt.cmdflags & IPSET_FLAG_MAP_SKBPRIO) + skb->priority = map_opt.ext.skbprio; + if ((map_opt.cmdflags & IPSET_FLAG_MAP_SKBQUEUE) && + skb->dev && + skb->dev->real_num_tx_queues > map_opt.ext.skbqueue) + skb_set_queue_mapping(skb, map_opt.ext.skbqueue); + } + return XT_CONTINUE; +} + + +static int +set_target_v3_checkentry(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + ip_set_id_t index; + + if (info->add_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->add_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find add_set index %u as target\n", + info->add_set.index); + return -ENOENT; + } + } + + if (info->del_set.index != IPSET_INVALID_ID) { + index = ip_set_nfnl_get_byindex(par->net, + info->del_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find del_set index %u as target\n", + info->del_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + return -ENOENT; + } + } + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_warn("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + !(par->hook_mask & (1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_warn("mapping of prio or/and queue is allowed only" + "from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + index = ip_set_nfnl_get_byindex(par->net, + info->map_set.index); + if (index == IPSET_INVALID_ID) { + pr_warn("Cannot find map_set index %u as target\n", + info->map_set.index); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, + info->del_set.index); + return -ENOENT; + } + } + + if (info->add_set.dim > IPSET_DIM_MAX || + info->del_set.dim > IPSET_DIM_MAX || + info->map_set.dim > IPSET_DIM_MAX) { + pr_warn("Protocol error: SET target dimension " + "is over the limit!\n"); + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); + return -ERANGE; + } + + return 0; +} + +static void +set_target_v3_destroy(const struct xt_tgdtor_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->add_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->add_set.index); + if (info->del_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->del_set.index); + if (info->map_set.index != IPSET_INVALID_ID) + ip_set_nfnl_put(par->net, info->map_set.index); +} + + static struct xt_match set_matches[] __read_mostly = { { .name = "set", @@ -493,6 +627,27 @@ static struct xt_target set_targets[] __read_mostly = { .destroy = set_target_v2_destroy, .me = THIS_MODULE }, + /* --map-set support */ + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV4, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, + { + .name = "SET", + .revision = 3, + .family = NFPROTO_IPV6, + .target = set_target_v3, + .targetsize = sizeof(struct xt_set_info_target_v3), + .checkentry = set_target_v3_checkentry, + .destroy = set_target_v3_destroy, + .me = THIS_MODULE + }, }; static int __init xt_set_init(void) -- cgit v1.2.3 From 6cff339bbd5f9eda7a5e8a521f91a88d046e6d0c Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Tue, 9 Sep 2014 16:40:20 -0700 Subject: ipvs: Add destination address family to netlink interface This is necessary to support heterogeneous pools. For example, if you have an ipv6 addressed network, you'll want to be able to forward ipv4 traffic into it. This patch enforces that destination address family is the same as service family, as none of the forwarding mechanisms support anything else. For the old setsockopt mechanism, we simply set the dest address family to AF_INET as we do with the service. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 3 +++ include/uapi/linux/ip_vs.h | 3 +++ net/netfilter/ipvs/ip_vs_ctl.c | 45 +++++++++++++++++++++++++++++++++++------- 3 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 624a8a54806d..b7e2b624d58e 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -648,6 +648,9 @@ struct ip_vs_dest_user_kern { /* thresholds for active connections */ u32 u_threshold; /* upper threshold */ u32 l_threshold; /* lower threshold */ + + /* Address family of addr */ + u16 af; }; diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index fbcffe8041f7..cabe95d5b461 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -384,6 +384,9 @@ enum { IPVS_DEST_ATTR_PERSIST_CONNS, /* persistent connections */ IPVS_DEST_ATTR_STATS, /* nested attribute for dest stats */ + + IPVS_DEST_ATTR_ADDR_FAMILY, /* Address family of address */ + __IPVS_DEST_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index bd2b208ba56c..594cec7ebc43 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -816,6 +816,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + dest->af = udest->af; + spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); @@ -846,8 +848,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, EnterFunction(2); + /* Temporary for consistency */ + if (udest->af != svc->af) + return -EINVAL; + #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { + if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -875,12 +881,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, u64_stats_init(&ip_vs_dest_stats->syncp); } - dest->af = svc->af; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; - ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); @@ -928,7 +934,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -949,7 +955,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", - IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), @@ -992,7 +998,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ERANGE; } - ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); @@ -2244,6 +2250,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; + udest->af = AF_INET; } static int @@ -2480,6 +2487,12 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, if (count >= get->num_dests) break; + /* Cannot expose heterogeneous members via sockopt + * interface + */ + if (dest->af != svc->af) + continue; + entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); @@ -2777,6 +2790,7 @@ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, + [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, @@ -3032,7 +3046,8 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, - atomic_read(&dest->persistconns))) + atomic_read(&dest->persistconns)) || + nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) goto nla_put_failure; @@ -3113,6 +3128,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; + struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || @@ -3121,6 +3137,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; + nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; @@ -3130,6 +3147,11 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); + if (nla_addr_family) + udest->af = nla_get_u16(nla_addr_family); + else + udest->af = 0; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, @@ -3357,6 +3379,15 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) need_full_dest); if (ret) goto out; + + /* Old protocols did not allow the user to specify address + * family, so we set it to zero instead. We also didn't + * allow heterogeneous pools in the old code, so it's safe + * to assume that this will have the same address family as + * the service. + */ + if (udest.af == 0) + udest.af = svc->af; } switch (cmd) { -- cgit v1.2.3 From 971427f353f3c42c8dcef62e7124440df68eb809 Mon Sep 17 00:00:00 2001 From: Andy Zhou Date: Mon, 15 Sep 2014 19:37:25 -0700 Subject: openvswitch: Add recirc and hash action. Recirc action allows a packet to reenter openvswitch processing. currently openvswitch lookup flow for packet received and execute set of actions on that packet, with help of recirc action we can process/modify the packet and recirculate it back in openvswitch for another pass. OVS hash action calculates 5-tupple hash and set hash in flow-key hash. This can be used along with recirculation for distributing packets among different ports for bond devices. For example: OVS bonding can use following actions: Match on: bond flow; Action: hash, recirc(id) Match on: recirc-id == id and hash lower bits == a; Action: output port_bond_a Signed-off-by: Andy Zhou Acked-by: Jesse Gross Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 26 +++++ net/openvswitch/actions.c | 203 +++++++++++++++++++++++++++++++++++++-- net/openvswitch/datapath.c | 11 ++- net/openvswitch/datapath.h | 7 +- net/openvswitch/flow.c | 7 +- net/openvswitch/flow.h | 3 + net/openvswitch/flow_netlink.c | 43 ++++++++- 7 files changed, 288 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index a794d1dd7b40..f7fc507d82ab 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -289,6 +289,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags. */ + OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash + is not computed by the datapath. */ + OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ @@ -493,6 +496,27 @@ struct ovs_action_push_vlan { __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; +/* Data path hash algorithm for computing Datapath hash. + * + * The algorithm type only specifies the fields in a flow + * will be used as part of the hash. Each datapath is free + * to use its own hash algorithm. The hash value will be + * opaque to the user space daemon. + */ +enum ovs_hash_alg { + OVS_HASH_ALG_L4, +}; + +/* + * struct ovs_action_hash - %OVS_ACTION_ATTR_HASH action argument. + * @hash_alg: Algorithm used to compute hash prior to recirculation. + * @hash_basis: basis used for computing hash. + */ +struct ovs_action_hash { + uint32_t hash_alg; /* One of ovs_hash_alg. */ + uint32_t hash_basis; +}; + /** * enum ovs_action_attr - Action types. * @@ -521,6 +545,8 @@ enum ovs_action_attr { OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ + OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ + OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ __OVS_ACTION_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c31bb80c984f..6932a42e41a2 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -35,12 +35,78 @@ #include #include "datapath.h" +#include "flow.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len); +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} + static int make_writable(struct sk_buff *skb, int write_len) { if (!pskb_may_pull(skb, write_len)) @@ -485,8 +551,29 @@ static int sample(struct datapath *dp, struct sk_buff *skb, /* Skip the sample action when out of memory. */ return 0; - /* do_execute_actions() will consume the cloned skb. */ - return do_execute_actions(dp, skb, key, a, rem); + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; } static int execute_set_action(struct sk_buff *skb, @@ -535,6 +622,44 @@ static int execute_set_action(struct sk_buff *skb, return err; } +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + + if (!last_action(a, rem)) { + /* Recirc action is the not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, @@ -566,6 +691,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, key, a); break; + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -576,6 +705,17 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = pop_vlan(skb); break; + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (last_action(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + case OVS_ACTION_ATTR_SET: err = execute_set_action(skb, nla_data(a)); break; @@ -601,12 +741,63 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return 0; } +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + /* Execute a list of actions against 'skb'. */ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key) { - struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + int level = this_cpu_read(exec_actions_level); + struct sw_flow_actions *acts; + int err; + + acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + + this_cpu_inc(exec_actions_level); + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); + + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; - return do_execute_actions(dp, skb, key, - acts->actions, acts->actions_len); + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 7e0819919b8a..16cad14fa81e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -156,7 +156,7 @@ static struct datapath *get_dp(struct net *net, int dp_ifindex) } /* Must be called with rcu_read_lock or ovs_mutex. */ -static const char *ovs_dp_name(const struct datapath *dp) +const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); return vport->ops->get_name(vport); @@ -2065,10 +2065,14 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = ovs_internal_dev_rtnl_link_register(); + err = action_fifos_init(); if (err) goto error; + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + err = ovs_flow_init(); if (err) goto error_unreg_rtnl_link; @@ -2101,6 +2105,8 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); error: return err; } @@ -2114,6 +2120,7 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 25b0e888cb27..ac3f3df96961 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2012 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -189,13 +189,18 @@ void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct dp_upcall_info *); +const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *); + void ovs_dp_notify_wq(struct work_struct *work); +int action_fifos_init(void); +void action_fifos_exit(void); + #define OVS_NLERR(fmt, ...) \ do { \ if (net_ratelimit()) \ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bf8442071d75..4010423f2831 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -606,6 +606,11 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return 0; } +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 3869a540365c..0f5db4ec565d 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -72,6 +72,8 @@ struct sw_flow_key { u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ } __packed phy; /* Safe when right after 'tun_key'. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ @@ -187,6 +189,7 @@ void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 630b320fbf3e..f4c8daa73965 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2013 Nicira, Inc. + * Copyright (c) 2007-2014 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -251,6 +251,8 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), + [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), + [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, }; @@ -454,6 +456,20 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { SW_FLOW_KEY_PUT(match, phy.priority, nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); @@ -873,6 +889,12 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, struct nlattr *nla, *encap; bool is_mask = (swkey != output); + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; @@ -1401,11 +1423,13 @@ int ovs_nla_copy_actions(const struct nlattr *attr, /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -1432,6 +1456,18 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } case OVS_ACTION_ATTR_POP_VLAN: break; @@ -1444,6 +1480,9 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_RECIRC: + break; + case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, &skip_copy); if (err) -- cgit v1.2.3 From 1b0bf88fd8b845aef4300c7c0feca774265dd1c4 Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Tue, 9 Sep 2014 08:23:17 +0200 Subject: usb: gadget: f_fs: virtual endpoint address mapping This patch introduces virtual endpoint address mapping. It separates function logic form physical endpoint addresses making it more hardware independent. Following modifications changes user space API, so to enable them user have to switch on the FUNCTIONFS_VIRTUAL_ADDR flag in descriptors. Endpoints are now refered using virtual endpoint addresses chosen by user in endpoint descpriptors. This applies to each context when endpoint address can be used: - when accessing endpoint files in FunctionFS filesystemi (in file name), - in setup requests directed to specific endpoint (in wIndex field), - in descriptors returned by FUNCTIONFS_ENDPOINT_DESC ioctl. In endpoint file names the endpoint address number is formatted as double-digit hexadecimal value ("ep%02x") which has few advantages - it is easy to parse, allows to easly recognize endpoint direction basing on its name (IN endpoint number starts with digit 8, and OUT with 0) which can be useful for debugging purpose, and it makes easier to introduce further features allowing to use each endpoint number in both directions to have more endpoints available for function if hardware supports this (for example we could have ep01 which is endpoint 1 with OUT direction, and ep81 which is endpoint 1 with IN direction). Physical endpoint address can be still obtained using ioctl named FUNCTIONFS_ENDPOINT_REVMAP, but now it's not neccesary to handle USB transactions properly. Signed-off-by: Robert Baldyga Acked-by: Michal Nazarewicz Signed-off-by: Felipe Balbi --- drivers/usb/gadget/function/f_fs.c | 23 +++++++++++++++++++++-- drivers/usb/gadget/function/u_fs.h | 2 ++ include/uapi/linux/usb/functionfs.h | 1 + 3 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index e4afe8df5553..4ad11e03cf54 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1557,7 +1557,10 @@ static int ffs_epfiles_create(struct ffs_data *ffs) epfile->ffs = ffs; mutex_init(&epfile->mutex); init_waitqueue_head(&epfile->wait); - sprintf(epfiles->name, "ep%u", i); + if (ffs->user_flags & FUNCTIONFS_VIRTUAL_ADDR) + sprintf(epfiles->name, "ep%02x", ffs->eps_addrmap[i]); + else + sprintf(epfiles->name, "ep%u", i); if (!unlikely(ffs_sb_create_file(ffs->sb, epfiles->name, epfile, &ffs_epfile_operations, &epfile->dentry))) { @@ -2106,10 +2109,12 @@ static int __ffs_data_got_descs(struct ffs_data *ffs, break; case FUNCTIONFS_DESCRIPTORS_MAGIC_V2: flags = get_unaligned_le32(data + 8); + ffs->user_flags = flags; if (flags & ~(FUNCTIONFS_HAS_FS_DESC | FUNCTIONFS_HAS_HS_DESC | FUNCTIONFS_HAS_SS_DESC | - FUNCTIONFS_HAS_MS_OS_DESC)) { + FUNCTIONFS_HAS_MS_OS_DESC | + FUNCTIONFS_VIRTUAL_ADDR)) { ret = -ENOSYS; goto error; } @@ -2466,7 +2471,13 @@ static int __ffs_func_bind_do_descs(enum ffs_entity_type type, u8 *valuep, } else { struct usb_request *req; struct usb_ep *ep; + u8 bEndpointAddress; + /* + * We back up bEndpointAddress because autoconfig overwrites + * it with physical endpoint address. + */ + bEndpointAddress = ds->bEndpointAddress; pr_vdebug("autoconfig\n"); ep = usb_ep_autoconfig(func->gadget, ds); if (unlikely(!ep)) @@ -2481,6 +2492,12 @@ static int __ffs_func_bind_do_descs(enum ffs_entity_type type, u8 *valuep, ffs_ep->req = req; func->eps_revmap[ds->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK] = idx + 1; + /* + * If we use virtual address mapping, we restore + * original bEndpointAddress value. + */ + if (func->ffs->user_flags & FUNCTIONFS_VIRTUAL_ADDR) + ds->bEndpointAddress = bEndpointAddress; } ffs_dump_mem(": Rewritten ep desc", ds, ds->bLength); @@ -2925,6 +2942,8 @@ static int ffs_func_setup(struct usb_function *f, ret = ffs_func_revmap_ep(func, le16_to_cpu(creq->wIndex)); if (unlikely(ret < 0)) return ret; + if (func->ffs->user_flags & FUNCTIONFS_VIRTUAL_ADDR) + ret = func->ffs->eps_addrmap[ret]; break; default: diff --git a/drivers/usb/gadget/function/u_fs.h b/drivers/usb/gadget/function/u_fs.h index d48897e8ffeb..cd128e31f808 100644 --- a/drivers/usb/gadget/function/u_fs.h +++ b/drivers/usb/gadget/function/u_fs.h @@ -224,6 +224,8 @@ struct ffs_data { void *ms_os_descs_ext_prop_name_avail; void *ms_os_descs_ext_prop_data_avail; + unsigned user_flags; + u8 eps_addrmap[15]; unsigned short strings_count; diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index 7c7a2feb0e6e..295ba299e7bd 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -19,6 +19,7 @@ enum functionfs_flags { FUNCTIONFS_HAS_HS_DESC = 2, FUNCTIONFS_HAS_SS_DESC = 4, FUNCTIONFS_HAS_MS_OS_DESC = 8, + FUNCTIONFS_VIRTUAL_ADDR = 16, }; /* Descriptor of an non-audio endpoint */ -- cgit v1.2.3 From d60eacb07053142bfb9b41582074a89a790a9d46 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 2 Sep 2014 10:27:33 +0100 Subject: KVM: device: add simple registration mechanism for kvm_device_ops kvm_ioctl_create_device currently has knowledge of all the device types and their associated ops. This is fairly inflexible when adding support for new in-kernel device emulations, so move what we currently have out into a table, which can support dynamic registration of ops by new drivers for virtual hardware. Cc: Alex Williamson Cc: Alex Graf Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Marc Zyngier Acked-by: Cornelia Huck Reviewed-by: Christoffer Dall Signed-off-by: Will Deacon Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 22 +++++++++++----- virt/kvm/kvm_main.c | 65 ++++++++++++++++++++++++++++-------------------- 3 files changed, 55 insertions(+), 33 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e098dce179df..b6e954742951 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1064,6 +1064,7 @@ struct kvm_device_ops { void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0695a1e3e332..60768822b140 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -943,15 +943,25 @@ struct kvm_device_attr { __u64 addr; /* userspace address of attr data */ }; -#define KVM_DEV_TYPE_FSL_MPIC_20 1 -#define KVM_DEV_TYPE_FSL_MPIC_42 2 -#define KVM_DEV_TYPE_XICS 3 -#define KVM_DEV_TYPE_VFIO 4 #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 -#define KVM_DEV_TYPE_ARM_VGIC_V2 5 -#define KVM_DEV_TYPE_FLIC 6 + +enum kvm_device_type { + KVM_DEV_TYPE_FSL_MPIC_20 = 1, +#define KVM_DEV_TYPE_FSL_MPIC_20 KVM_DEV_TYPE_FSL_MPIC_20 + KVM_DEV_TYPE_FSL_MPIC_42, +#define KVM_DEV_TYPE_FSL_MPIC_42 KVM_DEV_TYPE_FSL_MPIC_42 + KVM_DEV_TYPE_XICS, +#define KVM_DEV_TYPE_XICS KVM_DEV_TYPE_XICS + KVM_DEV_TYPE_VFIO, +#define KVM_DEV_TYPE_VFIO KVM_DEV_TYPE_VFIO + KVM_DEV_TYPE_ARM_VGIC_V2, +#define KVM_DEV_TYPE_ARM_VGIC_V2 KVM_DEV_TYPE_ARM_VGIC_V2 + KVM_DEV_TYPE_FLIC, +#define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC + KVM_DEV_TYPE_MAX, +}; /* * ioctls for VM fds diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c338599804e0..686d783387a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2272,44 +2272,55 @@ struct kvm_device *kvm_device_from_filp(struct file *filp) return filp->private_data; } -static int kvm_ioctl_create_device(struct kvm *kvm, - struct kvm_create_device *cd) -{ - struct kvm_device_ops *ops = NULL; - struct kvm_device *dev; - bool test = cd->flags & KVM_CREATE_DEVICE_TEST; - int ret; - - switch (cd->type) { +static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { #ifdef CONFIG_KVM_MPIC - case KVM_DEV_TYPE_FSL_MPIC_20: - case KVM_DEV_TYPE_FSL_MPIC_42: - ops = &kvm_mpic_ops; - break; + [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, + [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif + #ifdef CONFIG_KVM_XICS - case KVM_DEV_TYPE_XICS: - ops = &kvm_xics_ops; - break; + [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, #endif + #ifdef CONFIG_KVM_VFIO - case KVM_DEV_TYPE_VFIO: - ops = &kvm_vfio_ops; - break; + [KVM_DEV_TYPE_VFIO] = &kvm_vfio_ops, #endif + #ifdef CONFIG_KVM_ARM_VGIC - case KVM_DEV_TYPE_ARM_VGIC_V2: - ops = &kvm_arm_vgic_v2_ops; - break; + [KVM_DEV_TYPE_ARM_VGIC_V2] = &kvm_arm_vgic_v2_ops, #endif + #ifdef CONFIG_S390 - case KVM_DEV_TYPE_FLIC: - ops = &kvm_flic_ops; - break; + [KVM_DEV_TYPE_FLIC] = &kvm_flic_ops, #endif - default: +}; + +int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) +{ + if (type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENOSPC; + + if (kvm_device_ops_table[type] != NULL) + return -EEXIST; + + kvm_device_ops_table[type] = ops; + return 0; +} + +static int kvm_ioctl_create_device(struct kvm *kvm, + struct kvm_create_device *cd) +{ + struct kvm_device_ops *ops = NULL; + struct kvm_device *dev; + bool test = cd->flags & KVM_CREATE_DEVICE_TEST; + int ret; + + if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) + return -ENODEV; + + ops = kvm_device_ops_table[cd->type]; + if (ops == NULL) return -ENODEV; - } if (test) return 0; -- cgit v1.2.3 From 84d7fce693884897c6196cc98228a2ad56ae2a9a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 4 Sep 2014 14:30:22 +0200 Subject: netfilter: nf_tables: export rule-set generation ID This patch exposes the ruleset generation ID in three ways: 1) The new command NFT_MSG_GETGEN that exposes the 32-bits ruleset generation ID. This ID is incremented in every commit and it should be large enough to avoid wraparound problems. 2) The less significant 16-bits of the generation ID are exposed through the nfgenmsg->res_id header field. This allows us to quickly catch if the ruleset has change between two consecutive list dumps from different object lists (in this specific case I think the risk of wraparound is unlikely). 3) Userspace subscribers may receive notifications of new rule-set generation after every commit. This also provides an alternative way to monitor the generation ID. If the events are lost, the userspace process hits a overrun error, so it knows that it is working with a stale ruleset anyway. Patrick spotted that rule-set transformations in userspace may take quite some time. In that case, it annotates the 32-bits generation ID before fetching the rule-set, then: 1) it compares it to what we obtain after the transformation to make sure it is not working with a stale rule-set and no wraparound has ocurred. 2) it subscribes to ruleset notifications, so it can watch for new generation ID. This is complementary to the NLM_F_DUMP_INTR approach, which allows us to detect an interference in the middle one single list dumping. There is no way to explicitly check that an interference has occurred between two list dumps from the kernel, since it doesn't know how many lists the userspace client is actually going to dump. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 16 ++++ net/netfilter/nf_tables_api.c | 140 +++++++++++++++++++++++++------ 2 files changed, 130 insertions(+), 26 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 66d66dd3ff79..b72ccfeaf865 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -51,6 +51,8 @@ enum nft_verdicts { * @NFT_MSG_NEWSETELEM: create a new set element (enum nft_set_elem_attributes) * @NFT_MSG_GETSETELEM: get a set element (enum nft_set_elem_attributes) * @NFT_MSG_DELSETELEM: delete a set element (enum nft_set_elem_attributes) + * @NFT_MSG_NEWGEN: announce a new generation, only for events (enum nft_gen_attributes) + * @NFT_MSG_GETGEN: get the rule-set generation (enum nft_gen_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -68,6 +70,8 @@ enum nf_tables_msg_types { NFT_MSG_NEWSETELEM, NFT_MSG_GETSETELEM, NFT_MSG_DELSETELEM, + NFT_MSG_NEWGEN, + NFT_MSG_GETGEN, NFT_MSG_MAX, }; @@ -812,4 +816,16 @@ enum nft_masq_attributes { }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) +/** + * enum nft_gen_attributes - nf_tables ruleset generation attributes + * + * @NFTA_GEN_ID: Ruleset generation ID (NLA_U32) + */ +enum nft_gen_attributes { + NFTA_GEN_UNSPEC, + NFTA_GEN_ID, + __NFTA_GEN_MAX +}; +#define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) + #endif /* _LINUX_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 82374601577e..a476b9962155 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -405,9 +405,9 @@ static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { [NFTA_TABLE_FLAGS] = { .type = NLA_U32 }, }; -static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table) +static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -420,7 +420,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || @@ -448,8 +448,8 @@ static int nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_table_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -488,7 +488,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_table_info(skb, + if (nf_tables_fill_table_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, @@ -540,7 +540,7 @@ static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_table_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, family, table); if (err < 0) @@ -914,9 +914,9 @@ nla_put_failure: return -ENOSPC; } -static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, - const struct nft_table *table, +static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, u32 flags, + int family, const struct nft_table *table, const struct nft_chain *chain) { struct nlmsghdr *nlh; @@ -930,7 +930,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; @@ -988,8 +988,8 @@ static int nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; - err = nf_tables_fill_chain_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -1031,7 +1031,8 @@ static int nf_tables_dump_chains(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_chain_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_chain_info(skb, net, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, NLM_F_MULTI, @@ -1090,7 +1091,7 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_chain_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, family, table, chain); if (err < 0) @@ -1647,8 +1648,9 @@ static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { .len = NFT_USERDATA_MAXLEN }, }; -static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, - int event, u32 flags, int family, +static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq, int event, + u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, const struct nft_rule *rule) @@ -1668,7 +1670,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, u32 portid, u32 seq, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; @@ -1724,8 +1726,8 @@ static int nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_rule_info(skb, ctx->portid, ctx->seq, event, 0, - ctx->afi->family, ctx->table, + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, + event, 0, ctx->afi->family, ctx->table, ctx->chain, rule); if (err < 0) { kfree_skb(skb); @@ -1771,7 +1773,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (idx > s_idx) memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); - if (nf_tables_fill_rule_info(skb, NETLINK_CB(cb->skb).portid, + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, @@ -1837,7 +1839,7 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb, if (!skb2) return -ENOMEM; - err = nf_tables_fill_rule_info(skb2, NETLINK_CB(skb).portid, + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule); if (err < 0) @@ -2321,7 +2323,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -2925,7 +2927,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx.afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx.net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx.table->name)) goto nla_put_failure; @@ -3006,7 +3008,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = ctx->afi->family; nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; + nfmsg->res_id = htons(ctx->net->nft.base_seq & 0xffff); if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; @@ -3293,6 +3295,87 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, return err; } +static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, + u32 portid, u32 seq) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); + if (nlh == NULL) + goto nla_put_failure; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = AF_UNSPEC; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = htons(net->nft.base_seq & 0xffff); + + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + goto nla_put_failure; + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_trim(skb, nlh); + return -EMSGSIZE; +} + +static int nf_tables_gen_notify(struct net *net, struct sk_buff *skb, int event) +{ + struct nlmsghdr *nlh = nlmsg_hdr(skb); + struct sk_buff *skb2; + int err; + + if (nlmsg_report(nlh) && + !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) + return 0; + + err = -ENOBUFS; + skb2 = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + goto err; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) { + kfree_skb(skb2); + goto err; + } + + err = nfnetlink_send(skb2, net, NETLINK_CB(skb).portid, + NFNLGRP_NFTABLES, nlmsg_report(nlh), GFP_KERNEL); +err: + if (err < 0) { + nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES, + err); + } + return err; +} + +static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + const struct nlattr * const nla[]) +{ + struct net *net = sock_net(skb->sk); + struct sk_buff *skb2; + int err; + + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb2 == NULL) + return -ENOMEM; + + err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, + nlh->nlmsg_seq); + if (err < 0) + goto err; + + return nlmsg_unicast(nlsk, skb2, NETLINK_CB(skb).portid); +err: + kfree_skb(skb2); + return err; +} + static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { .call_batch = nf_tables_newtable, @@ -3369,6 +3452,9 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, + [NFT_MSG_GETGEN] = { + .call = nf_tables_getgen, + }, }; static void nft_chain_commit_update(struct nft_trans *trans) @@ -3526,6 +3612,8 @@ static int nf_tables_commit(struct sk_buff *skb) call_rcu(&trans->rcu_head, nf_tables_commit_release_rcu); } + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + return 0; } -- cgit v1.2.3 From d931589c01a20595d67192f075f9c84093c43c45 Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Wed, 17 Sep 2014 22:48:45 +0900 Subject: drm/exynos: remove DRM_EXYNOS_GEM_MAP_OFFSET ioctl This interface and relevant codes aren't used anymore. Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 3 --- drivers/gpu/drm/exynos/exynos_drm_gem.c | 17 ----------------- drivers/gpu/drm/exynos/exynos_drm_gem.h | 4 ---- include/uapi/drm/exynos_drm.h | 18 ------------------ 4 files changed, 42 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index 2103d970d6ea..d69bd9723805 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -299,9 +299,6 @@ static const struct vm_operations_struct exynos_drm_gem_vm_ops = { static const struct drm_ioctl_desc exynos_ioctls[] = { DRM_IOCTL_DEF_DRV(EXYNOS_GEM_CREATE, exynos_drm_gem_create_ioctl, DRM_UNLOCKED | DRM_AUTH), - DRM_IOCTL_DEF_DRV(EXYNOS_GEM_MAP_OFFSET, - exynos_drm_gem_map_offset_ioctl, DRM_UNLOCKED | - DRM_AUTH), DRM_IOCTL_DEF_DRV(EXYNOS_GEM_MMAP, exynos_drm_gem_mmap_ioctl, DRM_UNLOCKED | DRM_AUTH), DRM_IOCTL_DEF_DRV(EXYNOS_GEM_GET, diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 15db80138382..2f3665de2d60 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -318,23 +318,6 @@ void exynos_drm_gem_put_dma_addr(struct drm_device *dev, drm_gem_object_unreference_unlocked(obj); } -int exynos_drm_gem_map_offset_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) -{ - struct drm_exynos_gem_map_off *args = data; - - DRM_DEBUG_KMS("handle = 0x%x, offset = 0x%lx\n", - args->handle, (unsigned long)args->offset); - - if (!(dev->driver->driver_features & DRIVER_GEM)) { - DRM_ERROR("does not support GEM.\n"); - return -ENODEV; - } - - return exynos_drm_gem_dumb_map_offset(file_priv, dev, args->handle, - &args->offset); -} - int exynos_drm_gem_mmap_buffer(struct file *filp, struct vm_area_struct *vma) { diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h index 1592c0ba7de8..8e460940d118 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.h +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h @@ -111,10 +111,6 @@ void exynos_drm_gem_put_dma_addr(struct drm_device *dev, unsigned int gem_handle, struct drm_file *filp); -/* get buffer offset to map to user space. */ -int exynos_drm_gem_map_offset_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv); - /* * mmap the physically continuous memory that a gem object contains * to user space. diff --git a/include/uapi/drm/exynos_drm.h b/include/uapi/drm/exynos_drm.h index d5844122ff32..67a751c74a77 100644 --- a/include/uapi/drm/exynos_drm.h +++ b/include/uapi/drm/exynos_drm.h @@ -32,20 +32,6 @@ struct drm_exynos_gem_create { unsigned int handle; }; -/** - * A structure for getting buffer offset. - * - * @handle: a pointer to gem object created. - * @pad: just padding to be 64-bit aligned. - * @offset: relatived offset value of the memory region allocated. - * - this value should be set by user. - */ -struct drm_exynos_gem_map_off { - unsigned int handle; - unsigned int pad; - uint64_t offset; -}; - /** * A structure for mapping buffer. * @@ -316,7 +302,6 @@ struct drm_exynos_ipp_cmd_ctrl { }; #define DRM_EXYNOS_GEM_CREATE 0x00 -#define DRM_EXYNOS_GEM_MAP_OFFSET 0x01 #define DRM_EXYNOS_GEM_MMAP 0x02 /* Reserved 0x03 ~ 0x05 for exynos specific gem ioctl */ #define DRM_EXYNOS_GEM_GET 0x04 @@ -336,9 +321,6 @@ struct drm_exynos_ipp_cmd_ctrl { #define DRM_IOCTL_EXYNOS_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + \ DRM_EXYNOS_GEM_CREATE, struct drm_exynos_gem_create) -#define DRM_IOCTL_EXYNOS_GEM_MAP_OFFSET DRM_IOWR(DRM_COMMAND_BASE + \ - DRM_EXYNOS_GEM_MAP_OFFSET, struct drm_exynos_gem_map_off) - #define DRM_IOCTL_EXYNOS_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + \ DRM_EXYNOS_GEM_MMAP, struct drm_exynos_gem_mmap) -- cgit v1.2.3 From 832316c704fe3d15ae6ca9a552ae80411d1bbbcd Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Thu, 18 Sep 2014 14:19:01 +0900 Subject: drm/exynos: use drm generic mmap interface This patch removes DRM_EXYNOS_GEM_MMAP ictrl feature specific to Exynos drm and instead uses drm generic mmap. We had used the interface specific to Exynos drm to do mmap directly, not to use demand paging which maps each page with physical memory at page fault handler. We don't need the specific mmap interface because the drm generic mmap which uses vm offset manager stuff can also do mmap directly. This patch makes a userspace region to be mapped with whole physical memory region allocated by userspace request when mmap system call is requested. Changelog v2: - do not set VM_IO, VM_DONTEXPEND and VM_DONTDUMP. These flags were already set by drm_gem_mmap - do not include , which isn't needed anymore. Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 26 ---------- drivers/gpu/drm/exynos/exynos_drm_drv.h | 1 - drivers/gpu/drm/exynos/exynos_drm_gem.c | 89 ++++++--------------------------- drivers/gpu/drm/exynos/exynos_drm_gem.h | 10 ---- include/uapi/drm/exynos_drm.h | 22 -------- 5 files changed, 16 insertions(+), 132 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index d69bd9723805..513ba940bae0 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -166,10 +165,6 @@ static int exynos_drm_unload(struct drm_device *dev) return 0; } -static const struct file_operations exynos_drm_gem_fops = { - .mmap = exynos_drm_gem_mmap_buffer, -}; - static int exynos_drm_suspend(struct drm_device *dev, pm_message_t state) { struct drm_connector *connector; @@ -208,7 +203,6 @@ static int exynos_drm_resume(struct drm_device *dev) static int exynos_drm_open(struct drm_device *dev, struct drm_file *file) { struct drm_exynos_file_private *file_priv; - struct file *anon_filp; int ret; file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL); @@ -221,21 +215,8 @@ static int exynos_drm_open(struct drm_device *dev, struct drm_file *file) if (ret) goto err_file_priv_free; - anon_filp = anon_inode_getfile("exynos_gem", &exynos_drm_gem_fops, - NULL, 0); - if (IS_ERR(anon_filp)) { - ret = PTR_ERR(anon_filp); - goto err_subdrv_close; - } - - anon_filp->f_mode = FMODE_READ | FMODE_WRITE; - file_priv->anon_filp = anon_filp; - return ret; -err_subdrv_close: - exynos_drm_subdrv_close(dev, file); - err_file_priv_free: kfree(file_priv); file->driver_priv = NULL; @@ -251,7 +232,6 @@ static void exynos_drm_preclose(struct drm_device *dev, static void exynos_drm_postclose(struct drm_device *dev, struct drm_file *file) { struct exynos_drm_private *private = dev->dev_private; - struct drm_exynos_file_private *file_priv; struct drm_pending_vblank_event *v, *vt; struct drm_pending_event *e, *et; unsigned long flags; @@ -277,10 +257,6 @@ static void exynos_drm_postclose(struct drm_device *dev, struct drm_file *file) } spin_unlock_irqrestore(&dev->event_lock, flags); - file_priv = file->driver_priv; - if (file_priv->anon_filp) - fput(file_priv->anon_filp); - kfree(file->driver_priv); file->driver_priv = NULL; } @@ -299,8 +275,6 @@ static const struct vm_operations_struct exynos_drm_gem_vm_ops = { static const struct drm_ioctl_desc exynos_ioctls[] = { DRM_IOCTL_DEF_DRV(EXYNOS_GEM_CREATE, exynos_drm_gem_create_ioctl, DRM_UNLOCKED | DRM_AUTH), - DRM_IOCTL_DEF_DRV(EXYNOS_GEM_MMAP, - exynos_drm_gem_mmap_ioctl, DRM_UNLOCKED | DRM_AUTH), DRM_IOCTL_DEF_DRV(EXYNOS_GEM_GET, exynos_drm_gem_get_ioctl, DRM_UNLOCKED), DRM_IOCTL_DEF_DRV(EXYNOS_VIDI_CONNECTION, diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.h b/drivers/gpu/drm/exynos/exynos_drm_drv.h index 69a6fa397d75..d22e640f59a0 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.h +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.h @@ -240,7 +240,6 @@ struct exynos_drm_g2d_private { struct drm_exynos_file_private { struct exynos_drm_g2d_private *g2d_priv; struct device *ipp_dev; - struct file *anon_filp; }; /* diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 2f3665de2d60..0d5b9698d384 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -318,23 +318,16 @@ void exynos_drm_gem_put_dma_addr(struct drm_device *dev, drm_gem_object_unreference_unlocked(obj); } -int exynos_drm_gem_mmap_buffer(struct file *filp, +int exynos_drm_gem_mmap_buffer(struct exynos_drm_gem_obj *exynos_gem_obj, struct vm_area_struct *vma) { - struct drm_gem_object *obj = filp->private_data; - struct exynos_drm_gem_obj *exynos_gem_obj = to_exynos_gem_obj(obj); - struct drm_device *drm_dev = obj->dev; + struct drm_device *drm_dev = exynos_gem_obj->base.dev; struct exynos_drm_gem_buf *buffer; unsigned long vm_size; int ret; - WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex)); - - vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; - vma->vm_private_data = obj; - vma->vm_ops = drm_dev->driver->gem_vm_ops; - - update_vm_cache_attr(exynos_gem_obj, vma); + vma->vm_flags &= ~VM_PFNMAP; + vma->vm_pgoff = 0; vm_size = vma->vm_end - vma->vm_start; @@ -356,60 +349,6 @@ int exynos_drm_gem_mmap_buffer(struct file *filp, return ret; } - /* - * take a reference to this mapping of the object. And this reference - * is unreferenced by the corresponding vm_close call. - */ - drm_gem_object_reference(obj); - - drm_vm_open_locked(drm_dev, vma); - - return 0; -} - -int exynos_drm_gem_mmap_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv) -{ - struct drm_exynos_file_private *exynos_file_priv; - struct drm_exynos_gem_mmap *args = data; - struct drm_gem_object *obj; - struct file *anon_filp; - unsigned long addr; - - if (!(dev->driver->driver_features & DRIVER_GEM)) { - DRM_ERROR("does not support GEM.\n"); - return -ENODEV; - } - - mutex_lock(&dev->struct_mutex); - - obj = drm_gem_object_lookup(dev, file_priv, args->handle); - if (!obj) { - DRM_ERROR("failed to lookup gem object.\n"); - mutex_unlock(&dev->struct_mutex); - return -EINVAL; - } - - exynos_file_priv = file_priv->driver_priv; - anon_filp = exynos_file_priv->anon_filp; - anon_filp->private_data = obj; - - addr = vm_mmap(anon_filp, 0, args->size, PROT_READ | PROT_WRITE, - MAP_SHARED, 0); - - drm_gem_object_unreference(obj); - - if (IS_ERR_VALUE(addr)) { - mutex_unlock(&dev->struct_mutex); - return (int)addr; - } - - mutex_unlock(&dev->struct_mutex); - - args->mapped = addr; - - DRM_DEBUG_KMS("mapped = 0x%lx\n", (unsigned long)args->mapped); - return 0; } @@ -693,16 +632,20 @@ int exynos_drm_gem_mmap(struct file *filp, struct vm_area_struct *vma) exynos_gem_obj = to_exynos_gem_obj(obj); ret = check_gem_flags(exynos_gem_obj->flags); - if (ret) { - drm_gem_vm_close(vma); - drm_gem_free_mmap_offset(obj); - return ret; - } - - vma->vm_flags &= ~VM_PFNMAP; - vma->vm_flags |= VM_MIXEDMAP; + if (ret) + goto err_close_vm; update_vm_cache_attr(exynos_gem_obj, vma); + ret = exynos_drm_gem_mmap_buffer(exynos_gem_obj, vma); + if (ret) + goto err_close_vm; + + return ret; + +err_close_vm: + drm_gem_vm_close(vma); + drm_gem_free_mmap_offset(obj); + return ret; } diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.h b/drivers/gpu/drm/exynos/exynos_drm_gem.h index 8e460940d118..09d021bbccf5 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.h +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.h @@ -111,16 +111,6 @@ void exynos_drm_gem_put_dma_addr(struct drm_device *dev, unsigned int gem_handle, struct drm_file *filp); -/* - * mmap the physically continuous memory that a gem object contains - * to user space. - */ -int exynos_drm_gem_mmap_ioctl(struct drm_device *dev, void *data, - struct drm_file *file_priv); - -int exynos_drm_gem_mmap_buffer(struct file *filp, - struct vm_area_struct *vma); - /* map user space allocated by malloc to pages. */ int exynos_drm_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); diff --git a/include/uapi/drm/exynos_drm.h b/include/uapi/drm/exynos_drm.h index 67a751c74a77..5575ed1598bd 100644 --- a/include/uapi/drm/exynos_drm.h +++ b/include/uapi/drm/exynos_drm.h @@ -32,24 +32,6 @@ struct drm_exynos_gem_create { unsigned int handle; }; -/** - * A structure for mapping buffer. - * - * @handle: a handle to gem object created. - * @pad: just padding to be 64-bit aligned. - * @size: memory size to be mapped. - * @mapped: having user virtual address mmaped. - * - this variable would be filled by exynos gem module - * of kernel side with user virtual address which is allocated - * by do_mmap(). - */ -struct drm_exynos_gem_mmap { - unsigned int handle; - unsigned int pad; - uint64_t size; - uint64_t mapped; -}; - /** * A structure to gem information. * @@ -302,7 +284,6 @@ struct drm_exynos_ipp_cmd_ctrl { }; #define DRM_EXYNOS_GEM_CREATE 0x00 -#define DRM_EXYNOS_GEM_MMAP 0x02 /* Reserved 0x03 ~ 0x05 for exynos specific gem ioctl */ #define DRM_EXYNOS_GEM_GET 0x04 #define DRM_EXYNOS_VIDI_CONNECTION 0x07 @@ -321,9 +302,6 @@ struct drm_exynos_ipp_cmd_ctrl { #define DRM_IOCTL_EXYNOS_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + \ DRM_EXYNOS_GEM_CREATE, struct drm_exynos_gem_create) -#define DRM_IOCTL_EXYNOS_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + \ - DRM_EXYNOS_GEM_MMAP, struct drm_exynos_gem_mmap) - #define DRM_IOCTL_EXYNOS_GEM_GET DRM_IOWR(DRM_COMMAND_BASE + \ DRM_EXYNOS_GEM_GET, struct drm_exynos_gem_info) -- cgit v1.2.3 From 23461551c00628c3f3fe9cf837bf53cf8f212b63 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:56 -0700 Subject: fou: Support for foo-over-udp RX path This patch provides a receive path for foo-over-udp. This allows direct encapsulation of IP protocols over UDP. The bound destination port is used to map to an IP protocol, and the XFRM framework (udp_encap_rcv) is used to receive encapsulated packets. Upon reception, the encapsulation header is logically removed (pointer to transport header is advanced) and the packet is reinjected into the receive path with the IP protocol indicated by the mapping. Netlink is used to configure FOU ports. The configuration information includes the port number to bind to and the IP protocol corresponding to that port. This should support GRE/UDP (http://tools.ietf.org/html/draft-yong-tsvwg-gre-in-udp-encap-02), as will as the other IP tunneling protocols (IPIP, SIT). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 32 ++++++ net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/fou.c | 279 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 322 insertions(+) create mode 100644 include/uapi/linux/fou.h create mode 100644 net/ipv4/fou.c (limited to 'include/uapi') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h new file mode 100644 index 000000000000..e03376de453d --- /dev/null +++ b/include/uapi/linux/fou.h @@ -0,0 +1,32 @@ +/* fou.h - FOU Interface */ + +#ifndef _UAPI_LINUX_FOU_H +#define _UAPI_LINUX_FOU_H + +/* NETLINK_GENERIC related info + */ +#define FOU_GENL_NAME "fou" +#define FOU_GENL_VERSION 0x1 + +enum { + FOU_ATTR_UNSPEC, + FOU_ATTR_PORT, /* u16 */ + FOU_ATTR_AF, /* u8 */ + FOU_ATTR_IPPROTO, /* u8 */ + + __FOU_ATTR_MAX, +}; + +#define FOU_ATTR_MAX (__FOU_ATTR_MAX - 1) + +enum { + FOU_CMD_UNSPEC, + FOU_CMD_ADD, + FOU_CMD_DEL, + + __FOU_CMD_MAX, +}; + +#define FOU_CMD_MAX (__FOU_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_FOU_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index dbc10d84161f..84f710b7472a 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -311,6 +311,16 @@ config NET_UDP_TUNNEL tristate default n +config NET_FOU + tristate "IP: Foo (IP protocols) over UDP" + select XFRM + select NET_UDP_TUNNEL + ---help--- + Foo over UDP allows any IP protocol to be directly encapsulated + over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP + network mechanisms and optimizations for UDP (such as ECMP + and RSS) can be leveraged to provide better service. + config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 8ee1cd4053ee..d78d404c596f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o gre-y := gre_demux.o +obj-$(CONFIG_NET_FOU) += fou.o obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c new file mode 100644 index 000000000000..d44f97b79418 --- /dev/null +++ b/net/ipv4/fou.c @@ -0,0 +1,279 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(fou_lock); +static LIST_HEAD(fou_list); + +struct fou { + struct socket *sock; + u8 protocol; + u16 port; + struct list_head list; +}; + +struct fou_cfg { + u8 protocol; + struct udp_port_cfg udp_config; +}; + +static inline struct fou *fou_from_sock(struct sock *sk) +{ + return sk->sk_user_data; +} + +static int fou_udp_encap_recv_deliver(struct sk_buff *skb, + u8 protocol, size_t len) +{ + struct iphdr *iph = ip_hdr(skb); + + /* Remove 'len' bytes from the packet (UDP header and + * FOU header if present), modify the protocol to the one + * we found, and then call rcv_encap. + */ + iph->tot_len = htons(ntohs(iph->tot_len) - len); + __skb_pull(skb, len); + skb_postpull_rcsum(skb, udp_hdr(skb), len); + skb_reset_transport_header(skb); + + return -protocol; +} + +static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + + if (!fou) + return 1; + + return fou_udp_encap_recv_deliver(skb, fou->protocol, + sizeof(struct udphdr)); +} + +static int fou_add_to_port_list(struct fou *fou) +{ + struct fou *fout; + + spin_lock(&fou_lock); + list_for_each_entry(fout, &fou_list, list) { + if (fou->port == fout->port) { + spin_unlock(&fou_lock); + return -EALREADY; + } + } + + list_add(&fou->list, &fou_list); + spin_unlock(&fou_lock); + + return 0; +} + +static void fou_release(struct fou *fou) +{ + struct socket *sock = fou->sock; + struct sock *sk = sock->sk; + + udp_del_offload(&fou->udp_offloads); + + list_del(&fou->list); + + /* Remove hooks into tunnel socket */ + sk->sk_user_data = NULL; + + sock_release(sock); + + kfree(fou); +} + +static int fou_create(struct net *net, struct fou_cfg *cfg, + struct socket **sockp) +{ + struct fou *fou = NULL; + int err; + struct socket *sock = NULL; + struct sock *sk; + + /* Open UDP socket */ + err = udp_sock_create(net, &cfg->udp_config, &sock); + if (err < 0) + goto error; + + /* Allocate FOU port structure */ + fou = kzalloc(sizeof(*fou), GFP_KERNEL); + if (!fou) { + err = -ENOMEM; + goto error; + } + + sk = sock->sk; + + /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ + fou->protocol = cfg->protocol; + fou->port = cfg->udp_config.local_udp_port; + udp_sk(sk)->encap_rcv = fou_udp_recv; + + udp_sk(sk)->encap_type = 1; + udp_encap_enable(); + + sk->sk_user_data = fou; + fou->sock = sock; + + udp_set_convert_csum(sk, true); + + sk->sk_allocation = GFP_ATOMIC; + + err = fou_add_to_port_list(fou); + if (err) + goto error; + + if (sockp) + *sockp = sock; + + return 0; + +error: + kfree(fou); + if (sock) + sock_release(sock); + + return err; +} + +static int fou_destroy(struct net *net, struct fou_cfg *cfg) +{ + struct fou *fou; + u16 port = cfg->udp_config.local_udp_port; + int err = -EINVAL; + + spin_lock(&fou_lock); + list_for_each_entry(fou, &fou_list, list) { + if (fou->port == port) { + fou_release(fou); + err = 0; + break; + } + } + spin_unlock(&fou_lock); + + return err; +} + +static struct genl_family fou_nl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = FOU_GENL_NAME, + .version = FOU_GENL_VERSION, + .maxattr = FOU_ATTR_MAX, + .netnsok = true, +}; + +static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { + [FOU_ATTR_PORT] = { .type = NLA_U16, }, + [FOU_ATTR_AF] = { .type = NLA_U8, }, + [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, +}; + +static int parse_nl_config(struct genl_info *info, + struct fou_cfg *cfg) +{ + memset(cfg, 0, sizeof(*cfg)); + + cfg->udp_config.family = AF_INET; + + if (info->attrs[FOU_ATTR_AF]) { + u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); + + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + + cfg->udp_config.family = family; + } + + if (info->attrs[FOU_ATTR_PORT]) { + u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]); + + cfg->udp_config.local_udp_port = port; + } + + if (info->attrs[FOU_ATTR_IPPROTO]) + cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + + return 0; +} + +static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + int err; + + err = parse_nl_config(info, &cfg); + if (err) + return err; + + return fou_create(&init_net, &cfg, NULL); +} + +static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info) +{ + struct fou_cfg cfg; + + parse_nl_config(info, &cfg); + + return fou_destroy(&init_net, &cfg); +} + +static const struct genl_ops fou_nl_ops[] = { + { + .cmd = FOU_CMD_ADD, + .doit = fou_nl_cmd_add_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = FOU_CMD_DEL, + .doit = fou_nl_cmd_rm_port, + .policy = fou_nl_policy, + .flags = GENL_ADMIN_PERM, + }, +}; + +static int __init fou_init(void) +{ + int ret; + + ret = genl_register_family_with_ops(&fou_nl_family, + fou_nl_ops); + + return ret; +} + +static void __exit fou_fini(void) +{ + struct fou *fou, *next; + + genl_unregister_family(&fou_nl_family); + + /* Close all the FOU sockets */ + + spin_lock(&fou_lock); + list_for_each_entry_safe(fou, next, &fou_list, list) + fou_release(fou); + spin_unlock(&fou_lock); +} + +module_init(fou_init); +module_exit(fou_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 56328486539ddd07cbaafec7a542a2c8a3043623 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:25:58 -0700 Subject: net: Changes to ip_tunnel to support foo-over-udp encapsulation This patch changes IP tunnel to support (secondary) encapsulation, Foo-over-UDP. Changes include: 1) Adding tun_hlen as the tunnel header length, encap_hlen as the encapsulation header length, and hlen becomes the grand total of these. 2) Added common netlink define to support FOU encapsulation. 3) Routines to perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 19 ++++++++- include/uapi/linux/if_tunnel.h | 12 ++++++ net/ipv4/ip_tunnel.c | 91 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 8dd8cab88b87..7f538ba6e267 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_IPV6) @@ -31,6 +32,13 @@ struct ip_tunnel_6rd_parm { }; #endif +struct ip_tunnel_encap { + __u16 type; + __u16 flags; + __be16 sport; + __be16 dport; +}; + struct ip_tunnel_prl_entry { struct ip_tunnel_prl_entry __rcu *next; __be32 addr; @@ -56,13 +64,18 @@ struct ip_tunnel { /* These four fields used only by GRE */ __u32 i_seqno; /* The last seen seqno */ __u32 o_seqno; /* The last output seqno */ - int hlen; /* Precalculated header length */ + int tun_hlen; /* Precalculated header length */ int mlink; struct ip_tunnel_dst __percpu *dst_cache; struct ip_tunnel_parm parms; + int encap_hlen; /* Encap header length (FOU,GUE) */ + struct ip_tunnel_encap encap; + + int hlen; /* tun_hlen + encap_hlen */ + /* for SIT */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm ip6rd; @@ -114,6 +127,8 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, @@ -131,6 +146,8 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); void ip_tunnel_setup(struct net_device *dev, int net_id); void ip_tunnel_dst_reset_all(struct ip_tunnel *t); +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap); /* Extract dsfield from inner protocol */ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 3bce9e9d9f7c..9fedca7d8dae 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -53,10 +53,22 @@ enum { IFLA_IPTUN_6RD_RELAY_PREFIX, IFLA_IPTUN_6RD_PREFIXLEN, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, + IFLA_IPTUN_ENCAP_TYPE, + IFLA_IPTUN_ENCAP_FLAGS, + IFLA_IPTUN_ENCAP_SPORT, + IFLA_IPTUN_ENCAP_DPORT, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) +enum tunnel_encap_types { + TUNNEL_ENCAP_NONE, + TUNNEL_ENCAP_FOU, +}; + +#define TUNNEL_ENCAP_FLAG_CSUM (1<<0) +#define TUNNEL_ENCAP_FLAG_CSUM6 (1<<1) + /* SIT-mode i_flags */ #define SIT_ISATAP 0x0001 diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index afed1aac2638..e3a3dc91e49c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -55,6 +55,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -487,6 +488,91 @@ drop: } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); +static int ip_encap_hlen(struct ip_tunnel_encap *e) +{ + switch (e->type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return sizeof(struct udphdr); + default: + return -EINVAL; + } +} + +int ip_tunnel_encap_setup(struct ip_tunnel *t, + struct ip_tunnel_encap *ipencap) +{ + int hlen; + + memset(&t->encap, 0, sizeof(t->encap)); + + hlen = ip_encap_hlen(ipencap); + if (hlen < 0) + return hlen; + + t->encap.type = ipencap->type; + t->encap.sport = ipencap->sport; + t->encap.dport = ipencap->dport; + t->encap.flags = ipencap->flags; + + t->encap_hlen = hlen; + t->hlen = t->encap_hlen + t->tun_hlen; + + return 0; +} +EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); + +static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + size_t hdr_len, u8 *protocol, struct flowi4 *fl4) +{ + struct udphdr *uh; + __be16 sport; + bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); + int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + skb = iptunnel_handle_offloads(skb, csum, type); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + /* Get length and hash before making space in skb */ + + sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + uh->check = 0; + udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, + fl4->saddr, fl4->daddr, skb->len); + + *protocol = IPPROTO_UDP; + + return 0; +} + +int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, + u8 *protocol, struct flowi4 *fl4) +{ + switch (t->encap.type) { + case TUNNEL_ENCAP_NONE: + return 0; + case TUNNEL_ENCAP_FOU: + return fou_build_header(skb, &t->encap, t->encap_hlen, + protocol, fl4); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(ip_tunnel_encap); + static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df) { @@ -536,7 +622,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, } void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, const u8 protocol) + const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *inner_iph; @@ -617,6 +703,9 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) + goto tx_error; + rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; if (!rt) { -- cgit v1.2.3 From 4565e9919cda747815547e2e5d7b78f15efbffdf Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 17 Sep 2014 12:26:01 -0700 Subject: gre: Setup and TX path for gre/UDP foo-over-udp encapsulation Added netlink attrs to configure FOU encapsulation for GRE, netlink handling of these flags, and properly adjust MTU for encapsulation. ip_tunnel_encap is called from ip_tunnel_xmit to actually perform FOU encapsulation. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 4 ++ net/ipv4/ip_gre.c | 90 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 9fedca7d8dae..7c832afdfa94 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -106,6 +106,10 @@ enum { IFLA_GRE_ENCAP_LIMIT, IFLA_GRE_FLOWINFO, IFLA_GRE_FLAGS, + IFLA_GRE_ENCAP_TYPE, + IFLA_GRE_ENCAP_FLAGS, + IFLA_GRE_ENCAP_SPORT, + IFLA_GRE_ENCAP_DPORT, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9b842544aea3..829aff8bf723 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -239,7 +239,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->hlen); + gre_build_header(skb, &tpi, tunnel->tun_hlen); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } @@ -310,7 +310,7 @@ out: static int ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - int err = 0; + int err; struct ip_tunnel_parm p; if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) @@ -470,13 +470,18 @@ static void ipgre_tunnel_setup(struct net_device *dev) static void __gre_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel; + int t_hlen; tunnel = netdev_priv(dev); - tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; - dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4; + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + + t_hlen = tunnel->hlen + sizeof(struct iphdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -628,6 +633,40 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->iph.frag_off = htons(IP_DF); } +/* This function returns true when ENCAP attributes are present in the nl msg */ +static bool ipgre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; +} + static int gre_tap_init(struct net_device *dev) { __gre_tunnel_init(dev); @@ -657,6 +696,15 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); @@ -666,6 +714,15 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm p; + struct ip_tunnel_encap ipencap; + + if (ipgre_netlink_encap_parms(data, &ipencap)) { + struct ip_tunnel *t = netdev_priv(dev); + int err = ip_tunnel_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ipgre_netlink_parms(data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); @@ -694,6 +751,14 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_PMTUDISC */ nla_total_size(1) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -714,6 +779,17 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)))) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT, + t->encap.sport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT, + t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.dport)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -731,6 +807,10 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_TTL] = { .type = NLA_U8 }, [IFLA_GRE_TOS] = { .type = NLA_U8 }, [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { -- cgit v1.2.3 From e3d6eb1c16ef174a8fbbdd40770f5cbace0710e4 Mon Sep 17 00:00:00 2001 From: Vincent Palatin Date: Wed, 3 Sep 2014 16:38:39 -0300 Subject: [media] v4l: Add camera pan/tilt speed controls The V4L2_CID_PAN_SPEED and V4L2_CID_TILT_SPEED controls allow to move the camera by setting its rotation speed around its axis. Signed-off-by: Vincent Palatin Reviewed-by: Pawel Osciak Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/compat.xml | 10 ++++++++++ Documentation/DocBook/media/v4l/controls.xml | 21 +++++++++++++++++++++ drivers/media/v4l2-core/v4l2-ctrls.c | 2 ++ include/uapi/linux/v4l2-controls.h | 2 ++ 4 files changed, 35 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/compat.xml b/Documentation/DocBook/media/v4l/compat.xml index eee6f0f4aa43..7aa7c5d41e3b 100644 --- a/Documentation/DocBook/media/v4l/compat.xml +++ b/Documentation/DocBook/media/v4l/compat.xml @@ -2545,6 +2545,16 @@ fields changed from _s32 to _u32. +
+ V4L2 in Linux 3.18 + + + Added V4L2_CID_PAN_SPEED and + V4L2_CID_TILT_SPEED camera controls. + + +
+
Relation of V4L2 to other Linux multimedia APIs diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml index a7eb1bde8b92..e013e4bf244c 100644 --- a/Documentation/DocBook/media/v4l/controls.xml +++ b/Documentation/DocBook/media/v4l/controls.xml @@ -3965,6 +3965,27 @@ by exposure, white balance or focus controls. + + V4L2_CID_PAN_SPEED  + integer + This control turns the +camera horizontally at the specific speed. The unit is undefined. A +positive value moves the camera to the right (clockwise when viewed +from above), a negative value to the left. A value of zero stops the motion +if one is in progress and has no effect otherwise. + + + + + V4L2_CID_TILT_SPEED  + integer + This control turns the +camera vertically at the specified speed. The unit is undefined. A +positive value moves the camera up, a negative value down. A value of zero +stops the motion if one is in progress and has no effect otherwise. + + + diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 35d1f3d5045b..86012140923f 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -796,6 +796,8 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_AUTO_FOCUS_STOP: return "Auto Focus, Stop"; case V4L2_CID_AUTO_FOCUS_STATUS: return "Auto Focus, Status"; case V4L2_CID_AUTO_FOCUS_RANGE: return "Auto Focus, Range"; + case V4L2_CID_PAN_SPEED: return "Pan, Speed"; + case V4L2_CID_TILT_SPEED: return "Tilt, Speed"; /* FM Radio Modulator controls */ /* Keep the order of the 'case's the same as in v4l2-controls.h! */ diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 8b930210a4b9..661f119a51b8 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -746,6 +746,8 @@ enum v4l2_auto_focus_range { V4L2_AUTO_FOCUS_RANGE_INFINITY = 3, }; +#define V4L2_CID_PAN_SPEED (V4L2_CID_CAMERA_CLASS_BASE+32) +#define V4L2_CID_TILT_SPEED (V4L2_CID_CAMERA_CLASS_BASE+33) /* FM Modulator class control IDs */ -- cgit v1.2.3 From fcc0d3db28922f9ba21ea6c7b23ea10ffb5d3521 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 21 Jul 2014 17:06:33 -0300 Subject: [media] v4l: Add ARGB555X and XRGB555X pixel formats The existing RGB555X pixel format is ill-defined in respect to its alpha bit and its meaning is driver dependent. Create new standard ARGB555X and XRGB555X variants with clearly defined meanings and make the existing variant deprecated. The new pixel formats 4CC values have been selected to match the DRM 4CCs for the same in-memory formats. Signed-off-by: Laurent Pinchart Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../DocBook/media/v4l/pixfmt-packed-rgb.xml | 50 ++++++++++++++++++++-- include/uapi/linux/videodev2.h | 3 ++ 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/pixfmt-packed-rgb.xml b/Documentation/DocBook/media/v4l/pixfmt-packed-rgb.xml index 2aae8e9452a4..65a11867e0ae 100644 --- a/Documentation/DocBook/media/v4l/pixfmt-packed-rgb.xml +++ b/Documentation/DocBook/media/v4l/pixfmt-packed-rgb.xml @@ -237,9 +237,9 @@ for a pixel lie next to each other in memory. g4 g3 - - V4L2_PIX_FMT_RGB555X - 'RGBQ' + + V4L2_PIX_FMT_ARGB555X + 'AR15' | (1 << 31) a r4 @@ -259,6 +259,28 @@ for a pixel lie next to each other in memory. b1 b0 + + V4L2_PIX_FMT_XRGB555X + 'XR15' | (1 << 31) + + - + r4 + r3 + r2 + r1 + r0 + g4 + g3 + + g2 + g1 + g0 + b4 + b3 + b2 + b1 + b0 + V4L2_PIX_FMT_RGB565X 'RGBR' @@ -800,6 +822,28 @@ image g4 g3 + + V4L2_PIX_FMT_RGB555X + 'RGBQ' + + a + r4 + r3 + r2 + r1 + r0 + g4 + g3 + + g2 + g1 + g0 + b4 + b3 + b2 + b1 + b0 + V4L2_PIX_FMT_BGR32 'BGR4' diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 0b1ba5c6a8d2..1c2f84fd4d99 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -79,6 +79,7 @@ /* Four-character-code (FOURCC) */ #define v4l2_fourcc(a, b, c, d)\ ((__u32)(a) | ((__u32)(b) << 8) | ((__u32)(c) << 16) | ((__u32)(d) << 24)) +#define v4l2_fourcc_be(a, b, c, d) (v4l2_fourcc(a, b, c, d) | (1 << 31)) /* * E N U M S @@ -307,6 +308,8 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_XRGB555 v4l2_fourcc('X', 'R', '1', '5') /* 16 XRGB-1-5-5-5 */ #define V4L2_PIX_FMT_RGB565 v4l2_fourcc('R', 'G', 'B', 'P') /* 16 RGB-5-6-5 */ #define V4L2_PIX_FMT_RGB555X v4l2_fourcc('R', 'G', 'B', 'Q') /* 16 RGB-5-5-5 BE */ +#define V4L2_PIX_FMT_ARGB555X v4l2_fourcc_be('A', 'R', '1', '5') /* 16 ARGB-5-5-5 BE */ +#define V4L2_PIX_FMT_XRGB555X v4l2_fourcc_be('X', 'R', '1', '5') /* 16 XRGB-5-5-5 BE */ #define V4L2_PIX_FMT_RGB565X v4l2_fourcc('R', 'G', 'B', 'R') /* 16 RGB-5-6-5 BE */ #define V4L2_PIX_FMT_BGR666 v4l2_fourcc('B', 'G', 'R', 'H') /* 18 BGR-6-6-6 */ #define V4L2_PIX_FMT_BGR24 v4l2_fourcc('B', 'G', 'R', '3') /* 24 BGR-8-8-8 */ -- cgit v1.2.3 From 7e51aa4486bcf72daeb5d30227c4c01563f37044 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 2 Apr 2014 16:00:58 -0400 Subject: audit: drop unused struct audit_rule definition The kernel only uses struct audit_rule_data. We dropped support for struct audit_rule a long time ago. Drop the definition in the header file. Signed-off-by: Eric Paris --- include/uapi/linux/audit.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index cf6714752b69..df71b1d2cbb8 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -444,17 +444,4 @@ struct audit_rule_data { char buf[0]; /* string fields buffer */ }; -/* audit_rule is supported to maintain backward compatibility with - * userspace. It supports integer fields only and corresponds to - * AUDIT_ADD, AUDIT_DEL and AUDIT_LIST requests. - */ -struct audit_rule { /* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */ - __u32 flags; /* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */ - __u32 action; /* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */ - __u32 field_count; - __u32 mask[AUDIT_BITMASK_SIZE]; - __u32 fields[AUDIT_MAX_FIELDS]; - __u32 values[AUDIT_MAX_FIELDS]; -}; - #endif /* _UAPI_LINUX_AUDIT_H_ */ -- cgit v1.2.3 From ce5d112827e5c2e9864323d0efd7ec2a62c6dce0 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Mar 2014 13:50:46 -0400 Subject: ARCH: AUDIT: implement syscall_get_arch for all arches For all arches which support audit implement syscall_get_arch() They are all pretty easy and straight forward, stolen from how the call to audit_syscall_entry() determines the arch. Based-on-patch-by: Richard Briggs Signed-off-by: Eric Paris Cc: linux-ia64@vger.kernel.org Cc: microblaze-uclinux@itee.uq.edu.au Cc: linux-mips@linux-mips.org Cc: linux@lists.openrisc.net Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org --- arch/ia64/include/asm/syscall.h | 6 ++++++ arch/microblaze/include/asm/syscall.h | 5 +++++ arch/mips/include/asm/syscall.h | 2 +- arch/openrisc/include/asm/syscall.h | 5 +++++ arch/parisc/include/asm/syscall.h | 11 +++++++++++ arch/powerpc/include/asm/syscall.h | 12 ++++++++++++ arch/sparc/include/asm/syscall.h | 8 ++++++++ include/uapi/linux/audit.h | 1 + 8 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/arch/ia64/include/asm/syscall.h b/arch/ia64/include/asm/syscall.h index a7ff1c6ab068..1d0b875fec44 100644 --- a/arch/ia64/include/asm/syscall.h +++ b/arch/ia64/include/asm/syscall.h @@ -13,6 +13,7 @@ #ifndef _ASM_SYSCALL_H #define _ASM_SYSCALL_H 1 +#include #include #include @@ -79,4 +80,9 @@ static inline void syscall_set_arguments(struct task_struct *task, ia64_syscall_get_set_arguments(task, regs, i, n, args, 1); } + +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_IA64; +} #endif /* _ASM_SYSCALL_H */ diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h index 9bc431783105..53cfaf34c343 100644 --- a/arch/microblaze/include/asm/syscall.h +++ b/arch/microblaze/include/asm/syscall.h @@ -1,6 +1,7 @@ #ifndef __ASM_MICROBLAZE_SYSCALL_H #define __ASM_MICROBLAZE_SYSCALL_H +#include #include #include #include @@ -99,4 +100,8 @@ static inline void syscall_set_arguments(struct task_struct *task, asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); asmlinkage void do_syscall_trace_leave(struct pt_regs *regs); +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_MICROBLAZE; +} #endif /* __ASM_MICROBLAZE_SYSCALL_H */ diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 17960fe7a8ce..93b3b86c293c 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -129,7 +129,7 @@ extern const unsigned long sysn32_call_table[]; static inline int syscall_get_arch(void) { - int arch = EM_MIPS; + int arch = AUDIT_ARCH_MIPS; #ifdef CONFIG_64BIT if (!test_thread_flag(TIF_32BIT_REGS)) arch |= __AUDIT_ARCH_64BIT; diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index b752bb67891d..2db9f1cf0694 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -19,6 +19,7 @@ #ifndef __ASM_OPENRISC_SYSCALL_H__ #define __ASM_OPENRISC_SYSCALL_H__ +#include #include #include @@ -71,4 +72,8 @@ syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(®s->gpr[3 + i], args, n * sizeof(args[0])); } +static inline int syscall_get_arch(void) +{ + return AUDIT_ARCH_OPENRISC; +} #endif diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index 8bdfd2c8c39f..a5eba95d87fe 100644 --- a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -3,6 +3,8 @@ #ifndef _ASM_PARISC_SYSCALL_H_ #define _ASM_PARISC_SYSCALL_H_ +#include +#include #include #include @@ -37,4 +39,13 @@ static inline void syscall_get_arguments(struct task_struct *tsk, } } +static inline int syscall_get_arch(void) +{ + int arch = AUDIT_ARCH_PARISC; +#ifdef CONFIG_64BIT + if (!is_compat_task()) + arch = AUDIT_ARCH_PARISC64; +#endif + return arch; +} #endif /*_ASM_PARISC_SYSCALL_H_*/ diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index b54b2add07be..427154444f6d 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -13,6 +13,8 @@ #ifndef _ASM_SYSCALL_H #define _ASM_SYSCALL_H 1 +#include +#include #include /* ftrace syscalls requires exporting the sys_call_table */ @@ -86,4 +88,14 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->gpr[3 + i], args, n * sizeof(args[0])); } +static inline int syscall_get_arch(void) +{ + int arch = AUDIT_ARCH_PPC; + +#ifdef CONFIG_PPC64 + if (!is_32bit_task()) + arch = AUDIT_ARCH_PPC64; +#endif + return arch; +} #endif /* _ASM_SYSCALL_H */ diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 025a02ad2e31..fed3d511b108 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -1,9 +1,11 @@ #ifndef __ASM_SPARC_SYSCALL_H #define __ASM_SPARC_SYSCALL_H +#include #include #include #include +#include /* * The syscall table always contains 32 bit pointers since we know that the @@ -124,4 +126,10 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->u_regs[UREG_I0 + i + j] = args[j]; } +static inline int syscall_get_arch(void) +{ + return test_thread_flag(TIF_32BIT) ? AUDIT_ARCH_SPARC + : AUDIT_ARCH_SPARC64; +} + #endif /* __ASM_SPARC_SYSCALL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index df71b1d2cbb8..4d100c841c80 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -351,6 +351,7 @@ enum { #define AUDIT_ARCH_IA64 (EM_IA_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_M32R (EM_M32R) #define AUDIT_ARCH_M68K (EM_68K) +#define AUDIT_ARCH_MICROBLAZE (EM_MICROBLAZE) #define AUDIT_ARCH_MIPS (EM_MIPS) #define AUDIT_ARCH_MIPSEL (EM_MIPS|__AUDIT_ARCH_LE) #define AUDIT_ARCH_MIPS64 (EM_MIPS|__AUDIT_ARCH_64BIT) -- cgit v1.2.3 From 26d8f6f15112b8b0fbff360c360e8c42bf2bc370 Mon Sep 17 00:00:00 2001 From: Frank Haverkamp Date: Wed, 10 Sep 2014 16:37:48 +0200 Subject: GenWQE: Update author information Updated email address of co-author. Signed-off-by: Frank Haverkamp Signed-off-by: Michael Jung Signed-off-by: Greg Kroah-Hartman --- drivers/misc/genwqe/card_base.c | 4 ++-- drivers/misc/genwqe/card_base.h | 2 +- drivers/misc/genwqe/card_ddcb.c | 2 +- drivers/misc/genwqe/card_ddcb.h | 2 +- drivers/misc/genwqe/card_debugfs.c | 2 +- drivers/misc/genwqe/card_dev.c | 2 +- drivers/misc/genwqe/card_sysfs.c | 2 +- drivers/misc/genwqe/card_utils.c | 2 +- drivers/misc/genwqe/genwqe_driver.h | 2 +- include/uapi/linux/genwqe/genwqe_card.h | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c index c60ad4bd7ed6..12926c8c0609 100644 --- a/drivers/misc/genwqe/card_base.c +++ b/drivers/misc/genwqe/card_base.c @@ -5,7 +5,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify @@ -45,7 +45,7 @@ MODULE_AUTHOR("Frank Haverkamp "); MODULE_AUTHOR("Michael Ruettger "); MODULE_AUTHOR("Joerg-Stephan Vogt "); -MODULE_AUTHOR("Michal Jung "); +MODULE_AUTHOR("Michael Jung "); MODULE_DESCRIPTION("GenWQE Card"); MODULE_VERSION(DRV_VERSION); diff --git a/drivers/misc/genwqe/card_base.h b/drivers/misc/genwqe/card_base.h index 37657d6228de..b622d204b72b 100644 --- a/drivers/misc/genwqe/card_base.h +++ b/drivers/misc/genwqe/card_base.h @@ -8,7 +8,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/card_ddcb.c b/drivers/misc/genwqe/card_ddcb.c index 1102e1ef8104..75ff67485fdf 100644 --- a/drivers/misc/genwqe/card_ddcb.c +++ b/drivers/misc/genwqe/card_ddcb.c @@ -5,7 +5,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/card_ddcb.h b/drivers/misc/genwqe/card_ddcb.h index c4f26720753e..0361a68d79a6 100644 --- a/drivers/misc/genwqe/card_ddcb.h +++ b/drivers/misc/genwqe/card_ddcb.h @@ -8,7 +8,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/card_debugfs.c b/drivers/misc/genwqe/card_debugfs.c index 170a7915497c..40b425d35866 100644 --- a/drivers/misc/genwqe/card_debugfs.c +++ b/drivers/misc/genwqe/card_debugfs.c @@ -5,7 +5,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c index aae42555e2ca..80fdde2873de 100644 --- a/drivers/misc/genwqe/card_dev.c +++ b/drivers/misc/genwqe/card_dev.c @@ -5,7 +5,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/card_sysfs.c b/drivers/misc/genwqe/card_sysfs.c index 1d1c6069ebd7..2c33fbca9225 100644 --- a/drivers/misc/genwqe/card_sysfs.c +++ b/drivers/misc/genwqe/card_sysfs.c @@ -5,7 +5,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index a6400f09229c..349b342c369b 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -5,7 +5,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/misc/genwqe/genwqe_driver.h b/drivers/misc/genwqe/genwqe_driver.h index e39d81fd93c4..15355350e076 100644 --- a/drivers/misc/genwqe/genwqe_driver.h +++ b/drivers/misc/genwqe/genwqe_driver.h @@ -8,7 +8,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify diff --git a/include/uapi/linux/genwqe/genwqe_card.h b/include/uapi/linux/genwqe/genwqe_card.h index 4fc065f29255..baa93fb4cd4f 100644 --- a/include/uapi/linux/genwqe/genwqe_card.h +++ b/include/uapi/linux/genwqe/genwqe_card.h @@ -8,7 +8,7 @@ * * Author: Frank Haverkamp * Author: Joerg-Stephan Vogt - * Author: Michael Jung + * Author: Michael Jung * Author: Michael Ruettger * * This program is free software; you can redistribute it and/or modify -- cgit v1.2.3 From bc5a5b02331a3175a5fca20a4beba249e573b672 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Tue, 2 Sep 2014 19:21:47 -0700 Subject: Drivers: hv: util: Properly pack the data for file copy functionality Properly pack the data for file copy functionality. Patch based on investigation done by Matej Muzila Signed-off-by: K. Y. Srinivasan Reported-by: Cc: Acked-by: Jason Wang Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/hyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index 78e4a86030dd..0a8e6badb29b 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -137,7 +137,7 @@ struct hv_do_fcopy { __u64 offset; __u32 size; __u8 data[DATA_FRAGMENT]; -}; +} __attribute__((packed)); /* * An implementation of HyperV key value pair (KVP) functionality for Linux. -- cgit v1.2.3 From 846fc70986a65563a19ae86928c3acf34f12296d Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 13 Aug 2014 02:22:40 -0400 Subject: PCI/AER: Rename PCI_ERR_UNC_TRAIN to PCI_ERR_UNC_UND In PCIe r1.0, sec 5.10.2, bit 0 of the Uncorrectable Error Status, Mask, and Severity Registers was for "Training Error." In PCIe r1.1, sec 7.10.2, bit 0 was redefined to be "Undefined." Rename PCI_ERR_UNC_TRAIN to PCI_ERR_UNC_UND to reflect this change. No functional change. [bhelgaas: changelog] Signed-off-by: Chen, Gong Signed-off-by: Bjorn Helgaas --- drivers/vfio/pci/vfio_pci_config.c | 2 +- include/ras/ras_event.h | 2 +- include/uapi/linux/pci_regs.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index e50790e91f76..1de3f94aa7de 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -727,7 +727,7 @@ static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm) p_setd(perm, 0, ALL_VIRT, NO_WRITE); /* Writable bits mask */ - mask = PCI_ERR_UNC_TRAIN | /* Training */ + mask = PCI_ERR_UNC_UND | /* Undefined */ PCI_ERR_UNC_DLP | /* Data Link Protocol */ PCI_ERR_UNC_SURPDN | /* Surprise Down */ PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 0f04a9755d1e..79abb9c71772 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -185,7 +185,7 @@ TRACE_EVENT(mc_event, {PCI_ERR_COR_LOG_OVER, "Header Log Overflow"} #define aer_uncorrectable_errors \ - {PCI_ERR_UNC_TRAIN, "Undefined"}, \ + {PCI_ERR_UNC_UND, "Undefined"}, \ {PCI_ERR_UNC_DLP, "Data Link Protocol Error"}, \ {PCI_ERR_UNC_SURPDN, "Surprise Down Error"}, \ {PCI_ERR_UNC_POISON_TLP,"Poisoned TLP"}, \ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 30db069bce62..99e3182f2c96 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -630,7 +630,7 @@ /* Advanced Error Reporting */ #define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ -#define PCI_ERR_UNC_TRAIN 0x00000001 /* Training */ +#define PCI_ERR_UNC_UND 0x00000001 /* Undefined */ #define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */ #define PCI_ERR_UNC_SURPDN 0x00000020 /* Surprise Down */ #define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */ -- cgit v1.2.3 From 5c2cacc1028917168b0f7650008dceaa6f7e3fe2 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 24 Sep 2014 09:47:27 -0300 Subject: [media] v4l2-dv-timings: fix a sparse warning This is detected with: gcc-4.8.3-7.fc20.x86_64 sparse-0.5.0-3.fc20.x86_64 drivers/media/v4l2-core/v4l2-dv-timings.c:34:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:35:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:36:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:37:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:38:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:39:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:40:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:41:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:42:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:43:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:44:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:45:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:46:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:47:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:48:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:49:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:50:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:51:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:52:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:53:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:54:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:55:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:56:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:57:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:58:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:59:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:60:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:61:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:62:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:63:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:64:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:65:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:66:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:67:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:68:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:69:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:70:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:71:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:72:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:73:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:74:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:75:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:76:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:77:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:78:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:79:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:80:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:81:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:82:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:83:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:84:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:85:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:86:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:87:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:88:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:89:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:90:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:91:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:92:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:93:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:94:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:95:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:96:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:97:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:98:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:99:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:100:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:101:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:102:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:103:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:104:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:105:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:106:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:107:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:108:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:109:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:110:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:111:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:112:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:113:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:114:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:115:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:116:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:117:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:118:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:119:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:120:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:121:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:122:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:123:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:124:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:125:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:126:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:127:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:128:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:129:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:130:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:131:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:132:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:133:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:134:9: error: unknown field name in initializer drivers/media/v4l2-core/v4l2-dv-timings.c:135:9: error: too many errors drivers/media/usb/hdpvr/hdpvr-video.c:42:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:43:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:44:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:45:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:46:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:47:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:48:9: error: unknown field name in initializer drivers/media/usb/hdpvr/hdpvr-video.c:49:9: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:484:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:485:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:486:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:487:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:488:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:489:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:490:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:491:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:492:18: error: unknown field name in initializer drivers/media/platform/s5p-tv/hdmi_drv.c:493:18: error: unknown field name in initializer Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-dv-timings.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 6c8f159e416e..6a0764c89fcb 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -21,17 +21,8 @@ #ifndef _V4L2_DV_TIMINGS_H #define _V4L2_DV_TIMINGS_H -#if __GNUC__ < 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ < 6)) -/* Sadly gcc versions older than 4.6 have a bug in how they initialize - anonymous unions where they require additional curly brackets. - This violates the C1x standard. This workaround adds the curly brackets - if needed. */ #define V4L2_INIT_BT_TIMINGS(_width, args...) \ { .bt = { _width , ## args } } -#else -#define V4L2_INIT_BT_TIMINGS(_width, args...) \ - .bt = { _width , ## args } -#endif /* CEA-861-E timings (i.e. standard HDTV timings) */ -- cgit v1.2.3 From 29075feaf1f55e6b1aa4054b44bc141e8d5eab0b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 26 Sep 2014 09:05:39 -0700 Subject: next: openrisc: Fix build openrisc:defconfig fails to build in next-20140926 with the following error. In file included from arch/openrisc/kernel/signal.c:31:0: ./arch/openrisc/include/asm/syscall.h: In function 'syscall_get_arch': ./arch/openrisc/include/asm/syscall.h:77:9: error: 'EM_OPENRISC' undeclared Fix by moving EM_OPENRISC to include/uapi/linux/elf-em.h. Fixes: ce5d112827e5 ("ARCH: AUDIT: implement syscall_get_arch for all arches") Cc: Eric Paris Cc: Stefan Kristiansson Cc: Geert Uytterhoeven Cc: Stephen Rothwell Signed-off-by: Guenter Roeck Signed-off-by: Eric Paris --- arch/openrisc/include/uapi/asm/elf.h | 3 +-- include/uapi/linux/elf-em.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/openrisc/include/uapi/asm/elf.h b/arch/openrisc/include/uapi/asm/elf.h index f02ea5830420..88842760e66f 100644 --- a/arch/openrisc/include/uapi/asm/elf.h +++ b/arch/openrisc/include/uapi/asm/elf.h @@ -55,9 +55,8 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; /* A placeholder; OR32 does not have fp support yes, so no fp regs for now. */ typedef unsigned long elf_fpregset_t; -/* This should be moved to include/linux/elf.h */ +/* EM_OPENRISC is defined in linux/elf-em.h */ #define EM_OR32 0x8472 -#define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */ /* * These are used to set parameters in the core dumps. diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index 01529bd96438..aa90bc98b6e2 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -32,6 +32,7 @@ #define EM_V850 87 /* NEC v850 */ #define EM_M32R 88 /* Renesas M32R */ #define EM_MN10300 89 /* Panasonic/MEI MN10300, AM33 */ +#define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */ #define EM_BLACKFIN 106 /* ADI Blackfin Processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_AARCH64 183 /* ARM 64 bit */ -- cgit v1.2.3 From 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:57 -0700 Subject: bpf: introduce BPF syscall and maps BPF syscall is a multiplexor for a range of different operations on eBPF. This patch introduces syscall with single command to create a map. Next patch adds commands to access maps. 'maps' is a generic storage of different types for sharing data between kernel and userspace. Userspace example: /* this syscall wrapper creates a map with given type and attributes * and returns map_fd on success. * use close(map_fd) to delete the map */ int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries) { union bpf_attr attr = { .map_type = map_type, .key_size = key_size, .value_size = value_size, .max_entries = max_entries }; return bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } 'union bpf_attr' is backwards compatible with future extensions. More details in Documentation/networking/filter.txt and in manpage Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 39 +++++++++ include/linux/bpf.h | 41 +++++++++ include/uapi/linux/bpf.h | 23 +++++ kernel/bpf/Makefile | 2 +- kernel/bpf/syscall.c | 169 ++++++++++++++++++++++++++++++++++++ 5 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf.h create mode 100644 kernel/bpf/syscall.c (limited to 'include/uapi') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 014e0319a5c4..4a01d71785e9 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -1001,6 +1001,45 @@ instruction that loads 64-bit immediate value into a dst_reg. Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM which loads 32-bit immediate value into a register. +eBPF maps +--------- +'maps' is a generic storage of different types for sharing data between kernel +and userspace. + +The maps are accessed from user space via BPF syscall, which has commands: +- create a map with given type and attributes + map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) + using attr->map_type, attr->key_size, attr->value_size, attr->max_entries + returns process-local file descriptor or negative error + +- lookup key in a given map + err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero and stores found elem into value or negative error + +- create or update key/value pair in a given map + err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key, attr->value + returns zero or negative error + +- find and delete element by key in a given map + err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + using attr->map_fd, attr->key + +- to delete map: close(fd) + Exiting process will delete maps automatically + +userspace programs use this syscall to create/access maps that eBPF programs +are concurrently updating. + +maps can have different types: hash, array, bloom filter, radix-tree, etc. + +The map is defined by: + . type + . max number of elements + . key size in bytes + . value size in bytes + Testing ------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h new file mode 100644 index 000000000000..48014a71f0fe --- /dev/null +++ b/include/linux/bpf.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _LINUX_BPF_H +#define _LINUX_BPF_H 1 + +#include +#include + +struct bpf_map; + +/* map is generic key/value storage optionally accesible by eBPF programs */ +struct bpf_map_ops { + /* funcs callable from userspace (via syscall) */ + struct bpf_map *(*map_alloc)(union bpf_attr *attr); + void (*map_free)(struct bpf_map *); +}; + +struct bpf_map { + atomic_t refcnt; + enum bpf_map_type map_type; + u32 key_size; + u32 value_size; + u32 max_entries; + struct bpf_map_ops *ops; + struct work_struct work; +}; + +struct bpf_map_type_list { + struct list_head list_node; + struct bpf_map_ops *ops; + enum bpf_map_type type; +}; + +void bpf_register_map_type(struct bpf_map_type_list *tl); +void bpf_map_put(struct bpf_map *map); + +#endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 479ed0b6be16..f58a10f9670c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -62,4 +62,27 @@ struct bpf_insn { __s32 imm; /* signed immediate constant */ }; +/* BPF syscall commands */ +enum bpf_cmd { + /* create a map with given type and attributes + * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size) + * returns fd or negative error + * map is deleted when fd is closed + */ + BPF_MAP_CREATE, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, +}; + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + }; +} __attribute__((aligned(8))); + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a71145e2769..e9f7334ed07a 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1 +1 @@ -obj-y := core.o +obj-y := core.o syscall.o diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000000..428a0e23adc0 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,169 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} -- cgit v1.2.3 From 749730ce42a2121e1c88350d69478bff3994b10a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:58 -0700 Subject: bpf: enable bpf syscall on x64 and i386 done as separate commit to ease conflict resolution Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 3 ++- include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 3 +++ 5 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 028b78168d85..9fe1b5d002f0 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -363,3 +363,4 @@ 354 i386 seccomp sys_seccomp 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create +357 i386 bpf sys_bpf diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 35dd922727b9..281150b539a2 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -327,6 +327,7 @@ 318 common getrandom sys_getrandom 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load +321 common bpf sys_bpf # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 0f86d85a9ce4..bda9b81357cc 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -65,6 +65,7 @@ struct old_linux_dirent; struct perf_event_attr; struct file_handle; struct sigaltstack; +union bpf_attr; #include #include @@ -875,5 +876,5 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, const char __user *uargs); asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); - +asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 11d11bc5c78f..22749c134117 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -705,9 +705,11 @@ __SYSCALL(__NR_seccomp, sys_seccomp) __SYSCALL(__NR_getrandom, sys_getrandom) #define __NR_memfd_create 279 __SYSCALL(__NR_memfd_create, sys_memfd_create) +#define __NR_bpf 280 +__SYSCALL(__NR_bpf, sys_bpf) #undef __NR_syscalls -#define __NR_syscalls 280 +#define __NR_syscalls 281 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 391d4ddb6f4b..b4b5083f5f5e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -218,3 +218,6 @@ cond_syscall(sys_kcmp); /* operate on Secure Computing state */ cond_syscall(sys_seccomp); + +/* access BPF programs and maps */ +cond_syscall(sys_bpf); -- cgit v1.2.3 From db20fd2b01087bdfbe30bce314a198eefedcc42e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:16:59 -0700 Subject: bpf: add lookup/update/delete/iterate methods to BPF maps 'maps' is a generic storage of different types for sharing data between kernel and userspace. The maps are accessed from user space via BPF syscall, which has commands: - create a map with given type and attributes fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size) returns fd or negative error - lookup key in a given map referenced by fd err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero and stores found elem into value or negative error - create or update key/value pair in a given map err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->value returns zero or negative error - find and delete element by key in a given map err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key - iterate map elements (based on input key return next_key) err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) using attr->map_fd, attr->key, attr->next_key - close(fd) deletes the map Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 8 ++ include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/syscall.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48014a71f0fe..2887f3f9da59 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -9,6 +9,7 @@ #include #include +#include struct bpf_map; @@ -17,6 +18,12 @@ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_free)(struct bpf_map *); + int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + + /* funcs callable from userspace and from eBPF programs */ + void *(*map_lookup_elem)(struct bpf_map *map, void *key); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_delete_elem)(struct bpf_map *map, void *key); }; struct bpf_map { @@ -37,5 +44,6 @@ struct bpf_map_type_list { void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); +struct bpf_map *bpf_map_get(struct fd f); #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f58a10f9670c..395cabd2ca0a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -70,6 +70,35 @@ enum bpf_cmd { * map is deleted when fd is closed */ BPF_MAP_CREATE, + + /* lookup key in a given map + * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero and stores found elem into value + * or negative error + */ + BPF_MAP_LOOKUP_ELEM, + + /* create or update key/value pair in a given map + * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->value + * returns zero or negative error + */ + BPF_MAP_UPDATE_ELEM, + + /* find and delete elem by key in a given map + * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key + * returns zero or negative error + */ + BPF_MAP_DELETE_ELEM, + + /* lookup key in a given map and return next key + * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size) + * Using attr->map_fd, attr->key, attr->next_key + * returns zero and stores next key or negative error + */ + BPF_MAP_GET_NEXT_KEY, }; enum bpf_map_type { @@ -83,6 +112,15 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + }; } __attribute__((aligned(8))); #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 428a0e23adc0..f94349ecaf61 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -13,6 +13,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -111,6 +112,228 @@ free_map: return err; } +/* if error is returned, fd is released. + * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ESRCH; + rcu_read_lock(); + value = map->ops->map_lookup_elem(map, key); + if (!value) + goto err_unlock; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto err_unlock; + + err = 0; + +err_unlock: + rcu_read_unlock(); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_CREATE: err = map_create(&attr); break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:00 -0700 Subject: bpf: expand BPF syscall with program load/unload eBPF programs are similar to kernel modules. They are loaded by the user process and automatically unloaded when process exits. Each eBPF program is a safe run-to-completion set of instructions. eBPF verifier statically determines that the program terminates and is safe to execute. The following syscall wrapper can be used to load the program: int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int insn_cnt, const char *license) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64(insns), .insn_cnt = insn_cnt, .license = ptr_to_u64(license), }; return bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } where 'insns' is an array of eBPF instructions and 'license' is a string that must be GPL compatible to call helper functions marked gpl_only Upon succesful load the syscall returns prog_fd. Use close(prog_fd) to unload the program. User space tests and examples follow in the later patches Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 38 +++++++++++ include/linux/filter.h | 8 +-- include/uapi/linux/bpf.h | 26 ++++++++ kernel/bpf/core.c | 29 +++++---- kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 246 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2887f3f9da59..92979182be81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -46,4 +46,42 @@ void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_map_put(struct bpf_map *map); struct bpf_map *bpf_map_get(struct fd f); +/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs + * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL + * instructions after verifying + */ +struct bpf_func_proto { + u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); + bool gpl_only; +}; + +struct bpf_verifier_ops { + /* return eBPF function prototype for verification */ + const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +}; + +struct bpf_prog_type_list { + struct list_head list_node; + struct bpf_verifier_ops *ops; + enum bpf_prog_type type; +}; + +void bpf_register_prog_type(struct bpf_prog_type_list *tl); + +struct bpf_prog; + +struct bpf_prog_aux { + atomic_t refcnt; + bool is_gpl_compatible; + enum bpf_prog_type prog_type; + struct bpf_verifier_ops *ops; + struct bpf_map **used_maps; + u32 used_map_cnt; + struct bpf_prog *prog; + struct work_struct work; +}; + +void bpf_prog_put(struct bpf_prog *prog); +struct bpf_prog *bpf_prog_get(u32 ufd); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 1a0bc6d134d7..4ffc0958d85e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,6 +21,7 @@ struct sk_buff; struct sock; struct seccomp_data; +struct bpf_prog_aux; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -300,17 +301,12 @@ struct bpf_binary_header { u8 image[]; }; -struct bpf_work_struct { - struct bpf_prog *prog; - struct work_struct work; -}; - struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ u32 len; /* Number of filter blocks */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ - struct bpf_work_struct *work; /* Deferred free work struct */ + struct bpf_prog_aux *aux; /* Auxiliary fields */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395cabd2ca0a..424f442016e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -99,12 +99,23 @@ enum bpf_cmd { * returns zero and stores next key or negative error */ BPF_MAP_GET_NEXT_KEY, + + /* verify and load eBPF program + * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size) + * Using attr->prog_type, attr->insns, attr->license + * returns fd or negative error + */ + BPF_PROG_LOAD, }; enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, }; +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, +}; + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -121,6 +132,21 @@ union bpf_attr { __aligned_u64 next_key; }; }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + }; } __attribute__((aligned(8))); +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +enum bpf_func_id { + BPF_FUNC_unspec, + __BPF_FUNC_MAX_ID, +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8b7002488251..f0c30c59b317 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include #include #include +#include /* Registers */ #define BPF_R0 regs[BPF_REG_0] @@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | gfp_extra_flags; - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); @@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags); - if (ws == NULL) { + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { vfree(fp); return NULL; } fp->pages = size / PAGE_SIZE; - fp->work = ws; + fp->aux = aux; return fp; } @@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = size / PAGE_SIZE; - /* We keep fp->work from fp_old around in the new + /* We keep fp->aux from fp_old around in the new * reallocated structure. */ - fp_old->work = NULL; + fp_old->aux = NULL; __bpf_prog_free(fp_old); } @@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc); void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->work); + kfree(fp->aux); vfree(fp); } EXPORT_SYMBOL_GPL(__bpf_prog_free); @@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static void bpf_prog_free_deferred(struct work_struct *work) { - struct bpf_work_struct *ws; + struct bpf_prog_aux *aux; - ws = container_of(work, struct bpf_work_struct, work); - bpf_jit_free(ws->prog); + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); } /* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { - struct bpf_work_struct *ws = fp->work; + struct bpf_prog_aux *aux = fp->aux; - INIT_WORK(&ws->work, bpf_prog_free_deferred); - ws->prog = fp; - schedule_work(&ws->work); + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f94349ecaf61..0afb4eaa1887 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static LIST_HEAD(bpf_map_types); @@ -334,6 +336,166 @@ err_put: return err; } +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->aux->prog_type = type; + return 0; + } + } + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD license + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->aux->is_gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + /* err = bpf_check(prog, tb); */ + + if (err < 0) + goto free_used_maps; + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From cbd357008604925355ae7b54a09137dabb81b580 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 26 Sep 2014 00:17:03 -0700 Subject: bpf: verifier (add ability to receive verification log) add optional attributes for BPF_PROG_LOAD syscall: union bpf_attr { struct { ... __u32 log_level; /* verbosity level of eBPF verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied 'char *buffer' */ }; }; when log_level > 0 the verifier will return its verification log in the user supplied buffer 'log_buf' which can be used by program author to analyze why verifier rejected given program. 'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt provides several examples of these messages, like the program: BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), will be rejected with the following multi-line message in log_buf: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 0 4: (85) call 1 5: (15) if r0 == 0x0 goto pc+1 R0=map_ptr R10=fp 6: (7a) *(u64 *)(r0 +4) = 0 misaligned access off 4 size 8 The format of the output can change at any time as verifier evolves. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 + kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 424f442016e7..31b0ac208a52 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -138,6 +138,9 @@ union bpf_attr { __u32 insn_cnt; __aligned_u64 insns; __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 74b3628c5fdb..ba61c8c16032 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD license +#define BPF_PROG_LOAD_LAST_FIELD log_buf static int bpf_prog_load(union bpf_attr *attr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d6f9c3d6b4d7..871edc1f2e1f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -125,9 +125,244 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct verifier_env { +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) +{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) { + char __user *log_ubuf = NULL; + struct verifier_env *env; int ret = -EINVAL; + if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + /* ret = do_check(env); */ + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + kfree(env); + mutex_unlock(&bpf_verifier_lock); return ret; } -- cgit v1.2.3 From e3118e8359bb7c59555aca60c725106e6d78c5ce Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2014 22:37:36 +0200 Subject: net: tcp: add DCTCP congestion control algorithm This work adds the DataCenter TCP (DCTCP) congestion control algorithm [1], which has been first published at SIGCOMM 2010 [2], resp. follow-up analysis at SIGMETRICS 2011 [3] (and also, more recently as an informational IETF draft available at [4]). DCTCP is an enhancement to the TCP congestion control algorithm for data center networks. Typical data center workloads are i.e. i) partition/aggregate (queries; bursty, delay sensitive), ii) short messages e.g. 50KB-1MB (for coordination and control state; delay sensitive), and iii) large flows e.g. 1MB-100MB (data update; throughput sensitive). DCTCP has therefore been designed for such environments to provide/achieve the following three requirements: * High burst tolerance (incast due to partition/aggregate) * Low latency (short flows, queries) * High throughput (continuous data updates, large file transfers) with commodity, shallow buffered switches The basic idea of its design consists of two fundamentals: i) on the switch side, packets are being marked when its internal queue length > threshold K (K is chosen so that a large enough headroom for marked traffic is still available in the switch queue); ii) the sender/host side maintains a moving average of the fraction of marked packets, so each RTT, F is being updated as follows: F := X / Y, where X is # of marked ACKs, Y is total # of ACKs alpha := (1 - g) * alpha + g * F, where g is a smoothing constant The resulting alpha (iow: probability that switch queue is congested) is then being used in order to adaptively decrease the congestion window W: W := (1 - (alpha / 2)) * W The means for receiving marked packets resp. marking them on switch side in DCTCP is the use of ECN. RFC3168 describes a mechanism for using Explicit Congestion Notification from the switch for early detection of congestion, rather than waiting for segment loss to occur. However, this method only detects the presence of congestion, not the *extent*. In the presence of mild congestion, it reduces the TCP congestion window too aggressively and unnecessarily affects the throughput of long flows [4]. DCTCP, as mentioned, enhances Explicit Congestion Notification (ECN) processing to estimate the fraction of bytes that encounter congestion, rather than simply detecting that some congestion has occurred. DCTCP then scales the TCP congestion window based on this estimate [4], thus it can derive multibit feedback from the information present in the single-bit sequence of marks in its control law. And thus act in *proportion* to the extent of congestion, not its *presence*. Switches therefore set the Congestion Experienced (CE) codepoint in packets when internal queue lengths exceed threshold K. Resulting, DCTCP delivers the same or better throughput than normal TCP, while using 90% less buffer space. It was found in [2] that DCTCP enables the applications to handle 10x the current background traffic, without impacting foreground traffic. Moreover, a 10x increase in foreground traffic did not cause any timeouts, and thus largely eliminates TCP incast collapse problems. The algorithm itself has already seen deployments in large production data centers since then. We did a long-term stress-test and analysis in a data center, short summary of our TCP incast tests with iperf compared to cubic: This test measured DCTCP throughput and latency and compared it with CUBIC throughput and latency for an incast scenario. In this test, 19 senders sent at maximum rate to a single receiver. The receiver simply ran iperf -s. The senders ran iperf -c -t 30. All senders started simultaneously (using local clocks synchronized by ntp). This test was repeated multiple times. Below shows the results from a single test. Other tests are similar. (DCTCP results were extremely consistent, CUBIC results show some variance induced by the TCP timeouts that CUBIC encountered.) For this test, we report statistics on the number of TCP timeouts, flow throughput, and traffic latency. 1) Timeouts (total over all flows, and per flow summaries): CUBIC DCTCP Total 3227 25 Mean 169.842 1.316 Median 183 1 Max 207 5 Min 123 0 Stddev 28.991 1.600 Timeout data is taken by measuring the net change in netstat -s "other TCP timeouts" reported. As a result, the timeout measurements above are not restricted to the test traffic, and we believe that it is likely that all of the "DCTCP timeouts" are actually timeouts for non-test traffic. We report them nevertheless. CUBIC will also include some non-test timeouts, but they are drawfed by bona fide test traffic timeouts for CUBIC. Clearly DCTCP does an excellent job of preventing TCP timeouts. DCTCP reduces timeouts by at least two orders of magnitude and may well have eliminated them in this scenario. 2) Throughput (per flow in Mbps): CUBIC DCTCP Mean 521.684 521.895 Median 464 523 Max 776 527 Min 403 519 Stddev 105.891 2.601 Fairness 0.962 0.999 Throughput data was simply the average throughput for each flow reported by iperf. By avoiding TCP timeouts, DCTCP is able to achieve much better per-flow results. In CUBIC, many flows experience TCP timeouts which makes flow throughput unpredictable and unfair. DCTCP, on the other hand, provides very clean predictable throughput without incurring TCP timeouts. Thus, the standard deviation of CUBIC throughput is dramatically higher than the standard deviation of DCTCP throughput. Mean throughput is nearly identical because even though cubic flows suffer TCP timeouts, other flows will step in and fill the unused bandwidth. Note that this test is something of a best case scenario for incast under CUBIC: it allows other flows to fill in for flows experiencing a timeout. Under situations where the receiver is issuing requests and then waiting for all flows to complete, flows cannot fill in for timed out flows and throughput will drop dramatically. 3) Latency (in ms): CUBIC DCTCP Mean 4.0088 0.04219 Median 4.055 0.0395 Max 4.2 0.085 Min 3.32 0.028 Stddev 0.1666 0.01064 Latency for each protocol was computed by running "ping -i 0.2 " from a single sender to the receiver during the incast test. For DCTCP, "ping -Q 0x6 -i 0.2 " was used to ensure that traffic traversed the DCTCP queue and was not dropped when the queue size was greater than the marking threshold. The summary statistics above are over all ping metrics measured between the single sender, receiver pair. The latency results for this test show a dramatic difference between CUBIC and DCTCP. CUBIC intentionally overflows the switch buffer which incurs the maximum queue latency (more buffer memory will lead to high latency.) DCTCP, on the other hand, deliberately attempts to keep queue occupancy low. The result is a two orders of magnitude reduction of latency with DCTCP - even with a switch with relatively little RAM. Switches with larger amounts of RAM will incur increasing amounts of latency for CUBIC, but not for DCTCP. 4) Convergence and stability test: This test measured the time that DCTCP took to fairly redistribute bandwidth when a new flow commences. It also measured DCTCP's ability to remain stable at a fair bandwidth distribution. DCTCP is compared with CUBIC for this test. At the commencement of this test, a single flow is sending at maximum rate (near 10 Gbps) to a single receiver. One second after that first flow commences, a new flow from a distinct server begins sending to the same receiver as the first flow. After the second flow has sent data for 10 seconds, the second flow is terminated. The first flow sends for an additional second. Ideally, the bandwidth would be evenly shared as soon as the second flow starts, and recover as soon as it stops. The results of this test are shown below. Note that the flow bandwidth for the two flows was measured near the same time, but not simultaneously. DCTCP performs nearly perfectly within the measurement limitations of this test: bandwidth is quickly distributed fairly between the two flows, remains stable throughout the duration of the test, and recovers quickly. CUBIC, in contrast, is slow to divide the bandwidth fairly, and has trouble remaining stable. CUBIC DCTCP Seconds Flow 1 Flow 2 Seconds Flow 1 Flow 2 0 9.93 0 0 9.92 0 0.5 9.87 0 0.5 9.86 0 1 8.73 2.25 1 6.46 4.88 1.5 7.29 2.8 1.5 4.9 4.99 2 6.96 3.1 2 4.92 4.94 2.5 6.67 3.34 2.5 4.93 5 3 6.39 3.57 3 4.92 4.99 3.5 6.24 3.75 3.5 4.94 4.74 4 6 3.94 4 5.34 4.71 4.5 5.88 4.09 4.5 4.99 4.97 5 5.27 4.98 5 4.83 5.01 5.5 4.93 5.04 5.5 4.89 4.99 6 4.9 4.99 6 4.92 5.04 6.5 4.93 5.1 6.5 4.91 4.97 7 4.28 5.8 7 4.97 4.97 7.5 4.62 4.91 7.5 4.99 4.82 8 5.05 4.45 8 5.16 4.76 8.5 5.93 4.09 8.5 4.94 4.98 9 5.73 4.2 9 4.92 5.02 9.5 5.62 4.32 9.5 4.87 5.03 10 6.12 3.2 10 4.91 5.01 10.5 6.91 3.11 10.5 4.87 5.04 11 8.48 0 11 8.49 4.94 11.5 9.87 0 11.5 9.9 0 SYN/ACK ECT test: This test demonstrates the importance of ECT on SYN and SYN-ACK packets by measuring the connection probability in the presence of competing flows for a DCTCP connection attempt *without* ECT in the SYN packet. The test was repeated five times for each number of competing flows. Competing Flows 1 | 2 | 4 | 8 | 16 ------------------------------ Mean Connection Probability 1 | 0.67 | 0.45 | 0.28 | 0 Median Connection Probability 1 | 0.65 | 0.45 | 0.25 | 0 As the number of competing flows moves beyond 1, the connection probability drops rapidly. Enabling DCTCP with this patch requires the following steps: DCTCP must be running both on the sender and receiver side in your data center, i.e.: sysctl -w net.ipv4.tcp_congestion_control=dctcp Also, ECN functionality must be enabled on all switches in your data center for DCTCP to work. The default ECN marking threshold (K) heuristic on the switch for DCTCP is e.g., 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps (K > 1/7 * C * RTT, [4]). In above tests, for each switch port, traffic was segregated into two queues. For any packet with a DSCP of 0x01 - or equivalently a TOS of 0x04 - the packet was placed into the DCTCP queue. All other packets were placed into the default drop-tail queue. For the DCTCP queue, RED/ECN marking was enabled, here, with a marking threshold of 75 KB. More details however, we refer you to the paper [2] under section 3). There are no code changes required to applications running in user space. DCTCP has been implemented in full *isolation* of the rest of the TCP code as its own congestion control module, so that it can run without a need to expose code to the core of the TCP stack, and thus nothing changes for non-DCTCP users. Changes in the CA framework code are minimal, and DCTCP algorithm operates on mechanisms that are already available in most Silicon. The gain (dctcp_shift_g) is currently a fixed constant (1/16) from the paper, but we leave the option that it can be chosen carefully to a different value by the user. In case DCTCP is being used and ECN support on peer site is off, DCTCP falls back after 3WHS to operate in normal TCP Reno mode. ss {-4,-6} -t -i diag interface: ... dctcp wscale:7,7 rto:203 rtt:2.349/0.026 mss:1448 cwnd:2054 ssthresh:1102 ce_state 0 alpha 15 ab_ecn 0 ab_tot 735584 send 10129.2Mbps pacing_rate 20254.1Mbps unacked:1822 retrans:0/15 reordering:101 rcv_space:29200 ... dctcp-reno wscale:7,7 rto:201 rtt:0.711/1.327 ato:40 mss:1448 cwnd:10 ssthresh:1102 fallback_mode send 162.9Mbps pacing_rate 325.5Mbps rcv_rtt:1.5 rcv_space:29200 More information about DCTCP can be found in [1-4]. [1] http://simula.stanford.edu/~alizade/Site/DCTCP.html [2] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf [3] http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf [4] http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 Joint work with Florian Westphal and Glenn Judd. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: Glenn Judd Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/dctcp.txt | 43 +++++ include/uapi/linux/inet_diag.h | 13 +- net/ipv4/Kconfig | 26 ++- net/ipv4/Makefile | 1 + net/ipv4/tcp_dctcp.c | 344 +++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_output.c | 1 + 6 files changed, 425 insertions(+), 3 deletions(-) create mode 100644 Documentation/networking/dctcp.txt create mode 100644 net/ipv4/tcp_dctcp.c (limited to 'include/uapi') diff --git a/Documentation/networking/dctcp.txt b/Documentation/networking/dctcp.txt new file mode 100644 index 000000000000..0d5dfbc89ec9 --- /dev/null +++ b/Documentation/networking/dctcp.txt @@ -0,0 +1,43 @@ +DCTCP (DataCenter TCP) +---------------------- + +DCTCP is an enhancement to the TCP congestion control algorithm for data +center networks and leverages Explicit Congestion Notification (ECN) in +the data center network to provide multi-bit feedback to the end hosts. + +To enable it on end hosts: + + sysctl -w net.ipv4.tcp_congestion_control=dctcp + +All switches in the data center network running DCTCP must support ECN +marking and be configured for marking when reaching defined switch buffer +thresholds. The default ECN marking threshold heuristic for DCTCP on +switches is 20 packets (30KB) at 1Gbps, and 65 packets (~100KB) at 10Gbps, +but might need further careful tweaking. + +For more details, see below documents: + +Paper: + +The algorithm is further described in detail in the following two +SIGCOMM/SIGMETRICS papers: + + i) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + "Data Center TCP (DCTCP)", Data Center Networks session + Proc. ACM SIGCOMM, New Delhi, 2010. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + http://www.sigcomm.org/ccr/papers/2010/October/1851275.1851192 + +ii) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + "Analysis of DCTCP: Stability, Convergence, and Fairness" + Proc. ACM SIGMETRICS, San Jose, 2011. + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + +IETF informational draft: + + http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-00 + +DCTCP site: + + http://simula.stanford.edu/~alizade/Site/DCTCP.html diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbde90fa5838..d65c0a09efd3 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -110,10 +110,10 @@ enum { INET_DIAG_TCLASS, INET_DIAG_SKMEMINFO, INET_DIAG_SHUTDOWN, + INET_DIAG_DCTCPINFO, }; -#define INET_DIAG_MAX INET_DIAG_SHUTDOWN - +#define INET_DIAG_MAX INET_DIAG_DCTCPINFO /* INET_DIAG_MEM */ @@ -133,5 +133,14 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; +/* INET_DIAG_DCTCPINFO */ + +struct tcp_dctcp_info { + __u16 dctcp_enabled; + __u16 dctcp_ce_state; + __u32 dctcp_alpha; + __u32 dctcp_ab_ecn; + __u32 dctcp_ab_tot; +}; #endif /* _UAPI_INET_DIAG_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 84f710b7472a..69fb37854449 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -570,6 +570,27 @@ config TCP_CONG_ILLINOIS For further details see: http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html +config TCP_CONG_DCTCP + tristate "DataCenter TCP (DCTCP)" + default n + ---help--- + DCTCP leverages Explicit Congestion Notification (ECN) in the network to + provide multi-bit feedback to the end hosts. It is designed to provide: + + - High burst tolerance (incast due to partition/aggregate), + - Low latency (short flows, queries), + - High throughput (continuous data updates, large file transfers) with + commodity, shallow-buffered switches. + + All switches in the data center network running DCTCP must support + ECN marking and be configured for marking when reaching defined switch + buffer thresholds. The default ECN marking threshold heuristic for + DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets + (~100KB) at 10Gbps, but might need further careful tweaking. + + For further details see: + http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + choice prompt "Default TCP congestion control" default DEFAULT_CUBIC @@ -598,9 +619,11 @@ choice config DEFAULT_WESTWOOD bool "Westwood" if TCP_CONG_WESTWOOD=y + config DEFAULT_DCTCP + bool "DCTCP" if TCP_CONG_DCTCP=y + config DEFAULT_RENO bool "Reno" - endchoice endif @@ -620,6 +643,7 @@ config DEFAULT_TCP_CONG default "westwood" if DEFAULT_WESTWOOD default "veno" if DEFAULT_VENO default "reno" if DEFAULT_RENO + default "dctcp" if DEFAULT_DCTCP default "cubic" config TCP_MD5SIG diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d78d404c596f..d8105787c199 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o +obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c new file mode 100644 index 000000000000..b504371af742 --- /dev/null +++ b/net/ipv4/tcp_dctcp.c @@ -0,0 +1,344 @@ +/* DataCenter TCP (DCTCP) congestion control. + * + * http://simula.stanford.edu/~alizade/Site/DCTCP.html + * + * This is an implementation of DCTCP over Reno, an enhancement to the + * TCP congestion control algorithm designed for data centers. DCTCP + * leverages Explicit Congestion Notification (ECN) in the network to + * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet + * the following three data center transport requirements: + * + * - High burst tolerance (incast due to partition/aggregate) + * - Low latency (short flows, queries) + * - High throughput (continuous data updates, large file transfers) + * with commodity shallow buffered switches + * + * The algorithm is described in detail in the following two papers: + * + * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye, + * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan: + * "Data Center TCP (DCTCP)", Data Center Networks session + * Proc. ACM SIGCOMM, New Delhi, 2010. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf + * + * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar: + * "Analysis of DCTCP: Stability, Convergence, and Fairness" + * Proc. ACM SIGMETRICS, San Jose, 2011. + * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf + * + * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh. + * + * Authors: + * + * Daniel Borkmann + * Florian Westphal + * Glenn Judd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include +#include +#include + +#define DCTCP_MAX_ALPHA 1024U + +struct dctcp { + u32 acked_bytes_ecn; + u32 acked_bytes_total; + u32 prior_snd_una; + u32 prior_rcv_nxt; + u32 dctcp_alpha; + u32 next_seq; + u32 ce_state; + u32 delayed_ack_reserved; +}; + +static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ +module_param(dctcp_shift_g, uint, 0644); +MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha"); + +static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA; +module_param(dctcp_alpha_on_init, uint, 0644); +MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value"); + +static unsigned int dctcp_clamp_alpha_on_loss __read_mostly; +module_param(dctcp_clamp_alpha_on_loss, uint, 0644); +MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss, + "parameter for clamping alpha on loss"); + +static struct tcp_congestion_ops dctcp_reno; + +static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca) +{ + ca->next_seq = tp->snd_nxt; + + ca->acked_bytes_ecn = 0; + ca->acked_bytes_total = 0; +} + +static void dctcp_init(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if ((tp->ecn_flags & TCP_ECN_OK) || + (sk->sk_state == TCP_LISTEN || + sk->sk_state == TCP_CLOSE)) { + struct dctcp *ca = inet_csk_ca(sk); + + ca->prior_snd_una = tp->snd_una; + ca->prior_rcv_nxt = tp->rcv_nxt; + + ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); + + ca->delayed_ack_reserved = 0; + ca->ce_state = 0; + + dctcp_reset(tp, ca); + return; + } + + /* No ECN support? Fall back to Reno. Also need to clear + * ECT from sk since it is set during 3WHS for DCTCP. + */ + inet_csk(sk)->icsk_ca_ops = &dctcp_reno; + INET_ECN_dontxmit(sk); +} + +static u32 dctcp_ssthresh(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ + +static void dctcp_ce_state_0_to_1(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ + if (!ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=0. */ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; + + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +} + +static void dctcp_ce_state_1_to_0(struct sock *sk) +{ + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ + if (ca->ce_state && ca->delayed_ack_reserved) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ + tmp_rcv_nxt = tp->rcv_nxt; + + /* Generate previous ack with CE=1. */ + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + tp->rcv_nxt = ca->prior_rcv_nxt; + + tcp_send_ack(sk); + + /* Recover current rcv_nxt. */ + tp->rcv_nxt = tmp_rcv_nxt; + } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; + + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static void dctcp_update_alpha(struct sock *sk, u32 flags) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct dctcp *ca = inet_csk_ca(sk); + u32 acked_bytes = tp->snd_una - ca->prior_snd_una; + + /* If ack did not advance snd_una, count dupack as MSS size. + * If ack did update window, do not count it at all. + */ + if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE)) + acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss; + if (acked_bytes) { + ca->acked_bytes_total += acked_bytes; + ca->prior_snd_una = tp->snd_una; + + if (flags & CA_ACK_ECE) + ca->acked_bytes_ecn += acked_bytes; + } + + /* Expired RTT */ + if (!before(tp->snd_una, ca->next_seq)) { + /* For avoiding denominator == 1. */ + if (ca->acked_bytes_total == 0) + ca->acked_bytes_total = 1; + + /* alpha = (1 - g) * alpha + g * F */ + ca->dctcp_alpha = ca->dctcp_alpha - + (ca->dctcp_alpha >> dctcp_shift_g) + + (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) / + ca->acked_bytes_total; + + if (ca->dctcp_alpha > DCTCP_MAX_ALPHA) + /* Clamp dctcp_alpha to max. */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + + dctcp_reset(tp, ca); + } +} + +static void dctcp_state(struct sock *sk, u8 new_state) +{ + if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) { + struct dctcp *ca = inet_csk_ca(sk); + + /* If this extension is enabled, we clamp dctcp_alpha to + * max on packet loss; the motivation is that dctcp_alpha + * is an indicator to the extend of congestion and packet + * loss is an indicator of extreme congestion; setting + * this in practice turned out to be beneficial, and + * effectively assumes total congestion which reduces the + * window by half. + */ + ca->dctcp_alpha = DCTCP_MAX_ALPHA; + } +} + +static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) +{ + struct dctcp *ca = inet_csk_ca(sk); + + switch (ev) { + case CA_EVENT_DELAYED_ACK: + if (!ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 1; + break; + case CA_EVENT_NON_DELAYED_ACK: + if (ca->delayed_ack_reserved) + ca->delayed_ack_reserved = 0; + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) +{ + switch (ev) { + case CA_EVENT_ECN_IS_CE: + dctcp_ce_state_0_to_1(sk); + break; + case CA_EVENT_ECN_NO_CE: + dctcp_ce_state_1_to_0(sk); + break; + case CA_EVENT_DELAYED_ACK: + case CA_EVENT_NON_DELAYED_ACK: + dctcp_update_ack_reserved(sk, ev); + break; + default: + /* Don't care for the rest. */ + break; + } +} + +static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + /* Fill it also in case of VEGASINFO due to req struct limits. + * We can still correctly retrieve it later. + */ + if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcp_dctcp_info info; + + memset(&info, 0, sizeof(info)); + if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) { + info.dctcp_enabled = 1; + info.dctcp_ce_state = (u16) ca->ce_state; + info.dctcp_alpha = ca->dctcp_alpha; + info.dctcp_ab_ecn = ca->acked_bytes_ecn; + info.dctcp_ab_tot = ca->acked_bytes_total; + } + + nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info); + } +} + +static struct tcp_congestion_ops dctcp __read_mostly = { + .init = dctcp_init, + .in_ack_event = dctcp_update_alpha, + .cwnd_event = dctcp_cwnd_event, + .ssthresh = dctcp_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .set_state = dctcp_state, + .get_info = dctcp_get_info, + .flags = TCP_CONG_NEEDS_ECN, + .owner = THIS_MODULE, + .name = "dctcp", +}; + +static struct tcp_congestion_ops dctcp_reno __read_mostly = { + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .get_info = dctcp_get_info, + .owner = THIS_MODULE, + .name = "dctcp-reno", +}; + +static int __init dctcp_register(void) +{ + BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&dctcp); +} + +static void __exit dctcp_unregister(void) +{ + tcp_unregister_congestion_control(&dctcp); +} + +module_init(dctcp_register); +module_exit(dctcp_unregister); + +MODULE_AUTHOR("Daniel Borkmann "); +MODULE_AUTHOR("Florian Westphal "); +MODULE_AUTHOR("Glenn Judd "); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("DataCenter TCP (DCTCP)"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 124f9e4e4594..86a0216fcaa1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3211,6 +3211,7 @@ void tcp_send_ack(struct sock *sk) skb_mstamp_get(&buff->skb_mstamp); tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } +EXPORT_SYMBOL_GPL(tcp_send_ack); /* This routine sends a packet with an out of date sequence * number. It assumes the other end will try to ack it. -- cgit v1.2.3 From f5c9ecebaf2a2c9381973798e389cc019dd983e0 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 29 Sep 2014 10:06:19 -0600 Subject: vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type VFIO allows devices to be safely handed off to userspace by putting them behind an IOMMU configured to ensure DMA and interrupt isolation. This enables userspace KVM clients, such as kvmtool and qemu, to further map the device into a virtual machine. With IOMMUs such as the ARM SMMU, it is then possible to provide SMMU translation services to the guest operating system, which are nested with the existing translation installed by VFIO. However, enabling this feature means that the IOMMU driver must be informed that the VFIO domain is being created for the purposes of nested translation. This patch adds a new IOMMU type (VFIO_TYPE1_NESTING_IOMMU) to the VFIO type-1 driver. The new IOMMU type acts identically to the VFIO_TYPE1v2_IOMMU type, but additionally sets the DOMAIN_ATTR_NESTING attribute on its IOMMU domains. Cc: Joerg Roedel Signed-off-by: Will Deacon Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 30 +++++++++++++++++++++++++----- include/uapi/linux/vfio.h | 3 +++ 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0734fbe5b651..583ccdb2c58f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -57,7 +57,8 @@ struct vfio_iommu { struct list_head domain_list; struct mutex lock; struct rb_root dma_list; - bool v2; + bool v2; + bool nesting; }; struct vfio_domain { @@ -705,6 +706,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_free; } + if (iommu->nesting) { + int attr = 1; + + ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING, + &attr); + if (ret) + goto out_domain; + } + ret = iommu_attach_group(domain->domain, iommu_group); if (ret) goto out_domain; @@ -819,17 +829,26 @@ static void *vfio_iommu_type1_open(unsigned long arg) { struct vfio_iommu *iommu; - if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU) - return ERR_PTR(-EINVAL); - iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) return ERR_PTR(-ENOMEM); + switch (arg) { + case VFIO_TYPE1_IOMMU: + break; + case VFIO_TYPE1_NESTING_IOMMU: + iommu->nesting = true; + case VFIO_TYPE1v2_IOMMU: + iommu->v2 = true; + break; + default: + kfree(iommu); + return ERR_PTR(-EINVAL); + } + INIT_LIST_HEAD(&iommu->domain_list); iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); - iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU); return iommu; } @@ -885,6 +904,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, switch (arg) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_NESTING_IOMMU: return 1; case VFIO_DMA_CC_IOMMU: if (!iommu) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 6612974c64bf..29715d27548f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -33,6 +33,9 @@ /* Check if EEH is supported */ #define VFIO_EEH 5 +/* Two-stage IOMMU */ +#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ + /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between -- cgit v1.2.3 From 79cf79abce71eb7dbc40e2f3121048ca5405cb47 Mon Sep 17 00:00:00 2001 From: Michael Braun Date: Thu, 25 Sep 2014 16:31:08 +0200 Subject: macvlan: add source mode This patch adds a new mode of operation to macvlan, called "source". It allows one to set a list of allowed mac address, which is used to match against source mac address from received frames on underlying interface. This enables creating mac based VLAN associations, instead of standard port or tag based. The feature is useful to deploy 802.1x mac based behavior, where drivers of underlying interfaces doesn't allows that. Configuration is done through the netlink interface using e.g.: ip link add link eth0 name macvlan0 type macvlan mode source ip link add link eth0 name macvlan1 type macvlan mode source ip link set link dev macvlan0 type macvlan macaddr add 00:11:11:11:11:11 ip link set link dev macvlan0 type macvlan macaddr add 00:22:22:22:22:22 ip link set link dev macvlan0 type macvlan macaddr add 00:33:33:33:33:33 ip link set link dev macvlan1 type macvlan macaddr add 00:33:33:33:33:33 ip link set link dev macvlan1 type macvlan macaddr add 00:44:44:44:44:44 This allows clients with MAC addresses 00:11:11:11:11:11, 00:22:22:22:22:22 to be part of only VLAN associated with macvlan0 interface. Clients with MAC addresses 00:44:44:44:44:44 with only VLAN associated with macvlan1 interface. And client with MAC address 00:33:33:33:33:33 to be associated with both VLANs. Based on work of Stefan Gula v8: last version of Stefan Gula for Kernel 3.2.1 v9: rework onto linux-next 2014-03-12 by Michael Braun add MACADDR_SET command, enable to configure mac for source mode while creating interface v10: - reduce indention level - rename source_list to source_entry - use aligned 64bit ether address - use hash_64 instead of addr[5] v11: - rebase for 3.14 / linux-next 20.04.2014 v12 - rebase for linux-next 2014-09-25 Signed-off-by: Michael Braun Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 304 ++++++++++++++++++++++++++++++++++++++++++- include/linux/if_macvlan.h | 1 + include/uapi/linux/if_link.h | 12 ++ 3 files changed, 314 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 726edabff26b..e8a453f1b458 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -35,7 +35,8 @@ #include #include -#define MACVLAN_HASH_SIZE (1 << BITS_PER_BYTE) +#define MACVLAN_HASH_BITS 8 +#define MACVLAN_HASH_SIZE (1<>= 16; +#else + value <<= 16; +#endif + return hash_64(value, MACVLAN_HASH_BITS); +} + static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); @@ -73,20 +96,68 @@ static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port, const unsigned char *addr) { struct macvlan_dev *vlan; + u32 idx = macvlan_eth_hash(addr); - hlist_for_each_entry_rcu(vlan, &port->vlan_hash[addr[5]], hlist) { + hlist_for_each_entry_rcu(vlan, &port->vlan_hash[idx], hlist) { if (ether_addr_equal_64bits(vlan->dev->dev_addr, addr)) return vlan; } return NULL; } +static struct macvlan_source_entry *macvlan_hash_lookup_source( + const struct macvlan_dev *vlan, + const unsigned char *addr) +{ + struct macvlan_source_entry *entry; + u32 idx = macvlan_eth_hash(addr); + struct hlist_head *h = &vlan->port->vlan_source_hash[idx]; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (ether_addr_equal_64bits(entry->addr, addr) && + entry->vlan == vlan) + return entry; + } + return NULL; +} + +static int macvlan_hash_add_source(struct macvlan_dev *vlan, + const unsigned char *addr) +{ + struct macvlan_port *port = vlan->port; + struct macvlan_source_entry *entry; + struct hlist_head *h; + + entry = macvlan_hash_lookup_source(vlan, addr); + if (entry) + return 0; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + ether_addr_copy(entry->addr, addr); + entry->vlan = vlan; + h = &port->vlan_source_hash[macvlan_eth_hash(addr)]; + hlist_add_head_rcu(&entry->hlist, h); + vlan->macaddr_count++; + + return 0; +} + static void macvlan_hash_add(struct macvlan_dev *vlan) { struct macvlan_port *port = vlan->port; const unsigned char *addr = vlan->dev->dev_addr; + u32 idx = macvlan_eth_hash(addr); - hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[addr[5]]); + hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]); +} + +static void macvlan_hash_del_source(struct macvlan_source_entry *entry) +{ + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); } static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync) @@ -267,6 +338,65 @@ err: atomic_long_inc(&skb->dev->rx_dropped); } +static void macvlan_flush_sources(struct macvlan_port *port, + struct macvlan_dev *vlan) +{ + int i; + + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { + struct hlist_node *h, *n; + + hlist_for_each_safe(h, n, &port->vlan_source_hash[i]) { + struct macvlan_source_entry *entry; + + entry = hlist_entry(h, struct macvlan_source_entry, + hlist); + if (entry->vlan == vlan) + macvlan_hash_del_source(entry); + } + } + vlan->macaddr_count = 0; +} + +static void macvlan_forward_source_one(struct sk_buff *skb, + struct macvlan_dev *vlan) +{ + struct sk_buff *nskb; + struct net_device *dev; + int len; + int ret; + + dev = vlan->dev; + if (unlikely(!(dev->flags & IFF_UP))) + return; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return; + + len = nskb->len + ETH_HLEN; + nskb->dev = dev; + nskb->pkt_type = PACKET_HOST; + + ret = netif_rx(nskb); + macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, 0); +} + +static void macvlan_forward_source(struct sk_buff *skb, + struct macvlan_port *port, + const unsigned char *addr) +{ + struct macvlan_source_entry *entry; + u32 idx = macvlan_eth_hash(addr); + struct hlist_head *h = &port->vlan_source_hash[idx]; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (ether_addr_equal_64bits(entry->addr, addr)) + if (entry->vlan->dev->flags & IFF_UP) + macvlan_forward_source_one(skb, entry->vlan); + } +} + /* called under rcu_read_lock() from netif_receive_skb */ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) { @@ -285,6 +415,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) if (!skb) return RX_HANDLER_CONSUMED; eth = eth_hdr(skb); + macvlan_forward_source(skb, port, eth->h_source); src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -301,6 +432,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } + macvlan_forward_source(skb, port, eth->h_source); if (port->passthru) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -667,6 +799,7 @@ static void macvlan_uninit(struct net_device *dev) free_percpu(vlan->pcpu_stats); + macvlan_flush_sources(port, vlan); port->count -= 1; if (!port->count) macvlan_port_destroy(port->dev); @@ -925,6 +1058,8 @@ static int macvlan_port_create(struct net_device *dev) INIT_LIST_HEAD(&port->vlans); for (i = 0; i < MACVLAN_HASH_SIZE; i++) INIT_HLIST_HEAD(&port->vlan_hash[i]); + for (i = 0; i < MACVLAN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&port->vlan_source_hash[i]); skb_queue_head_init(&port->bc_queue); INIT_WORK(&port->bc_work, macvlan_process_broadcast); @@ -966,11 +1101,102 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[]) case MACVLAN_MODE_VEPA: case MACVLAN_MODE_BRIDGE: case MACVLAN_MODE_PASSTHRU: + case MACVLAN_MODE_SOURCE: + break; + default: + return -EINVAL; + } + } + + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) { + case MACVLAN_MACADDR_ADD: + case MACVLAN_MACADDR_DEL: + case MACVLAN_MACADDR_FLUSH: + case MACVLAN_MACADDR_SET: break; default: return -EINVAL; } } + + if (data && data[IFLA_MACVLAN_MACADDR]) { + if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN) + return -EINVAL; + + if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR]))) + return -EADDRNOTAVAIL; + } + + if (data && data[IFLA_MACVLAN_MACADDR_COUNT]) + return -EINVAL; + + return 0; +} + +/** + * reconfigure list of remote source mac address + * (only for macvlan devices in source mode) + * Note regarding alignment: all netlink data is aligned to 4 Byte, which + * suffices for both ether_addr_copy and ether_addr_equal_64bits usage. + */ +static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode, + struct nlattr *data[]) +{ + char *addr = NULL; + int ret, rem, len; + struct nlattr *nla, *head; + struct macvlan_source_entry *entry; + + if (data[IFLA_MACVLAN_MACADDR]) + addr = nla_data(data[IFLA_MACVLAN_MACADDR]); + + if (mode == MACVLAN_MACADDR_ADD) { + if (!addr) + return -EINVAL; + + return macvlan_hash_add_source(vlan, addr); + + } else if (mode == MACVLAN_MACADDR_DEL) { + if (!addr) + return -EINVAL; + + entry = macvlan_hash_lookup_source(vlan, addr); + if (entry) { + macvlan_hash_del_source(entry); + vlan->macaddr_count--; + } + } else if (mode == MACVLAN_MACADDR_FLUSH) { + macvlan_flush_sources(vlan->port, vlan); + } else if (mode == MACVLAN_MACADDR_SET) { + macvlan_flush_sources(vlan->port, vlan); + + if (addr) { + ret = macvlan_hash_add_source(vlan, addr); + if (ret) + return ret; + } + + if (!data || !data[IFLA_MACVLAN_MACADDR_DATA]) + return 0; + + head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]); + len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]); + + nla_for_each_attr(nla, head, len, rem) { + if (nla_type(nla) != IFLA_MACVLAN_MACADDR || + nla_len(nla) != ETH_ALEN) + continue; + + addr = nla_data(nla); + ret = macvlan_hash_add_source(vlan, addr); + if (ret) + return ret; + } + } else { + return -EINVAL; + } + return 0; } @@ -981,6 +1207,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct macvlan_port *port; struct net_device *lowerdev; int err; + int macmode; if (!tb[IFLA_LINK]) return -EINVAL; @@ -1034,6 +1261,15 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_inherit(dev, lowerdev); } + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (vlan->mode != MACVLAN_MODE_SOURCE) + return -EINVAL; + macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); + err = macvlan_changelink_sources(vlan, macmode, data); + if (err) + return err; + } + port->count += 1; err = register_netdevice(dev); if (err < 0) @@ -1070,6 +1306,8 @@ void macvlan_dellink(struct net_device *dev, struct list_head *head) { struct macvlan_dev *vlan = netdev_priv(dev); + if (vlan->mode == MACVLAN_MODE_SOURCE) + macvlan_flush_sources(vlan->port, vlan); list_del_rcu(&vlan->list); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(vlan->lowerdev, dev); @@ -1082,6 +1320,8 @@ static int macvlan_changelink(struct net_device *dev, struct macvlan_dev *vlan = netdev_priv(dev); enum macvlan_mode mode; bool set_mode = false; + enum macvlan_macaddr_mode macmode; + int ret; /* Validate mode, but don't set yet: setting flags may fail. */ if (data && data[IFLA_MACVLAN_MODE]) { @@ -1091,6 +1331,9 @@ static int macvlan_changelink(struct net_device *dev, if ((mode == MACVLAN_MODE_PASSTHRU) != (vlan->mode == MACVLAN_MODE_PASSTHRU)) return -EINVAL; + if (vlan->mode == MACVLAN_MODE_SOURCE && + vlan->mode != mode) + macvlan_flush_sources(vlan->port, vlan); } if (data && data[IFLA_MACVLAN_FLAGS]) { @@ -1110,26 +1353,77 @@ static int macvlan_changelink(struct net_device *dev, } if (set_mode) vlan->mode = mode; + if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { + if (vlan->mode != MACVLAN_MODE_SOURCE) + return -EINVAL; + macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); + ret = macvlan_changelink_sources(vlan, macmode, data); + if (ret) + return ret; + } return 0; } +static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan) +{ + if (vlan->macaddr_count == 0) + return 0; + return nla_total_size(0) /* IFLA_MACVLAN_MACADDR_DATA */ + + vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN); +} + static size_t macvlan_get_size(const struct net_device *dev) { + struct macvlan_dev *vlan = netdev_priv(dev); + return (0 + nla_total_size(4) /* IFLA_MACVLAN_MODE */ + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */ + + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */ + + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */ ); } +static int macvlan_fill_info_macaddr(struct sk_buff *skb, + const struct macvlan_dev *vlan, + const int i) +{ + struct hlist_head *h = &vlan->port->vlan_source_hash[i]; + struct macvlan_source_entry *entry; + + hlist_for_each_entry_rcu(entry, h, hlist) { + if (entry->vlan != vlan) + continue; + if (nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, entry->addr)) + return 1; + } + return 0; +} + static int macvlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); + int i; + struct nlattr *nest; if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags)) goto nla_put_failure; + if (nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count)) + goto nla_put_failure; + if (vlan->macaddr_count > 0) { + nest = nla_nest_start(skb, IFLA_MACVLAN_MACADDR_DATA); + if (nest == NULL) + goto nla_put_failure; + + for (i = 0; i < MACVLAN_HASH_SIZE; i++) { + if (macvlan_fill_info_macaddr(skb, vlan, i)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + } return 0; nla_put_failure: @@ -1139,6 +1433,10 @@ nla_put_failure: static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = { [IFLA_MACVLAN_MODE] = { .type = NLA_U32 }, [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 }, + [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 }, + [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, + [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED }, + [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 }, }; int macvlan_link_register(struct rtnl_link_ops *ops) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 6b2c7cf352a5..6f6929ea8a0c 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -60,6 +60,7 @@ struct macvlan_dev { #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif + unsigned int macaddr_count; }; static inline void macvlan_count_rx(const struct macvlan_dev *vlan, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index c80f95f6ee78..0bdb77e16875 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -303,6 +303,10 @@ enum { IFLA_MACVLAN_UNSPEC, IFLA_MACVLAN_MODE, IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, __IFLA_MACVLAN_MAX, }; @@ -313,6 +317,14 @@ enum macvlan_mode { MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, }; #define MACVLAN_FLAG_NOPROMISC 1 -- cgit v1.2.3 From 51b0a5d8c21a91801bbef9bcc8639dc0b206c6cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 26 Sep 2014 14:35:14 +0200 Subject: netfilter: nft_reject: introduce icmp code abstraction for inet and bridge This patch introduces the NFT_REJECT_ICMPX_UNREACH type which provides an abstraction to the ICMP and ICMPv6 codes that you can use from the inet and bridge tables, they are: * NFT_REJECT_ICMPX_NO_ROUTE: no route to host - network unreachable * NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable * NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable * NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratevely prohibited You can still use the specific codes when restricting the rule to match the corresponding layer 3 protocol. I decided to not overload the existing NFT_REJECT_ICMP_UNREACH to have different semantics depending on the table family and to allow the user to specify ICMP family specific codes if they restrict it to the corresponding family. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_reject.h | 1 + include/net/netfilter/nft_reject.h | 9 +-- include/uapi/linux/netfilter/nf_tables.h | 21 +++++++ net/bridge/netfilter/nft_reject_bridge.c | 95 ++++++++++++++++++++++++++++++-- net/ipv4/netfilter/nft_reject_ipv4.c | 1 - net/netfilter/nft_reject.c | 37 +++++++++++++ net/netfilter/nft_reject_inet.c | 94 +++++++++++++++++++++++++++++-- 7 files changed, 241 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index f713b5a31d62..8ce06385a552 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -5,6 +5,7 @@ #include #include #include +#include static inline void nf_send_unreach(struct sk_buff *skb_in, int code) { diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 36b0da2d55bb..60fa1530006b 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -14,12 +14,7 @@ int nft_reject_init(const struct nft_ctx *ctx, int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); -void nft_reject_ipv4_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt); - -void nft_reject_ipv6_eval(const struct nft_expr *expr, - struct nft_data data[NFT_REG_MAX + 1], - const struct nft_pktinfo *pkt); +int nft_reject_icmp_code(u8 code); +int nft_reject_icmpv6_code(u8 code); #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b72ccfeaf865..c26df6787fb0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,12 +749,33 @@ enum nft_queue_attributes { * * @NFT_REJECT_ICMP_UNREACH: reject using ICMP unreachable * @NFT_REJECT_TCP_RST: reject using TCP RST + * @NFT_REJECT_ICMPX_UNREACH: abstracted ICMP unreachable for bridge and inet */ enum nft_reject_types { NFT_REJECT_ICMP_UNREACH, NFT_REJECT_TCP_RST, + NFT_REJECT_ICMPX_UNREACH, }; +/** + * enum nft_reject_code - Generic reject codes for IPv4/IPv6 + * + * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable + * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable + * @NFT_REJECT_ICMPX_HOST_UNREACH: host unreachable + * @NFT_REJECT_ICMPX_ADMIN_PROHIBITED: administratively prohibited + * + * These codes are mapped to real ICMP and ICMPv6 codes. + */ +enum nft_reject_inet_code { + NFT_REJECT_ICMPX_NO_ROUTE = 0, + NFT_REJECT_ICMPX_PORT_UNREACH, + NFT_REJECT_ICMPX_HOST_UNREACH, + NFT_REJECT_ICMPX_ADMIN_PROHIBITED, + __NFT_REJECT_ICMPX_MAX +}; +#define NFT_REJECT_ICMPX_MAX (__NFT_REJECT_ICMPX_MAX + 1) + /** * enum nft_reject_attributes - nf_tables reject expression netlink attributes * diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index ee3ffe93e14e..a76479535df2 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -14,21 +14,106 @@ #include #include #include +#include +#include static void nft_reject_bridge_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + switch (eth_hdr(pkt->skb)->h_proto) { case htons(ETH_P_IP): - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case htons(ETH_P_IPV6): - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; default: /* No explicit way to reject this protocol, drop it. */ - data[NFT_REG_VERDICT].verdict = NF_DROP; break; } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_bridge_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; + } + return 0; +} + +static int nft_reject_bridge_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_bridge_type; @@ -36,8 +121,8 @@ static const struct nft_expr_ops nft_reject_bridge_ops = { .type = &nft_reject_bridge_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_bridge_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_bridge_init, + .dump = nft_reject_bridge_dump, }; static struct nft_expr_type nft_reject_bridge_type __read_mostly = { diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index e79718a382f2..ed33299c56d1 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index f3448c296446..ec8a456092a7 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = { .type = NLA_U32 }, @@ -70,5 +72,40 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_reject_dump); +static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, +}; + +int nft_reject_icmp_code(u8 code) +{ + if (code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + return icmp_code_v4[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmp_code); + + +static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX] = { + [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, + [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, + [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, + [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, +}; + +int nft_reject_icmpv6_code(u8 code) +{ + if (code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + return icmp_code_v6[code]; +} + +EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index b718a52a4654..7b5f9d58680a 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -14,17 +14,103 @@ #include #include #include +#include +#include static void nft_reject_inet_eval(const struct nft_expr *expr, struct nft_data data[NFT_REG_MAX + 1], const struct nft_pktinfo *pkt) { + struct nft_reject *priv = nft_expr_priv(expr); + struct net *net = dev_net((pkt->in != NULL) ? pkt->in : pkt->out); + switch (pkt->ops->pf) { case NFPROTO_IPV4: - return nft_reject_ipv4_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach(pkt->skb, + nft_reject_icmp_code(priv->icmp_code)); + break; + } + break; case NFPROTO_IPV6: - return nft_reject_ipv6_eval(expr, data, pkt); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach6(net, pkt->skb, priv->icmp_code, + pkt->ops->hooknum); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset6(net, pkt->skb, pkt->ops->hooknum); + break; + case NFT_REJECT_ICMPX_UNREACH: + nf_send_unreach6(net, pkt->skb, + nft_reject_icmpv6_code(priv->icmp_code), + pkt->ops->hooknum); + break; + } + break; + } + data[NFT_REG_VERDICT].verdict = NF_DROP; +} + +static int nft_reject_inet_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_reject *priv = nft_expr_priv(expr); + int icmp_code; + + if (tb[NFTA_REJECT_TYPE] == NULL) + return -EINVAL; + + priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (tb[NFTA_REJECT_ICMP_CODE] == NULL) + return -EINVAL; + + icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); + if (priv->type == NFT_REJECT_ICMPX_UNREACH && + icmp_code > NFT_REJECT_ICMPX_MAX) + return -EINVAL; + + priv->icmp_code = icmp_code; + break; + case NFT_REJECT_TCP_RST: + break; + default: + return -EINVAL; } + return 0; +} + +static int nft_reject_inet_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_reject *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) + goto nla_put_failure; + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + case NFT_REJECT_ICMPX_UNREACH: + if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) + goto nla_put_failure; + break; + } + + return 0; + +nla_put_failure: + return -1; } static struct nft_expr_type nft_reject_inet_type; @@ -32,8 +118,8 @@ static const struct nft_expr_ops nft_reject_inet_ops = { .type = &nft_reject_inet_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), .eval = nft_reject_inet_eval, - .init = nft_reject_init, - .dump = nft_reject_dump, + .init = nft_reject_inet_init, + .dump = nft_reject_inet_dump, }; static struct nft_expr_type nft_reject_inet_type __read_mostly = { -- cgit v1.2.3 From dba4b74d2da8798626e2b702ad3f452671e335f7 Mon Sep 17 00:00:00 2001 From: Vladimir Kondratiev Date: Wed, 1 Oct 2014 15:05:25 +0300 Subject: wil6210: atomic I/O for the card memory Introduce netdev IOCTLs, to be used by the debug tools. Allows to read/write single dword value or memory block, aligned to dword Different address modes supported: - BAR offset - Firmware "linker" address - target's AHB bus Signed-off-by: Vladimir Kondratiev Signed-off-by: John W. Linville --- MAINTAINERS | 1 + drivers/net/wireless/ath/wil6210/Makefile | 1 + drivers/net/wireless/ath/wil6210/ioctl.c | 173 +++++++++++++++++++++++++++++ drivers/net/wireless/ath/wil6210/netdev.c | 12 ++ drivers/net/wireless/ath/wil6210/wil6210.h | 2 + include/uapi/linux/wil6210_uapi.h | 87 +++++++++++++++ 6 files changed, 276 insertions(+) create mode 100644 drivers/net/wireless/ath/wil6210/ioctl.c create mode 100644 include/uapi/linux/wil6210_uapi.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 482888d80431..66de6b2923d4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1611,6 +1611,7 @@ L: wil6210@qca.qualcomm.com S: Supported W: http://wireless.kernel.org/en/users/Drivers/wil6210 F: drivers/net/wireless/ath/wil6210/ +F: include/uapi/linux/wil6210_uapi.h CARL9170 LINUX COMMUNITY WIRELESS DRIVER M: Christian Lamparter diff --git a/drivers/net/wireless/ath/wil6210/Makefile b/drivers/net/wireless/ath/wil6210/Makefile index 05f29a6b1ba8..8ad4b5f97e04 100644 --- a/drivers/net/wireless/ath/wil6210/Makefile +++ b/drivers/net/wireless/ath/wil6210/Makefile @@ -10,6 +10,7 @@ wil6210-y += interrupt.o wil6210-y += txrx.o wil6210-y += debug.o wil6210-y += rx_reorder.o +wil6210-y += ioctl.o wil6210-y += fw.o wil6210-$(CONFIG_WIL6210_TRACING) += trace.o wil6210-y += wil_platform.o diff --git a/drivers/net/wireless/ath/wil6210/ioctl.c b/drivers/net/wireless/ath/wil6210/ioctl.c new file mode 100644 index 000000000000..e9c0673819c6 --- /dev/null +++ b/drivers/net/wireless/ath/wil6210/ioctl.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2014 Qualcomm Atheros, Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include "wil6210.h" +#include + +#define wil_hex_dump_ioctl(prefix_str, buf, len) \ + print_hex_dump_debug("DBG[IOC ]" prefix_str, \ + DUMP_PREFIX_OFFSET, 16, 1, buf, len, true) +#define wil_dbg_ioctl(wil, fmt, arg...) wil_dbg(wil, "DBG[IOC ]" fmt, ##arg) + +static void __iomem *wil_ioc_addr(struct wil6210_priv *wil, uint32_t addr, + uint32_t size, enum wil_memio_op op) +{ + void __iomem *a; + u32 off; + + switch (op & wil_mmio_addr_mask) { + case wil_mmio_addr_linker: + a = wmi_buffer(wil, cpu_to_le32(addr)); + break; + case wil_mmio_addr_ahb: + a = wmi_addr(wil, addr); + break; + case wil_mmio_addr_bar: + a = wmi_addr(wil, addr + WIL6210_FW_HOST_OFF); + break; + default: + wil_err(wil, "Unsupported address mode, op = 0x%08x\n", op); + return NULL; + } + + off = a - wil->csr; + if (size >= WIL6210_MEM_SIZE - off) { + wil_err(wil, "Requested block does not fit into memory: " + "off = 0x%08x size = 0x%08x\n", off, size); + return NULL; + } + + return a; +} + +static int wil_ioc_memio_dword(struct wil6210_priv *wil, void __user *data) +{ + struct wil_memio io; + void __iomem *a; + bool need_copy = false; + + if (copy_from_user(&io, data, sizeof(io))) + return -EFAULT; + + wil_dbg_ioctl(wil, "IO: addr = 0x%08x val = 0x%08x op = 0x%08x\n", + io.addr, io.val, io.op); + + a = wil_ioc_addr(wil, io.addr, sizeof(u32), io.op); + if (!a) { + wil_err(wil, "invalid address 0x%08x, op = 0x%08x\n", io.addr, + io.op); + return -EINVAL; + } + /* operation */ + switch (io.op & wil_mmio_op_mask) { + case wil_mmio_read: + io.val = ioread32(a); + need_copy = true; + break; + case wil_mmio_write: + iowrite32(io.val, a); + wmb(); /* make sure write propagated to HW */ + break; + default: + wil_err(wil, "Unsupported operation, op = 0x%08x\n", io.op); + return -EINVAL; + } + + if (need_copy) { + wil_dbg_ioctl(wil, "IO done: addr = 0x%08x" + " val = 0x%08x op = 0x%08x\n", + io.addr, io.val, io.op); + if (copy_to_user(data, &io, sizeof(io))) + return -EFAULT; + } + + return 0; +} + +static int wil_ioc_memio_block(struct wil6210_priv *wil, void __user *data) +{ + struct wil_memio_block io; + void *block; + void __iomem *a; + int rc = 0; + + if (copy_from_user(&io, data, sizeof(io))) + return -EFAULT; + + wil_dbg_ioctl(wil, "IO: addr = 0x%08x size = 0x%08x op = 0x%08x\n", + io.addr, io.size, io.op); + + /* size */ + if (io.size % 4) { + wil_err(wil, "size is not multiple of 4: 0x%08x\n", io.size); + return -EINVAL; + } + + a = wil_ioc_addr(wil, io.addr, io.size, io.op); + if (!a) { + wil_err(wil, "invalid address 0x%08x, op = 0x%08x\n", io.addr, + io.op); + return -EINVAL; + } + + block = kmalloc(io.size, GFP_USER); + if (!block) + return -ENOMEM; + + /* operation */ + switch (io.op & wil_mmio_op_mask) { + case wil_mmio_read: + wil_memcpy_fromio_32(block, a, io.size); + wil_hex_dump_ioctl("Read ", block, io.size); + if (copy_to_user(io.block, block, io.size)) { + rc = -EFAULT; + goto out_free; + } + break; + case wil_mmio_write: + if (copy_from_user(block, io.block, io.size)) { + rc = -EFAULT; + goto out_free; + } + wil_memcpy_toio_32(a, block, io.size); + wmb(); /* make sure write propagated to HW */ + wil_hex_dump_ioctl("Write ", block, io.size); + break; + default: + wil_err(wil, "Unsupported operation, op = 0x%08x\n", io.op); + rc = -EINVAL; + break; + } + +out_free: + kfree(block); + return rc; +} + +int wil_ioctl(struct wil6210_priv *wil, void __user *data, int cmd) +{ + switch (cmd) { + case WIL_IOCTL_MEMIO: + return wil_ioc_memio_dword(wil, data); + case WIL_IOCTL_MEMIO_BLOCK: + return wil_ioc_memio_block(wil, data); + default: + wil_dbg_ioctl(wil, "Unsupported IOCTL 0x%04x\n", cmd); + return -ENOIOCTLCMD; + } +} diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c index c3f0ddfaa5da..239965106c05 100644 --- a/drivers/net/wireless/ath/wil6210/netdev.c +++ b/drivers/net/wireless/ath/wil6210/netdev.c @@ -52,6 +52,17 @@ static int wil_change_mtu(struct net_device *ndev, int new_mtu) return 0; } +static int wil_do_ioctl(struct net_device *ndev, struct ifreq *ifr, int cmd) +{ + struct wil6210_priv *wil = ndev_to_wil(ndev); + + int ret = wil_ioctl(wil, ifr->ifr_data, cmd); + + wil_dbg_misc(wil, "ioctl(0x%04x) -> %d\n", cmd, ret); + + return ret; +} + static const struct net_device_ops wil_netdev_ops = { .ndo_open = wil_open, .ndo_stop = wil_stop, @@ -59,6 +70,7 @@ static const struct net_device_ops wil_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = wil_change_mtu, + .ndo_do_ioctl = wil_do_ioctl, }; static int wil6210_netdev_poll_rx(struct napi_struct *napi, int budget) diff --git a/drivers/net/wireless/ath/wil6210/wil6210.h b/drivers/net/wireless/ath/wil6210/wil6210.h index 2991609885f7..61cceca155a2 100644 --- a/drivers/net/wireless/ath/wil6210/wil6210.h +++ b/drivers/net/wireless/ath/wil6210/wil6210.h @@ -595,5 +595,7 @@ void wil6210_unmask_irq_rx(struct wil6210_priv *wil); int wil_iftype_nl2wmi(enum nl80211_iftype type); +int wil_ioctl(struct wil6210_priv *wil, void __user *data, int cmd); int wil_request_firmware(struct wil6210_priv *wil, const char *name); + #endif /* __WIL6210_H__ */ diff --git a/include/uapi/linux/wil6210_uapi.h b/include/uapi/linux/wil6210_uapi.h new file mode 100644 index 000000000000..6a3cddd156c4 --- /dev/null +++ b/include/uapi/linux/wil6210_uapi.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2014 Qualcomm Atheros, Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __WIL6210_UAPI_H__ +#define __WIL6210_UAPI_H__ + +#if !defined(__KERNEL__) +#define __user +#endif + +#include + +/* Numbers SIOCDEVPRIVATE and SIOCDEVPRIVATE + 1 + * are used by Android devices to implement PNO (preferred network offload). + * Albeit it is temporary solution, use different numbers to avoid conflicts + */ + +/** + * Perform 32-bit I/O operation to the card memory + * + * User code should arrange data in memory like this: + * + * struct wil_memio io; + * struct ifreq ifr = { + * .ifr_data = &io, + * }; + */ +#define WIL_IOCTL_MEMIO (SIOCDEVPRIVATE + 2) + +/** + * Perform block I/O operation to the card memory + * + * User code should arrange data in memory like this: + * + * void *buf; + * struct wil_memio_block io = { + * .block = buf, + * }; + * struct ifreq ifr = { + * .ifr_data = &io, + * }; + */ +#define WIL_IOCTL_MEMIO_BLOCK (SIOCDEVPRIVATE + 3) + +/** + * operation to perform + * + * @wil_mmio_op_mask - bits defining operation, + * @wil_mmio_addr_mask - bits defining addressing mode + */ +enum wil_memio_op { + wil_mmio_read = 0, + wil_mmio_write = 1, + wil_mmio_op_mask = 0xff, + wil_mmio_addr_linker = 0 << 8, + wil_mmio_addr_ahb = 1 << 8, + wil_mmio_addr_bar = 2 << 8, + wil_mmio_addr_mask = 0xff00, +}; + +struct wil_memio { + uint32_t op; /* enum wil_memio_op */ + uint32_t addr; /* should be 32-bit aligned */ + uint32_t val; +}; + +struct wil_memio_block { + uint32_t op; /* enum wil_memio_op */ + uint32_t addr; /* should be 32-bit aligned */ + uint32_t size; /* should be multiple of 4 */ + void __user *block; /* block address */ +}; + +#endif /* __WIL6210_UAPI_H__ */ -- cgit v1.2.3 From 7c9e7a6fe11c8dc5b3b9d0e889dde73347247584 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:05 -0700 Subject: target: Add a user-passthrough backstore Add a LIO storage engine that presents commands to userspace for execution. This would allow more complex backstores to be implemented out-of-kernel, and also make experimentation a-la FUSE (but at the SCSI level -- "SUSE"?) possible. It uses a mmap()able UIO device per LUN to share a command ring and data area. The commands are raw SCSI CDBs and iovs for in/out data. The command ring is also reused for returning scsi command status and optional sense data. This implementation is based on Shaohua Li's earlier version but heavily modified. Differences include: * Shared memory allocated by kernel, not locked-down user pages * Single ring for command request and response * Offsets instead of embedded pointers * Generic SCSI CDB passthrough instead of per-cmd specialization in ring format. * Uses UIO device instead of anon_file passed in mailbox. * Optional in-kernel handling of some commands. The main reason for these differences is to permit greater resiliency if the user process dies or hangs. Things not yet implemented (on purpose): * Zero copy. The data area is flexible enough to allow page flipping or backend-allocated pages to be used by fabrics, but it's not clear these are performance wins. Can come later. * Out-of-order command completion by userspace. Possible to add by just allowing userspace to change cmd_id in rsp cmd entries, but currently not supported. * No locks between kernel cmd submission and completion routines. Sounds like it's possible, but this can come later. * Sparse allocation of mmaped area. Current code vmallocs the whole thing. If the mapped area was larger and not fully mapped then the driver would have more freedom to change cmd and data area sizes based on demand. Current code open issues: * The use of idrs may be overkill -- we maybe can replace them with a simple counter to generate cmd_ids, and a hash table to get a cmd_id's associated pointer. * Use of a free-running counter for cmd ring instead of explicit modulo math. This would require power-of-2 cmd ring size. (Add kconfig depends NET - Randy) Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/Kconfig | 7 + drivers/target/Makefile | 1 + drivers/target/target_core_transport.c | 4 + drivers/target/target_core_user.c | 1163 ++++++++++++++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/target_core_user.h | 142 ++++ 6 files changed, 1318 insertions(+) create mode 100644 drivers/target/target_core_user.c create mode 100644 include/uapi/linux/target_core_user.h (limited to 'include/uapi') diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index dc2d84ac5a0e..81d44c477a5b 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -31,6 +31,13 @@ config TCM_PSCSI Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device +config TCM_USER + tristate "TCM/USER Subsystem Plugin for Linux" + depends on UIO && NET + help + Say Y here to enable the TCM/USER subsystem plugin for a userspace + process to handle requests + source "drivers/target/loopback/Kconfig" source "drivers/target/tcm_fc/Kconfig" source "drivers/target/iscsi/Kconfig" diff --git a/drivers/target/Makefile b/drivers/target/Makefile index 85b012d2f89b..bbb4a7d638ef 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TARGET_CORE) += target_core_mod.o obj-$(CONFIG_TCM_IBLOCK) += target_core_iblock.o obj-$(CONFIG_TCM_FILEIO) += target_core_file.o obj-$(CONFIG_TCM_PSCSI) += target_core_pscsi.o +obj-$(CONFIG_TCM_USER) += target_core_user.o # Fabric modules obj-$(CONFIG_LOOPBACK_TARGET) += loopback/ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 9700ea125268..9ea0d5f03f7a 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -232,6 +232,10 @@ void transport_subsystem_check_init(void) if (ret != 0) pr_err("Unable to load target_core_pscsi\n"); + ret = request_module("target_core_user"); + if (ret != 0) + pr_err("Unable to load target_core_user\n"); + sub_api_initialized = 1; } diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c new file mode 100644 index 000000000000..6608ecf94570 --- /dev/null +++ b/drivers/target/target_core_user.c @@ -0,0 +1,1163 @@ +/* + * Copyright (C) 2013 Shaohua Li + * Copyright (C) 2014 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Define a shared-memory interface for LIO to pass SCSI commands and + * data to userspace for processing. This is to allow backends that + * are too complex for in-kernel support to be possible. + * + * It uses the UIO framework to do a lot of the device-creation and + * introspection work for us. + * + * See the .h file for how the ring is laid out. Note that while the + * command ring is defined, the particulars of the data area are + * not. Offset values in the command entry point to other locations + * internal to the mmap()ed area. There is separate space outside the + * command ring for data buffers. This leaves maximum flexibility for + * moving buffer allocations, or even page flipping or other + * allocation techniques, without altering the command ring layout. + * + * SECURITY: + * The user process must be assumed to be malicious. There's no way to + * prevent it breaking the command ring protocol if it wants, but in + * order to prevent other issues we must only ever read *data* from + * the shared memory area, not offsets or sizes. This applies to + * command ring entries as well as the mailbox. Extra code needed for + * this may have a 'UAM' comment. + */ + + +#define TCMU_TIME_OUT (30 * MSEC_PER_SEC) + +#define CMDR_SIZE (16 * 4096) +#define DATA_SIZE (257 * 4096) + +#define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE) + +static struct device *tcmu_root_device; + +struct tcmu_hba { + u32 host_id; +}; + +/* User wants all cmds or just some */ +enum passthru_level { + TCMU_PASS_ALL = 0, + TCMU_PASS_IO, + TCMU_PASS_INVALID, +}; + +#define TCMU_CONFIG_LEN 256 + +struct tcmu_dev { + struct se_device se_dev; + + char *name; + struct se_hba *hba; + +#define TCMU_DEV_BIT_OPEN 0 +#define TCMU_DEV_BIT_BROKEN 1 + unsigned long flags; + enum passthru_level pass_level; + + struct uio_info uio_info; + + struct tcmu_mailbox *mb_addr; + size_t dev_size; + u32 cmdr_size; + u32 cmdr_last_cleaned; + /* Offset of data ring from start of mb */ + size_t data_off; + size_t data_size; + /* Ring head + tail values. */ + /* Must add data_off and mb_addr to get the address */ + size_t data_head; + size_t data_tail; + + wait_queue_head_t wait_cmdr; + /* TODO should this be a mutex? */ + spinlock_t cmdr_lock; + + struct idr commands; + spinlock_t commands_lock; + + struct timer_list timeout; + + char dev_config[TCMU_CONFIG_LEN]; +}; + +#define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev) + +#define CMDR_OFF sizeof(struct tcmu_mailbox) + +struct tcmu_cmd { + struct se_cmd *se_cmd; + struct tcmu_dev *tcmu_dev; + + uint16_t cmd_id; + + /* Can't use se_cmd->data_length when cleaning up expired cmds, because if + cmd has been completed then accessing se_cmd is off limits */ + size_t data_length; + + unsigned long deadline; + +#define TCMU_CMD_BIT_EXPIRED 0 + unsigned long flags; +}; + +static struct kmem_cache *tcmu_cmd_cache; + +/* multicast group */ +enum tcmu_multicast_groups { + TCMU_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group tcmu_mcgrps[] = { + [TCMU_MCGRP_CONFIG] = { .name = "config", }, +}; + +/* Our generic netlink family */ +static struct genl_family tcmu_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "TCM-USER", + .version = 1, + .maxattr = TCMU_ATTR_MAX, + .mcgrps = tcmu_mcgrps, + .n_mcgrps = ARRAY_SIZE(tcmu_mcgrps), +}; + +static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int cmd_id; + + tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_KERNEL); + if (!tcmu_cmd) + return NULL; + + tcmu_cmd->se_cmd = se_cmd; + tcmu_cmd->tcmu_dev = udev; + tcmu_cmd->data_length = se_cmd->data_length; + + tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT); + + idr_preload(GFP_KERNEL); + spin_lock_irq(&udev->commands_lock); + cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0, + USHRT_MAX, GFP_NOWAIT); + spin_unlock_irq(&udev->commands_lock); + idr_preload_end(); + + if (cmd_id < 0) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + tcmu_cmd->cmd_id = cmd_id; + + return tcmu_cmd; +} + +static inline void tcmu_flush_dcache_range(void *vaddr, size_t size) +{ + unsigned long offset = (unsigned long) vaddr & ~PAGE_MASK; + + size = round_up(size+offset, PAGE_SIZE); + vaddr -= offset; + + while (size) { + flush_dcache_page(virt_to_page(vaddr)); + size -= PAGE_SIZE; + } +} + +/* + * Some ring helper functions. We don't assume size is a power of 2 so + * we can't use circ_buf.h. + */ +static inline size_t spc_used(size_t head, size_t tail, size_t size) +{ + int diff = head - tail; + + if (diff >= 0) + return diff; + else + return size + diff; +} + +static inline size_t spc_free(size_t head, size_t tail, size_t size) +{ + /* Keep 1 byte unused or we can't tell full from empty */ + return (size - spc_used(head, tail, size) - 1); +} + +static inline size_t head_to_end(size_t head, size_t size) +{ + return size - head; +} + +#define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size) + +/* + * We can't queue a command until we have space available on the cmd ring *and* space + * space avail on the data ring. + * + * Called with ring lock held. + */ +static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_needed, size_t data_needed) +{ + struct tcmu_mailbox *mb = udev->mb_addr; + size_t space; + u32 cmd_head; + + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + + space = spc_free(cmd_head, udev->cmdr_last_cleaned, udev->cmdr_size); + if (space < cmd_needed) { + pr_debug("no cmd space: %u %u %u\n", cmd_head, + udev->cmdr_last_cleaned, udev->cmdr_size); + return false; + } + + space = spc_free(udev->data_head, udev->data_tail, udev->data_size); + if (space < data_needed) { + pr_debug("no data space: %zu %zu %zu\n", udev->data_head, + udev->data_tail, udev->data_size); + return false; + } + + return true; +} + +static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t base_command_size, command_size; + size_t cmdr_space_needed; + struct tcmu_mailbox *mb; + size_t pad_size; + struct tcmu_cmd_entry *entry; + int i; + struct scatterlist *sg; + struct iovec *iov; + int iov_cnt = 0; + uint32_t cmd_head; + uint64_t cdb_off; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + return -EINVAL; + + /* + * Must be a certain minimum size for response sense info, but + * also may be larger if the iov array is large. + * + * iovs = sgl_nents+1, for end-of-ring case, plus another 1 + * b/c size == offsetof one-past-element. + */ + base_command_size = max(offsetof(struct tcmu_cmd_entry, + req.iov[se_cmd->t_data_nents + 2]), + sizeof(struct tcmu_cmd_entry)); + command_size = base_command_size + + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); + + WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + + spin_lock_irq(&udev->cmdr_lock); + + mb = udev->mb_addr; + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + if ((command_size > (udev->cmdr_size / 2)) + || tcmu_cmd->data_length > (udev->data_size - 1)) + pr_warn("TCMU: Request of size %zu/%zu may be too big for %u/%zu " + "cmd/data ring buffers\n", command_size, tcmu_cmd->data_length, + udev->cmdr_size, udev->data_size); + + /* + * Cmd end-of-ring space is too small so we need space for a NOP plus + * original cmd - cmds are internally contiguous. + */ + if (head_to_end(cmd_head, udev->cmdr_size) >= command_size) + pad_size = 0; + else + pad_size = head_to_end(cmd_head, udev->cmdr_size); + cmdr_space_needed = command_size + pad_size; + + while (!is_ring_space_avail(udev, cmdr_space_needed, tcmu_cmd->data_length)) { + int ret; + DEFINE_WAIT(__wait); + + prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE); + + pr_debug("sleeping for ring space\n"); + spin_unlock_irq(&udev->cmdr_lock); + ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); + finish_wait(&udev->wait_cmdr, &__wait); + if (!ret) { + pr_warn("tcmu: command timed out\n"); + return -ETIMEDOUT; + } + + spin_lock_irq(&udev->cmdr_lock); + + /* We dropped cmdr_lock, cmd_head is stale */ + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + } + + if (pad_size) { + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); + tcmu_hdr_set_len(&entry->hdr, pad_size); + + UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + WARN_ON(cmd_head != 0); + } + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_CMD); + tcmu_hdr_set_len(&entry->hdr, command_size); + entry->cmd_id = tcmu_cmd->cmd_id; + + /* + * Fix up iovecs, and handle if allocation in data ring wrapped. + */ + iov = &entry->req.iov[0]; + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_head, udev->data_size)); + void *from = kmap_atomic(sg_page(sg)) + sg->offset; + void *to = (void *) mb + udev->data_off + udev->data_head; + + if (tcmu_cmd->se_cmd->data_direction == DMA_TO_DEVICE) { + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + /* Even iov_base is relative to mb_addr */ + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + + /* Uh oh, we wrapped the buffer. Must split sg across 2 iovs. */ + if (sg->length != copy_bytes) { + from += copy_bytes; + copy_bytes = sg->length - copy_bytes; + + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + + if (se_cmd->data_direction == DMA_TO_DEVICE) { + to = (void *) mb + udev->data_off + udev->data_head; + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + } + + kunmap_atomic(from); + } + entry->req.iov_cnt = iov_cnt; + + /* All offsets relative to mb_addr, not start of entry! */ + cdb_off = CMDR_OFF + cmd_head + base_command_size; + memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb)); + entry->req.cdb_off = cdb_off; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + spin_unlock_irq(&udev->cmdr_lock); + + /* TODO: only if FLUSH and FUA? */ + uio_event_notify(&udev->uio_info); + + mod_timer(&udev->timeout, + round_jiffies_up(jiffies + msecs_to_jiffies(TCMU_TIME_OUT))); + + return 0; +} + +static int tcmu_queue_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int ret; + + tcmu_cmd = tcmu_alloc_cmd(se_cmd); + if (!tcmu_cmd) + return -ENOMEM; + + ret = tcmu_queue_cmd_ring(tcmu_cmd); + if (ret < 0) { + pr_err("TCMU: Could not queue command\n"); + spin_lock_irq(&udev->commands_lock); + idr_remove(&udev->commands, tcmu_cmd->cmd_id); + spin_unlock_irq(&udev->commands_lock); + + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + } + + return ret; +} + +static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) +{ + struct se_cmd *se_cmd = cmd->se_cmd; + struct tcmu_dev *udev = cmd->tcmu_dev; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + /* cmd has been completed already from timeout, just reclaim data + ring space */ + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + return; + } + + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { + memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, + se_cmd->scsi_sense_length); + + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } + else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + struct scatterlist *sg; + int i; + + /* It'd be easier to look at entry's iovec again, but UAM */ + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes; + void *to; + void *from; + + copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_tail, udev->data_size)); + + to = kmap_atomic(sg_page(sg)) + sg->offset; + WARN_ON(sg->length + sg->offset > PAGE_SIZE); + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + + /* Uh oh, wrapped the data buffer for this sg's data */ + if (sg->length != copy_bytes) { + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + WARN_ON(udev->data_tail); + to += copy_bytes; + copy_bytes = sg->length - copy_bytes; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + } + + kunmap_atomic(to); + } + + } else if (se_cmd->data_direction == DMA_TO_DEVICE) { + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } else { + pr_warn("TCMU: data direction was %d!\n", se_cmd->data_direction); + } + + target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); +} + +static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) +{ + struct tcmu_mailbox *mb; + LIST_HEAD(cpl_cmds); + unsigned long flags; + int handled = 0; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { + pr_err("ring broken, not handling completions\n"); + return 0; + } + + spin_lock_irqsave(&udev->cmdr_lock, flags); + + mb = udev->mb_addr; + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + while (udev->cmdr_last_cleaned != ACCESS_ONCE(mb->cmd_tail)) { + + struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; + struct tcmu_cmd *cmd; + + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + if (tcmu_hdr_get_op(&entry->hdr) == TCMU_OP_PAD) { + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + continue; + } + WARN_ON(tcmu_hdr_get_op(&entry->hdr) != TCMU_OP_CMD); + + spin_lock(&udev->commands_lock); + cmd = idr_find(&udev->commands, entry->cmd_id); + if (cmd) + idr_remove(&udev->commands, cmd->cmd_id); + spin_unlock(&udev->commands_lock); + + if (!cmd) { + pr_err("cmd_id not found, ring is broken\n"); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + break; + } + + tcmu_handle_completion(cmd, entry); + + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + + handled++; + } + + if (mb->cmd_tail == mb->cmd_head) + del_timer(&udev->timeout); /* no more pending cmds */ + + spin_unlock_irqrestore(&udev->cmdr_lock, flags); + + wake_up(&udev->wait_cmdr); + + return handled; +} + +static int tcmu_check_expired_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + + if (!time_after(cmd->deadline, jiffies)) + return 0; + + set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); + target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); + + return 0; +} + +static void tcmu_device_timedout(unsigned long data) +{ + struct tcmu_dev *udev = (struct tcmu_dev *)data; + unsigned long flags; + int handled; + + handled = tcmu_handle_completions(udev); + + pr_warn("%d completions handled from timeout\n", handled); + + spin_lock_irqsave(&udev->commands_lock, flags); + idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL); + spin_unlock_irqrestore(&udev->commands_lock, flags); + + /* + * We don't need to wakeup threads on wait_cmdr since they have their + * own timeout. + */ +} + +static int tcmu_attach_hba(struct se_hba *hba, u32 host_id) +{ + struct tcmu_hba *tcmu_hba; + + tcmu_hba = kzalloc(sizeof(struct tcmu_hba), GFP_KERNEL); + if (!tcmu_hba) + return -ENOMEM; + + tcmu_hba->host_id = host_id; + hba->hba_ptr = tcmu_hba; + + return 0; +} + +static void tcmu_detach_hba(struct se_hba *hba) +{ + kfree(hba->hba_ptr); + hba->hba_ptr = NULL; +} + +static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) +{ + struct tcmu_dev *udev; + + udev = kzalloc(sizeof(struct tcmu_dev), GFP_KERNEL); + if (!udev) + return NULL; + + udev->name = kstrdup(name, GFP_KERNEL); + if (!udev->name) { + kfree(udev); + return NULL; + } + + udev->hba = hba; + + init_waitqueue_head(&udev->wait_cmdr); + spin_lock_init(&udev->cmdr_lock); + + idr_init(&udev->commands); + spin_lock_init(&udev->commands_lock); + + setup_timer(&udev->timeout, tcmu_device_timedout, + (unsigned long)udev); + + udev->pass_level = TCMU_PASS_ALL; + + return &udev->se_dev; +} + +static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) +{ + struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info); + + tcmu_handle_completions(tcmu_dev); + + return 0; +} + +/* + * mmap code from uio.c. Copied here because we want to hook mmap() + * and this stuff must come along. + */ +static int tcmu_find_mem_index(struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + + if (vma->vm_pgoff < MAX_UIO_MAPS) { + if (info->mem[vma->vm_pgoff].size == 0) + return -1; + return (int)vma->vm_pgoff; + } + return -1; +} + +static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + struct page *page; + unsigned long offset; + void *addr; + + int mi = tcmu_find_mem_index(vma); + if (mi < 0) + return VM_FAULT_SIGBUS; + + /* + * We need to subtract mi because userspace uses offset = N*PAGE_SIZE + * to use mem[N]. + */ + offset = (vmf->pgoff - mi) << PAGE_SHIFT; + + addr = (void *)(unsigned long)info->mem[mi].addr + offset; + if (info->mem[mi].memtype == UIO_MEM_LOGICAL) + page = virt_to_page(addr); + else + page = vmalloc_to_page(addr); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct tcmu_vm_ops = { + .fault = tcmu_vma_fault, +}; + +static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &tcmu_vm_ops; + + vma->vm_private_data = udev; + + /* Ensure the mmap is exactly the right size */ + if (vma_pages(vma) != (TCMU_RING_SIZE >> PAGE_SHIFT)) + return -EINVAL; + + return 0; +} + +static int tcmu_open(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + /* O_EXCL not supported for char devs, so fake it? */ + if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags)) + return -EBUSY; + + pr_debug("open\n"); + + return 0; +} + +static int tcmu_release(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags); + + pr_debug("close\n"); + + return 0; +} + +static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int minor) +{ + struct sk_buff *skb; + void *msg_header; + int ret; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + msg_header = genlmsg_put(skb, 0, 0, &tcmu_genl_family, 0, cmd); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + + ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + + ret = genlmsg_end(skb, msg_header); + if (ret < 0) { + nlmsg_free(skb); + return ret; + } + + ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, + TCMU_MCGRP_CONFIG, GFP_KERNEL); + + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + + return ret; +} + +static int tcmu_configure_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + struct tcmu_hba *hba = udev->hba->hba_ptr; + struct uio_info *info; + struct tcmu_mailbox *mb; + size_t size; + size_t used; + int ret = 0; + char *str; + + info = &udev->uio_info; + + size = snprintf(NULL, 0, "tcm-user/%u/%s/%s", hba->host_id, udev->name, + udev->dev_config); + size += 1; /* for \0 */ + str = kmalloc(size, GFP_KERNEL); + if (!str) + return -ENOMEM; + + used = snprintf(str, size, "tcm-user/%u/%s", hba->host_id, udev->name); + + if (udev->dev_config[0]) + snprintf(str + used, size - used, "/%s", udev->dev_config); + + info->name = str; + + udev->mb_addr = vzalloc(TCMU_RING_SIZE); + if (!udev->mb_addr) { + ret = -ENOMEM; + goto err_vzalloc; + } + + /* mailbox fits in first part of CMDR space */ + udev->cmdr_size = CMDR_SIZE - CMDR_OFF; + udev->data_off = CMDR_SIZE; + udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; + + mb = udev->mb_addr; + mb->version = 1; + mb->cmdr_off = CMDR_OFF; + mb->cmdr_size = udev->cmdr_size; + + WARN_ON(!PAGE_ALIGNED(udev->data_off)); + WARN_ON(udev->data_size % PAGE_SIZE); + + info->version = "1"; + + info->mem[0].name = "tcm-user command & data buffer"; + info->mem[0].addr = (phys_addr_t) udev->mb_addr; + info->mem[0].size = TCMU_RING_SIZE; + info->mem[0].memtype = UIO_MEM_VIRTUAL; + + info->irqcontrol = tcmu_irqcontrol; + info->irq = UIO_IRQ_CUSTOM; + + info->mmap = tcmu_mmap; + info->open = tcmu_open; + info->release = tcmu_release; + + ret = uio_register_device(tcmu_root_device, info); + if (ret) + goto err_register; + + /* Other attributes can be configured in userspace */ + dev->dev_attrib.hw_block_size = 512; + dev->dev_attrib.hw_max_sectors = 128; + dev->dev_attrib.hw_queue_depth = 128; + + ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + if (ret) + goto err_netlink; + + return 0; + +err_netlink: + uio_unregister_device(&udev->uio_info); +err_register: + vfree(udev->mb_addr); +err_vzalloc: + kfree(info->name); + + return ret; +} + +static int tcmu_check_pending_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + return -EINVAL; +} + +static void tcmu_free_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + int i; + + del_timer_sync(&udev->timeout); + + vfree(udev->mb_addr); + + /* Upper layer should drain all requests before calling this */ + spin_lock_irq(&udev->commands_lock); + i = idr_for_each(&udev->commands, tcmu_check_pending_cmd, NULL); + idr_destroy(&udev->commands); + spin_unlock_irq(&udev->commands_lock); + WARN_ON(i); + + /* Device was configured */ + if (udev->uio_info.uio_dev) { + tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + + uio_unregister_device(&udev->uio_info); + kfree(udev->uio_info.name); + kfree(udev->name); + } + + kfree(udev); +} + +enum { + Opt_dev_config, Opt_dev_size, Opt_err, Opt_pass_level, +}; + +static match_table_t tokens = { + {Opt_dev_config, "dev_config=%s"}, + {Opt_dev_size, "dev_size=%u"}, + {Opt_pass_level, "pass_level=%u"}, + {Opt_err, NULL} +}; + +static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, + const char *page, ssize_t count) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + char *orig, *ptr, *opts, *arg_p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0, token; + int arg; + + opts = kstrdup(page, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + orig = opts; + + while ((ptr = strsep(&opts, ",\n")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, tokens, args); + switch (token) { + case Opt_dev_config: + if (match_strlcpy(udev->dev_config, &args[0], + TCMU_CONFIG_LEN) == 0) { + ret = -EINVAL; + break; + } + pr_debug("TCMU: Referencing Path: %s\n", udev->dev_config); + break; + case Opt_dev_size: + arg_p = match_strdup(&args[0]); + if (!arg_p) { + ret = -ENOMEM; + break; + } + ret = kstrtoul(arg_p, 0, (unsigned long *) &udev->dev_size); + kfree(arg_p); + if (ret < 0) + pr_err("kstrtoul() failed for dev_size=\n"); + break; + case Opt_pass_level: + match_int(args, &arg); + if (arg >= TCMU_PASS_INVALID) { + pr_warn("TCMU: Invalid pass_level: %d\n", arg); + break; + } + + pr_debug("TCMU: Setting pass_level to %d\n", arg); + udev->pass_level = arg; + break; + default: + break; + } + } + + kfree(orig); + return (!ret) ? count : ret; +} + +static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + ssize_t bl = 0; + + bl = sprintf(b + bl, "Config: %s ", + udev->dev_config[0] ? udev->dev_config : "NULL"); + bl += sprintf(b + bl, "Size: %zu PassLevel: %u\n", + udev->dev_size, udev->pass_level); + + return bl; +} + +static sector_t tcmu_get_blocks(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + + return div_u64(udev->dev_size - dev->dev_attrib.block_size, + dev->dev_attrib.block_size); +} + +static sense_reason_t +tcmu_execute_rw(struct se_cmd *se_cmd, struct scatterlist *sgl, u32 sgl_nents, + enum dma_data_direction data_direction) +{ + int ret; + + ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static sense_reason_t +tcmu_pass_op(struct se_cmd *se_cmd) +{ + int ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static struct sbc_ops tcmu_sbc_ops = { + .execute_rw = tcmu_execute_rw, + .execute_sync_cache = tcmu_pass_op, + .execute_write_same = tcmu_pass_op, + .execute_write_same_unmap = tcmu_pass_op, + .execute_unmap = tcmu_pass_op, +}; + +static sense_reason_t +tcmu_parse_cdb(struct se_cmd *cmd) +{ + unsigned char *cdb = cmd->t_task_cdb; + struct tcmu_dev *udev = TCMU_DEV(cmd->se_dev); + sense_reason_t ret; + + switch (udev->pass_level) { + case TCMU_PASS_ALL: + /* We're just like pscsi, then */ + /* + * For REPORT LUNS we always need to emulate the response, for everything + * else, pass it up. + */ + switch (cdb[0]) { + case REPORT_LUNS: + cmd->execute_cmd = spc_emulate_report_luns; + break; + case READ_6: + case READ_10: + case READ_12: + case READ_16: + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + case WRITE_VERIFY: + cmd->se_cmd_flags |= SCF_SCSI_DATA_CDB; + /* FALLTHROUGH */ + default: + cmd->execute_cmd = tcmu_pass_op; + } + ret = TCM_NO_SENSE; + break; + case TCMU_PASS_IO: + ret = sbc_parse_cdb(cmd, &tcmu_sbc_ops); + break; + default: + pr_err("Unknown tcm-user pass level %d\n", udev->pass_level); + ret = TCM_CHECK_CONDITION_ABORT_CMD; + } + + return ret; +} + +static struct se_subsystem_api tcmu_template = { + .name = "user", + .inquiry_prod = "USER", + .inquiry_rev = TCMU_VERSION, + .owner = THIS_MODULE, + .transport_type = TRANSPORT_PLUGIN_VHBA_PDEV, + .attach_hba = tcmu_attach_hba, + .detach_hba = tcmu_detach_hba, + .alloc_device = tcmu_alloc_device, + .configure_device = tcmu_configure_device, + .free_device = tcmu_free_device, + .parse_cdb = tcmu_parse_cdb, + .set_configfs_dev_params = tcmu_set_configfs_dev_params, + .show_configfs_dev_params = tcmu_show_configfs_dev_params, + .get_device_type = sbc_get_device_type, + .get_blocks = tcmu_get_blocks, +}; + +static int __init tcmu_module_init(void) +{ + int ret; + + BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0); + + tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache", + sizeof(struct tcmu_cmd), + __alignof__(struct tcmu_cmd), + 0, NULL); + if (!tcmu_cmd_cache) + return -ENOMEM; + + tcmu_root_device = root_device_register("tcm_user"); + if (IS_ERR(tcmu_root_device)) { + ret = PTR_ERR(tcmu_root_device); + goto out_free_cache; + } + + ret = genl_register_family(&tcmu_genl_family); + if (ret < 0) { + goto out_unreg_device; + } + + ret = transport_subsystem_register(&tcmu_template); + if (ret) + goto out_unreg_genl; + + return 0; + +out_unreg_genl: + genl_unregister_family(&tcmu_genl_family); +out_unreg_device: + root_device_unregister(tcmu_root_device); +out_free_cache: + kmem_cache_destroy(tcmu_cmd_cache); + + return ret; +} + +static void __exit tcmu_module_exit(void) +{ + transport_subsystem_release(&tcmu_template); + genl_unregister_family(&tcmu_genl_family); + root_device_unregister(tcmu_root_device); + kmem_cache_destroy(tcmu_cmd_cache); +} + +MODULE_DESCRIPTION("TCM USER subsystem plugin"); +MODULE_AUTHOR("Shaohua Li "); +MODULE_AUTHOR("Andy Grover "); +MODULE_LICENSE("GPL"); + +module_init(tcmu_module_init); +module_exit(tcmu_module_exit); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index be88166349a1..6ebd0d1faf2e 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -371,6 +371,7 @@ header-y += swab.h header-y += synclink.h header-y += sysctl.h header-y += sysinfo.h +header-y += target_core_user.h header-y += taskstats.h header-y += tcp.h header-y += tcp_metrics.h diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h new file mode 100644 index 000000000000..7dcfbe6771b1 --- /dev/null +++ b/include/uapi/linux/target_core_user.h @@ -0,0 +1,142 @@ +#ifndef __TARGET_CORE_USER_H +#define __TARGET_CORE_USER_H + +/* This header will be used by application too */ + +#include +#include + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +#define TCMU_VERSION "1.0" + +/* + * Ring Design + * ----------- + * + * The mmaped area is divided into three parts: + * 1) The mailbox (struct tcmu_mailbox, below) + * 2) The command ring + * 3) Everything beyond the command ring (data) + * + * The mailbox tells userspace the offset of the command ring from the + * start of the shared memory region, and how big the command ring is. + * + * The kernel passes SCSI commands to userspace by putting a struct + * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking + * userspace via uio's interrupt mechanism. + * + * tcmu_cmd_entry contains a header. If the header type is PAD, + * userspace should skip hdr->length bytes (mod cmdr_size) to find the + * next cmd_entry. + * + * Otherwise, the entry will contain offsets into the mmaped area that + * contain the cdb and data buffers -- the latter accessible via the + * iov array. iov addresses are also offsets into the shared area. + * + * When userspace is completed handling the command, set + * entry->rsp.scsi_status, fill in rsp.sense_buffer if appropriate, + * and also set mailbox->cmd_tail equal to the old cmd_tail plus + * hdr->length, mod cmdr_size. If cmd_tail doesn't equal cmd_head, it + * should process the next packet the same way, and so on. + */ + +#define TCMU_MAILBOX_VERSION 1 +#define ALIGN_SIZE 64 /* Should be enough for most CPUs */ + +struct tcmu_mailbox { + __u16 version; + __u16 flags; + __u32 cmdr_off; + __u32 cmdr_size; + + __u32 cmd_head; + + /* Updated by user. On its own cacheline */ + __u32 cmd_tail __attribute__((__aligned__(ALIGN_SIZE))); + +} __packed; + +enum tcmu_opcode { + TCMU_OP_PAD = 0, + TCMU_OP_CMD, +}; + +/* + * Only a few opcodes, and length is 8-byte aligned, so use low bits for opcode. + */ +struct tcmu_cmd_entry_hdr { + __u32 len_op; +} __packed; + +#define TCMU_OP_MASK 0x7 + +static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) +{ + hdr->len_op &= ~TCMU_OP_MASK; + hdr->len_op |= (op & TCMU_OP_MASK); +} + +static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & ~TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) +{ + hdr->len_op &= TCMU_OP_MASK; + hdr->len_op |= len; +} + +/* Currently the same as SCSI_SENSE_BUFFERSIZE */ +#define TCMU_SENSE_BUFFERSIZE 96 + +struct tcmu_cmd_entry { + struct tcmu_cmd_entry_hdr hdr; + + uint16_t cmd_id; + uint16_t __pad1; + + union { + struct { + uint64_t cdb_off; + uint64_t iov_cnt; + struct iovec iov[0]; + } req; + struct { + uint8_t scsi_status; + uint8_t __pad1; + uint16_t __pad2; + uint32_t __pad3; + char sense_buffer[TCMU_SENSE_BUFFERSIZE]; + } rsp; + }; + +} __packed; + +#define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) + +enum tcmu_genl_cmd { + TCMU_CMD_UNSPEC, + TCMU_CMD_ADDED_DEVICE, + TCMU_CMD_REMOVED_DEVICE, + __TCMU_CMD_MAX, +}; +#define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) + +enum tcmu_genl_attr { + TCMU_ATTR_UNSPEC, + TCMU_ATTR_DEVICE, + TCMU_ATTR_MINOR, + __TCMU_ATTR_MAX, +}; +#define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) + +#endif -- cgit v1.2.3 From 37dd0247797b168ad1cc7f5dbec825a1ee66535b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:09 -0700 Subject: gue: Receive side for Generic UDP Encapsulation This patch adds support receiving for GUE packets in the fou module. The fou module now supports direct foo-over-udp (no encapsulation header) and GUE. To support this a type parameter is added to the fou netlink parameters. For a GUE socket we define gue_udp_recv, gue_gro_receive, and gue_gro_complete to handle the specifics of the GUE protocol. Most of the code to manage and configure sockets is common with the fou. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/gue.h | 23 ++++++ include/uapi/linux/fou.h | 7 ++ net/ipv4/fou.c | 196 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 217 insertions(+), 9 deletions(-) create mode 100644 include/net/gue.h (limited to 'include/uapi') diff --git a/include/net/gue.h b/include/net/gue.h new file mode 100644 index 000000000000..b6c332788084 --- /dev/null +++ b/include/net/gue.h @@ -0,0 +1,23 @@ +#ifndef __NET_GUE_H +#define __NET_GUE_H + +struct guehdr { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 hlen:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + hlen:4; +#else +#error "Please fix " +#endif + __u8 next_hdr; + __u16 flags; + }; + __u32 word; + }; +}; + +#endif diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index e03376de453d..8df06894da23 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -13,6 +13,7 @@ enum { FOU_ATTR_PORT, /* u16 */ FOU_ATTR_AF, /* u8 */ FOU_ATTR_IPPROTO, /* u8 */ + FOU_ATTR_TYPE, /* u8 */ __FOU_ATTR_MAX, }; @@ -27,6 +28,12 @@ enum { __FOU_CMD_MAX, }; +enum { + FOU_ENCAP_UNSPEC, + FOU_ENCAP_DIRECT, + FOU_ENCAP_GUE, +}; + #define FOU_CMD_MAX (__FOU_CMD_MAX - 1) #endif /* _UAPI_LINUX_FOU_H */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7e2126a31f2e..efa70ad44906 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ struct fou { }; struct fou_cfg { + u16 type; u8 protocol; struct udp_port_cfg udp_config; }; @@ -64,6 +66,41 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) sizeof(struct udphdr)); } +static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) +{ + struct fou *fou = fou_from_sock(sk); + size_t len; + struct guehdr *guehdr; + struct udphdr *uh; + + if (!fou) + return 1; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + if (!pskb_may_pull(skb, len)) + goto drop; + + uh = udp_hdr(skb); + guehdr = (struct guehdr *)&uh[1]; + + len += guehdr->hlen << 2; + if (!pskb_may_pull(skb, len)) + goto drop; + + if (guehdr->version != 0) + goto drop; + + if (guehdr->flags) { + /* No support yet */ + goto drop; + } + + return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len); +drop: + kfree_skb(skb); + return 0; +} + static struct sk_buff **fou_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -107,6 +144,112 @@ out_unlock: return err; } +static struct sk_buff **gue_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + const struct net_offload **offloads; + const struct net_offload *ops; + struct sk_buff **pp = NULL; + struct sk_buff *p; + u8 proto; + struct guehdr *guehdr; + unsigned int hlen, guehlen; + unsigned int off; + int flush = 1; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*guehdr); + guehdr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out; + } + + proto = guehdr->next_hdr; + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + goto out_unlock; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + hlen = off + guehlen; + if (skb_gro_header_hard(skb, hlen)) { + guehdr = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!guehdr)) + goto out_unlock; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + const struct guehdr *guehdr2; + + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + guehdr2 = (struct guehdr *)(p->data + off); + + /* Compare base GUE header to be equal (covers + * hlen, version, next_hdr, and flags. + */ + if (guehdr->word != guehdr2->word) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + /* Compare optional fields are the same. */ + if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], + guehdr->hlen << 2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + skb_gro_pull(skb, guehlen); + + /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ + skb_gro_postpull_rcsum(skb, guehdr, guehlen); + + pp = ops->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int gue_gro_complete(struct sk_buff *skb, int nhoff) +{ + const struct net_offload **offloads; + struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); + const struct net_offload *ops; + unsigned int guehlen; + u8 proto; + int err = -ENOENT; + + proto = guehdr->next_hdr; + + guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); + + rcu_read_lock(); + offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; + ops = rcu_dereference(offloads[proto]); + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) + goto out_unlock; + + err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + +out_unlock: + rcu_read_unlock(); + return err; +} + static int fou_add_to_port_list(struct fou *fou) { struct fou *fout; @@ -142,6 +285,28 @@ static void fou_release(struct fou *fou) kfree(fou); } +static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->protocol = cfg->protocol; + fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; + fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + fou->udp_offloads.ipproto = cfg->protocol; + + return 0; +} + +static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) +{ + udp_sk(sk)->encap_rcv = gue_udp_recv; + fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; + fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; + fou->udp_offloads.port = cfg->udp_config.local_udp_port; + + return 0; +} + static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { @@ -164,10 +329,24 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk = sock->sk; - /* Mark socket as an encapsulation socket. See net/ipv4/udp.c */ - fou->protocol = cfg->protocol; - fou->port = cfg->udp_config.local_udp_port; - udp_sk(sk)->encap_rcv = fou_udp_recv; + fou->port = cfg->udp_config.local_udp_port; + + /* Initial for fou type */ + switch (cfg->type) { + case FOU_ENCAP_DIRECT: + err = fou_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + case FOU_ENCAP_GUE: + err = gue_encap_init(sk, fou, cfg); + if (err) + goto error; + break; + default: + err = -EINVAL; + goto error; + } udp_sk(sk)->encap_type = 1; udp_encap_enable(); @@ -179,11 +358,6 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_allocation = GFP_ATOMIC; - fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; - fou->udp_offloads.ipproto = cfg->protocol; - if (cfg->udp_config.family == AF_INET) { err = udp_add_offload(&fou->udp_offloads); if (err) @@ -240,6 +414,7 @@ static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = { [FOU_ATTR_PORT] = { .type = NLA_U16, }, [FOU_ATTR_AF] = { .type = NLA_U8, }, [FOU_ATTR_IPPROTO] = { .type = NLA_U8, }, + [FOU_ATTR_TYPE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, @@ -267,6 +442,9 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_IPPROTO]) cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); + if (info->attrs[FOU_ATTR_TYPE]) + cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); + return 0; } -- cgit v1.2.3 From bc1fc390e1728672b5b343b85185fcc1fe41043b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 3 Oct 2014 15:48:10 -0700 Subject: ip_tunnel: Add GUE support This patch allows configuring IPIP, sit, and GRE tunnels to use GUE. This is very similar to fou excpet that we need to insert the GUE header in addition to the UDP header on transmit. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_tunnel.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7c832afdfa94..280d9e092283 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -64,6 +64,7 @@ enum { enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, + TUNNEL_ENCAP_GUE, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index d9c9dc4ffeaf..0bb8e141eacc 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -56,6 +56,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -495,6 +496,8 @@ static int ip_encap_hlen(struct ip_tunnel_encap *e) return 0; case TUNNEL_ENCAP_FOU: return sizeof(struct udphdr); + case TUNNEL_ENCAP_GUE: + return sizeof(struct udphdr) + sizeof(struct guehdr); default: return -EINVAL; } @@ -546,6 +549,15 @@ static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, skb_reset_transport_header(skb); uh = udp_hdr(skb); + if (e->type == TUNNEL_ENCAP_GUE) { + struct guehdr *guehdr = (struct guehdr *)&uh[1]; + + guehdr->version = 0; + guehdr->hlen = 0; + guehdr->flags = 0; + guehdr->next_hdr = *protocol; + } + uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); @@ -565,6 +577,7 @@ int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, case TUNNEL_ENCAP_NONE: return 0; case TUNNEL_ENCAP_FOU: + case TUNNEL_ENCAP_GUE: return fou_build_header(skb, &t->encap, t->encap_hlen, protocol, fl4); default: -- cgit v1.2.3 From 86f1152b117a404229fd6f08ec3faca779f37b92 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Wed, 13 Aug 2014 13:53:43 -0500 Subject: dm: allow active and inactive tables to share dm_devs Until this change, when loading a new DM table, DM core would re-open all of the devices in the DM table. Now, DM core will avoid redundant device opens (and closes when destroying the old table) if the old table already has a device open using the same mode. This is achieved by managing reference counts on the table_devices that DM core now stores in the mapped_device structure (rather than in the dm_table structure). So a mapped_device's active and inactive dm_tables' dm_dev lists now just point to the dm_devs stored in the mapped_device's table_devices list. This improvement in DM core's device reference counting has the side-effect of fixing a long-standing limitation of the multipath target: a DM multipath table couldn't include any paths that were unusable (failed). For example: if all paths have failed and you add a new, working, path to the table; you can't use it since the table load would fail due to it still containing failed paths. Now a re-load of a multipath table can include failed devices and when those devices become active again they can be used instantly. The device list code in dm.c isn't a straight copy/paste from the code in dm-table.c, but it's very close (aside from some variable renames). One subtle difference is that find_table_device for the tables_devices list will only match devices with the same name and mode. This is because we don't want to upgrade a device's mode in the active table when an inactive table is loaded. Access to the mapped_device structure's tables_devices list requires a mutex (tables_devices_lock), so that tables cannot be created and destroyed concurrently. Signed-off-by: Benjamin Marzinski Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 2 +- drivers/md/dm-table.c | 102 ++++++++++++---------------------- drivers/md/dm.c | 126 ++++++++++++++++++++++++++++++++++++++++++ drivers/md/dm.h | 5 +- include/uapi/linux/dm-ioctl.h | 4 +- 5 files changed, 167 insertions(+), 72 deletions(-) (limited to 'include/uapi') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 51521429fb59..0be9381365d7 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1418,7 +1418,7 @@ static void retrieve_deps(struct dm_table *table, deps->count = count; count = 0; list_for_each_entry (dd, dm_table_get_devices(table), list) - deps->dev[count++] = huge_encode_dev(dd->dm_dev.bdev->bd_dev); + deps->dev[count++] = huge_encode_dev(dd->dm_dev->bdev->bd_dev); param->data_size = param->data_start + needed; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index f9c6cb8dbcf8..b2bd1ebf4562 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -210,15 +210,16 @@ int dm_table_create(struct dm_table **result, fmode_t mode, return 0; } -static void free_devices(struct list_head *devices) +static void free_devices(struct list_head *devices, struct mapped_device *md) { struct list_head *tmp, *next; list_for_each_safe(tmp, next, devices) { struct dm_dev_internal *dd = list_entry(tmp, struct dm_dev_internal, list); - DMWARN("dm_table_destroy: dm_put_device call missing for %s", - dd->dm_dev.name); + DMWARN("%s: dm_table_destroy: dm_put_device call missing for %s", + dm_device_name(md), dd->dm_dev->name); + dm_put_table_device(md, dd->dm_dev); kfree(dd); } } @@ -247,7 +248,7 @@ void dm_table_destroy(struct dm_table *t) vfree(t->highs); /* free the device list */ - free_devices(&t->devices); + free_devices(&t->devices, t->md); dm_free_md_mempools(t->mempools); @@ -262,52 +263,12 @@ static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev) struct dm_dev_internal *dd; list_for_each_entry (dd, l, list) - if (dd->dm_dev.bdev->bd_dev == dev) + if (dd->dm_dev->bdev->bd_dev == dev) return dd; return NULL; } -/* - * Open a device so we can use it as a map destination. - */ -static int open_dev(struct dm_dev_internal *d, dev_t dev, - struct mapped_device *md) -{ - static char *_claim_ptr = "I belong to device-mapper"; - struct block_device *bdev; - - int r; - - BUG_ON(d->dm_dev.bdev); - - bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - - r = bd_link_disk_holder(bdev, dm_disk(md)); - if (r) { - blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); - return r; - } - - d->dm_dev.bdev = bdev; - return 0; -} - -/* - * Close a device that we've been using. - */ -static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) -{ - if (!d->dm_dev.bdev) - return; - - bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md)); - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); - d->dm_dev.bdev = NULL; -} - /* * If possible, this checks an area of a destination device is invalid. */ @@ -386,19 +347,17 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, struct mapped_device *md) { int r; - struct dm_dev_internal dd_new, dd_old; + struct dm_dev *old_dev, *new_dev; - dd_new = dd_old = *dd; + old_dev = dd->dm_dev; - dd_new.dm_dev.mode |= new_mode; - dd_new.dm_dev.bdev = NULL; - - r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md); + r = dm_get_table_device(md, dd->dm_dev->bdev->bd_dev, + dd->dm_dev->mode | new_mode, &new_dev); if (r) return r; - dd->dm_dev.mode |= new_mode; - close_dev(&dd_old, md); + dd->dm_dev = new_dev; + dm_put_table_device(md, old_dev); return 0; } @@ -440,27 +399,22 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, if (!dd) return -ENOMEM; - dd->dm_dev.mode = mode; - dd->dm_dev.bdev = NULL; - - if ((r = open_dev(dd, dev, t->md))) { + if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { kfree(dd); return r; } - format_dev_t(dd->dm_dev.name, dev); - atomic_set(&dd->count, 0); list_add(&dd->list, &t->devices); - } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) { + } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) return r; } atomic_inc(&dd->count); - *result = &dd->dm_dev; + *result = dd->dm_dev; return 0; } EXPORT_SYMBOL(dm_get_device); @@ -505,11 +459,23 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ void dm_put_device(struct dm_target *ti, struct dm_dev *d) { - struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal, - dm_dev); + int found = 0; + struct list_head *devices = &ti->table->devices; + struct dm_dev_internal *dd; + list_for_each_entry(dd, devices, list) { + if (dd->dm_dev == d) { + found = 1; + break; + } + } + if (!found) { + DMWARN("%s: device %s not in table devices list", + dm_device_name(ti->table->md), d->name); + return; + } if (atomic_dec_and_test(&dd->count)) { - close_dev(dd, ti->table->md); + dm_put_table_device(ti->table->md, d); list_del(&dd->list); kfree(dd); } @@ -906,7 +872,7 @@ static int dm_table_set_type(struct dm_table *t) /* Non-request-stackable devices can't be used for request-based dm */ devices = dm_table_get_devices(t); list_for_each_entry(dd, devices, list) { - if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) { + if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev->bdev))) { DMWARN("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1043,7 +1009,7 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t, struct gendisk *prev_disk = NULL, *template_disk = NULL; list_for_each_entry(dd, devices, list) { - template_disk = dd->dm_dev.bdev->bd_disk; + template_disk = dd->dm_dev->bdev->bd_disk; if (!blk_get_integrity(template_disk)) goto no_integrity; if (!match_all && !blk_integrity_is_initialized(template_disk)) @@ -1629,7 +1595,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) int r = 0; list_for_each_entry(dd, devices, list) { - struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); + struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); char b[BDEVNAME_SIZE]; if (likely(q)) @@ -1637,7 +1603,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), - bdevname(dd->dm_dev.bdev, b)); + bdevname(dd->dm_dev->bdev, b)); } list_for_each_entry(cb, &t->target_callbacks, list) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 56a2c74c9a3f..58f3927fd7cc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -142,6 +142,9 @@ struct mapped_device { */ struct dm_table *map; + struct list_head table_devices; + struct mutex table_devices_lock; + unsigned long flags; struct request_queue *queue; @@ -212,6 +215,12 @@ struct dm_md_mempools { struct bio_set *bs; }; +struct table_device { + struct list_head list; + atomic_t count; + struct dm_dev dm_dev; +}; + #define RESERVED_BIO_BASED_IOS 16 #define RESERVED_REQUEST_BASED_IOS 256 #define RESERVED_MAX_IOS 1024 @@ -669,6 +678,120 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +/* + * Open a table device so we can use it as a map destination. + */ +static int open_table_device(struct table_device *td, dev_t dev, + struct mapped_device *md) +{ + static char *_claim_ptr = "I belong to device-mapper"; + struct block_device *bdev; + + int r; + + BUG_ON(td->dm_dev.bdev); + + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); + return r; + } + + td->dm_dev.bdev = bdev; + return 0; +} + +/* + * Close a table device that we've been using. + */ +static void close_table_device(struct table_device *td, struct mapped_device *md) +{ + if (!td->dm_dev.bdev) + return; + + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + td->dm_dev.bdev = NULL; +} + +static struct table_device *find_table_device(struct list_head *l, dev_t dev, + fmode_t mode) { + struct table_device *td; + + list_for_each_entry(td, l, list) + if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) + return td; + + return NULL; +} + +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result) { + int r; + struct table_device *td; + + mutex_lock(&md->table_devices_lock); + td = find_table_device(&md->table_devices, dev, mode); + if (!td) { + td = kmalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + mutex_unlock(&md->table_devices_lock); + return -ENOMEM; + } + + td->dm_dev.mode = mode; + td->dm_dev.bdev = NULL; + + if ((r = open_table_device(td, dev, md))) { + mutex_unlock(&md->table_devices_lock); + kfree(td); + return r; + } + + format_dev_t(td->dm_dev.name, dev); + + atomic_set(&td->count, 0); + list_add(&td->list, &md->table_devices); + } + atomic_inc(&td->count); + mutex_unlock(&md->table_devices_lock); + + *result = &td->dm_dev; + return 0; +} +EXPORT_SYMBOL_GPL(dm_get_table_device); + +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) +{ + struct table_device *td = container_of(d, struct table_device, dm_dev); + + mutex_lock(&md->table_devices_lock); + if (atomic_dec_and_test(&td->count)) { + close_table_device(td, md); + list_del(&td->list); + kfree(td); + } + mutex_unlock(&md->table_devices_lock); +} +EXPORT_SYMBOL(dm_put_table_device); + +static void free_table_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct table_device *td = list_entry(tmp, struct table_device, list); + + DMWARN("dm_destroy: %s still exists with %d references", + td->dm_dev.name, atomic_read(&td->count)); + kfree(td); + } +} + /* * Get the geometry associated with a dm device */ @@ -1944,12 +2067,14 @@ static struct mapped_device *alloc_dev(int minor) md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); + mutex_init(&md->table_devices_lock); spin_lock_init(&md->deferred_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); atomic_set(&md->uevent_seq, 0); INIT_LIST_HEAD(&md->uevent_list); + INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); md->queue = blk_alloc_queue(GFP_KERNEL); @@ -2035,6 +2160,7 @@ static void free_dev(struct mapped_device *md) blk_integrity_unregister(md->disk); del_gendisk(md->disk); cleanup_srcu_struct(&md->io_barrier); + free_table_devices(&md->table_devices); free_minor(minor); spin_lock(&_minor_lock); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e81d2152fa68..988c7fb7b145 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -44,7 +44,7 @@ struct dm_dev_internal { struct list_head list; atomic_t count; - struct dm_dev dm_dev; + struct dm_dev *dm_dev; }; struct dm_table; @@ -188,6 +188,9 @@ int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); sector_t dm_get_size(struct mapped_device *md); struct request_queue *dm_get_md_queue(struct mapped_device *md); +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result); +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); struct dm_stats *dm_get_stats(struct mapped_device *md); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index c8a4302093a3..3315ab21f728 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 27 +#define DM_VERSION_MINOR 28 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2013-10-30)" +#define DM_VERSION_EXTRA "-ioctl (2014-09-17)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 67fa034194bf82a3d5ca841759d921297daa63ca Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:30 -0700 Subject: openvswitch: Add support for matching on OAM packets. Some tunnel formats have mechanisms for indicating that packets are OAM frames that should be handled specially (either as high priority or not forwarded beyond an endpoint). This provides support for allowing those types of packets to be matched. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + net/openvswitch/datapath.c | 1 + net/openvswitch/flow_netlink.c | 17 ++++++++++++----- 3 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index f7fc507d82ab..7c06106f5af5 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -309,6 +309,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */ OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ + OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9e3a2fae6a8f..f6bd93d5f435 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -369,6 +369,7 @@ static size_t key_attr_size(void) + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index f4c8daa73965..22c855fa0bc2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -346,6 +346,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_TTL] = 1, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + [OVS_TUNNEL_KEY_ATTR_OAM] = 0, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -390,6 +391,9 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_CSUM: tun_flags |= TUNNEL_CSUM; break; + case OVS_TUNNEL_KEY_ATTR_OAM: + tun_flags |= TUNNEL_OAM; + break; default: return -EINVAL; } @@ -431,21 +435,24 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; if (output->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src)) return -EMSGSIZE; if (output->ipv4_dst && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst)) return -EMSGSIZE; if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) return -EMSGSIZE; if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; nla_nest_end(skb, nla); -- cgit v1.2.3 From f0b128c1e2cc33ad104daf0f51a51e34f7763c5f Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:31 -0700 Subject: openvswitch: Wrap struct ovs_key_ipv4_tunnel in a new structure. Currently, the flow information that is matched for tunnels and the tunnel data passed around with packets is the same. However, as additional information is added this is not necessarily desirable, as in the case of pointers. This adds a new structure for tunnel metadata which currently contains only the existing struct. This change is purely internal to the kernel since the current OVS_KEY_ATTR_IPV4_TUNNEL is simply a compressed version of OVS_KEY_ATTR_TUNNEL that is translated at flow setup. Signed-off-by: Jesse Gross Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 5 +++-- net/openvswitch/datapath.h | 2 +- net/openvswitch/flow.c | 6 +++--- net/openvswitch/flow.h | 30 +++++++++++++++++------------- net/openvswitch/flow_netlink.c | 38 +++++++++++++++++++++++++++++++------- net/openvswitch/vport-gre.c | 16 +++++++++------- net/openvswitch/vport-vxlan.c | 10 +++++----- net/openvswitch/vport.c | 6 +++--- net/openvswitch/vport.h | 2 +- 10 files changed, 74 insertions(+), 43 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7c06106f5af5..6753032832e2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -294,7 +294,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ - OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 6932a42e41a2..006886dbee36 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -590,8 +590,8 @@ static int execute_set_action(struct sk_buff *skb, skb->mark = nla_get_u32(nested_attr); break; - case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->egress_tun_key = nla_data(nested_attr); + case OVS_KEY_ATTR_TUNNEL_INFO: + OVS_CB(skb)->egress_tun_info = nla_data(nested_attr); break; case OVS_KEY_ATTR_ETHERNET: @@ -778,6 +778,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); this_cpu_inc(exec_actions_level); + OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index ac3f3df96961..974135439c5c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -102,8 +102,8 @@ struct datapath { */ struct ovs_skb_cb { struct sw_flow *flow; + struct ovs_tunnel_info *egress_tun_info; struct vport *input_vport; - struct ovs_key_ipv4_tunnel *egress_tun_key; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 913bdc1a83c0..2924cb340868 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -642,12 +642,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_key) - memcpy(&key->tun_key, tun_key, sizeof(key->tun_key)); + if (tun_info) + memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); else memset(&key->tun_key, 0, sizeof(key->tun_key)); diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 0f5db4ec565d..fe5a71b81c1f 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -49,20 +49,24 @@ struct ovs_key_ipv4_tunnel { u8 ipv4_ttl; } __packed __aligned(4); /* Minimize padding. */ -static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) +struct ovs_tunnel_info { + struct ovs_key_ipv4_tunnel tunnel; +}; + +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be64 tun_id, __be16 tun_flags) { - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; + tun_info->tunnel.tun_id = tun_id; + tun_info->tunnel.ipv4_src = iph->saddr; + tun_info->tunnel.ipv4_dst = iph->daddr; + tun_info->tunnel.ipv4_tos = iph->tos; + tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.tun_flags = tun_flags; /* clear struct padding. */ - memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, + sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); } struct sw_flow_key { @@ -190,8 +194,8 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(struct ovs_key_ipv4_tunnel *tun_key, - struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, + struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 22c855fa0bc2..5d6194d9dadc 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1148,13 +1148,14 @@ out: return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); } -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len) { struct nlattr *a; a = reserve_sfa_size(sfa, nla_attr_size(len)); if (IS_ERR(a)) - return PTR_ERR(a); + return a; a->nla_type = attrtype; a->nla_len = nla_attr_size(len); @@ -1163,6 +1164,18 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, in memcpy(nla_data(a), data, len); memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + return a; +} + +static int add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len); + if (IS_ERR(a)) + return PTR_ERR(a); + return 0; } @@ -1268,6 +1281,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct ovs_tunnel_info *tun_info; + struct nlattr *a; int err, start; ovs_match_init(&match, &key, NULL); @@ -1279,8 +1294,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &match.key->tun_key, - sizeof(match.key->tun_key)); + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*tun_info)); + if (IS_ERR(a)) + return PTR_ERR(a); + + tun_info = nla_data(a); + tun_info->tunnel = key.tun_key; + add_nested_action_end(*sfa, start); return err; @@ -1563,17 +1584,20 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) int err; switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, nla_data(ovs_key), - nla_data(ovs_key)); + err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + nla_data(ovs_key)); if (err) return err; nla_nest_end(skb, start); break; + } default: if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) return -EMSGSIZE; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 309cca6e816f..fe768bd600eb 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,8 +63,10 @@ static __be16 filter_tnl_flags(__be16 flags) static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->egress_tun_key; struct tnl_ptk_info tpi; + const struct ovs_key_ipv4_tunnel *tun_key; + + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -92,7 +94,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -103,10 +105,10 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + filter_tnl_flags(tpi->flags)); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; } @@ -137,12 +139,12 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index f19539bb8adc..5fbff2c1ee49 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -58,7 +58,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) /* Called with rcu_read_lock and BH disabled. */ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) { - struct ovs_key_ipv4_tunnel tun_key; + struct ovs_tunnel_info tun_info; struct vport *vport = vs->data; struct iphdr *iph; __be64 key; @@ -66,9 +66,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); - ovs_vport_receive(vport, skb, &tun_key); + ovs_vport_receive(vport, skb, &tun_info); } static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) @@ -147,12 +147,12 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) __be16 df; int err; - if (unlikely(!OVS_CB(skb)->egress_tun_key)) { + if (unlikely(!OVS_CB(skb)->egress_tun_info)) { err = -EINVAL; goto error; } - tun_key = OVS_CB(skb)->egress_tun_key; + tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; /* Route lookup */ memset(&fl, 0, sizeof(fl)); fl.daddr = tun_key->ipv4_dst; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 5df8377fcfb1..3e50ee8a218c 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -432,7 +432,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) + struct ovs_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -445,9 +445,9 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_key = NULL; + OVS_CB(skb)->egress_tun_info = NULL; /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(tun_key, skb, &key); + error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); return; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 0efd62fef1e8..e28964aba021 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -207,7 +207,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_key_ipv4_tunnel *); + struct ovs_tunnel_info *); /* List of statically compiled vport implementations. Don't forget to also * add yours to the list at the top of vport.c. */ -- cgit v1.2.3 From f5796684069e0c71c65bce6a6d4766114aec1396 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Fri, 3 Oct 2014 15:35:33 -0700 Subject: openvswitch: Add support for Geneve tunneling. The Openvswitch implementation is completely agnostic to the options that are in use and can handle newly defined options without further work. It does this by simply matching on a byte array of options and allowing userspace to setup flows on this array. Signed-off-by: Jesse Gross Singed-off-by: Ansis Atteka Signed-off-by: Andy Zhou Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 21 ++-- include/uapi/linux/openvswitch.h | 2 + net/openvswitch/Kconfig | 11 ++ net/openvswitch/Makefile | 4 + net/openvswitch/datapath.c | 5 +- net/openvswitch/flow.c | 20 +++- net/openvswitch/flow.h | 20 +++- net/openvswitch/flow_netlink.c | 176 ++++++++++++++++++++++++----- net/openvswitch/vport-geneve.c | 236 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- net/openvswitch/vport.c | 3 + net/openvswitch/vport.h | 1 + 13 files changed, 461 insertions(+), 42 deletions(-) create mode 100644 net/openvswitch/vport-geneve.c (limited to 'include/uapi') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a9ce1556d773..5bc6edeb7143 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -86,17 +86,18 @@ struct ip_tunnel { struct gro_cells gro_cells; }; -#define TUNNEL_CSUM __cpu_to_be16(0x01) -#define TUNNEL_ROUTING __cpu_to_be16(0x02) -#define TUNNEL_KEY __cpu_to_be16(0x04) -#define TUNNEL_SEQ __cpu_to_be16(0x08) -#define TUNNEL_STRICT __cpu_to_be16(0x10) -#define TUNNEL_REC __cpu_to_be16(0x20) -#define TUNNEL_VERSION __cpu_to_be16(0x40) -#define TUNNEL_NO_KEY __cpu_to_be16(0x80) +#define TUNNEL_CSUM __cpu_to_be16(0x01) +#define TUNNEL_ROUTING __cpu_to_be16(0x02) +#define TUNNEL_KEY __cpu_to_be16(0x04) +#define TUNNEL_SEQ __cpu_to_be16(0x08) +#define TUNNEL_STRICT __cpu_to_be16(0x10) +#define TUNNEL_REC __cpu_to_be16(0x20) +#define TUNNEL_VERSION __cpu_to_be16(0x40) +#define TUNNEL_NO_KEY __cpu_to_be16(0x80) #define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) -#define TUNNEL_OAM __cpu_to_be16(0x0200) -#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OAM __cpu_to_be16(0x0200) +#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) +#define TUNNEL_OPTIONS_PRESENT __cpu_to_be16(0x0800) struct tnl_ptk_info { __be16 flags; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 6753032832e2..435eabc5ffaa 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -192,6 +192,7 @@ enum ovs_vport_type { OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ OVS_VPORT_TYPE_GRE, /* GRE tunnel. */ OVS_VPORT_TYPE_VXLAN, /* VXLAN tunnel. */ + OVS_VPORT_TYPE_GENEVE, /* Geneve tunnel. */ __OVS_VPORT_TYPE_MAX }; @@ -310,6 +311,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */ OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ + OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ __OVS_TUNNEL_KEY_ATTR_MAX }; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 6ecf491ad509..ba3bb8203b99 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -54,3 +54,14 @@ config OPENVSWITCH_VXLAN Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_GENEVE + bool "Open vSwitch Geneve tunneling support" + depends on INET + depends on OPENVSWITCH + depends on GENEVE && !(OPENVSWITCH=y && GENEVE=m) + default y + ---help--- + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 3591cb5dae91..9a33a273c375 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,10 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +ifneq ($(CONFIG_OPENVSWITCH_GENEVE),) +openvswitch-y += vport-geneve.o +endif + ifneq ($(CONFIG_OPENVSWITCH_VXLAN),) openvswitch-y += vport-vxlan.o endif diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 010125c48244..2e31d9e7f4dc 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -370,6 +370,7 @@ static size_t key_attr_size(void) + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ @@ -556,10 +557,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); - rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; + rcu_assign_pointer(flow->sf_acts, acts); + + OVS_CB(packet)->egress_tun_info = NULL; OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2924cb340868..62db02ba36bc 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -448,6 +448,9 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) int error; struct ethhdr *eth; + /* Flags are always used as part of stats */ + key->tp.flags = 0; + skb_reset_mac_header(skb); /* Link layer. We are guaranteed to have at least the 14 byte Ethernet @@ -646,10 +649,23 @@ int ovs_flow_key_extract(struct ovs_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ - if (tun_info) + if (tun_info) { memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); - else + + if (tun_info->options) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + memcpy(GENEVE_OPTS(key, tun_info->options_len), + tun_info->options, tun_info->options_len); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_opts_len = 0; memset(&key->tun_key, 0, sizeof(key->tun_key)); + } key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe5a71b81c1f..71813318c8c7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -51,11 +51,24 @@ struct ovs_key_ipv4_tunnel { struct ovs_tunnel_info { struct ovs_key_ipv4_tunnel tunnel; + struct geneve_opt *options; + u8 options_len; }; +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define GENEVE_OPTS(flow_key, opt_len) \ + ((struct geneve_opt *)((flow_key)->tun_opts + \ + FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ + opt_len)) + static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, const struct iphdr *iph, - __be64 tun_id, __be16 tun_flags) + __be64 tun_id, __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) { tun_info->tunnel.tun_id = tun_id; tun_info->tunnel.ipv4_src = iph->saddr; @@ -67,9 +80,14 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, /* clear struct padding. */ memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; } struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 5d6194d9dadc..368f23307911 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -88,18 +89,20 @@ static void update_range__(struct sw_flow_match *match, } \ } while (0) -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - do { \ - update_range__(match, offsetof(struct sw_flow_key, field), \ - len, is_mask); \ - if (is_mask) { \ - if ((match)->mask) \ - memcpy(&(match)->mask->key.field, value_p, len);\ - } else { \ - memcpy(&(match)->key->field, value_p, len); \ - } \ +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range__(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ } while (0) +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; @@ -335,6 +338,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, int rem; bool ttl = false; __be16 tun_flags = 0; + unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); @@ -347,6 +351,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, [OVS_TUNNEL_KEY_ATTR_OAM] = 0, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { @@ -355,7 +360,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return -EINVAL; } - if (ovs_tunnel_key_lens[type] != nla_len(a)) { + if (ovs_tunnel_key_lens[type] != nla_len(a) && + ovs_tunnel_key_lens[type] != -1) { OVS_NLERR("IPv4 tunnel attribute type has unexpected " " length (type=%d, length=%d, expected=%d).\n", type, nla_len(a), ovs_tunnel_key_lens[type]); @@ -394,7 +400,60 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_OAM: tun_flags |= TUNNEL_OAM; break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + tun_flags |= TUNNEL_OPTIONS_PRESENT; + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", + nla_len(a), + sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", + match->key->tun_opts_len, + nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, + true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS( + (struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, + nla_data(a), nla_len(a), + is_mask); + break; default: + OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + type); return -EINVAL; } } @@ -421,16 +480,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, return 0; } -static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key, - const struct ovs_key_ipv4_tunnel *output) +static int __ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) { - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; @@ -454,12 +508,35 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; + if (tun_opts && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; - nla_nest_end(skb, nla); return 0; } +static int ipv4_tun_to_nlattr(struct sk_buff *skb, + const struct ovs_key_ipv4_tunnel *output, + const struct geneve_opt *tun_opts, + int swkey_tun_opts_len) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ipv4_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { @@ -905,9 +982,16 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask) && - ipv4_tun_to_nlattr(skb, &swkey->tun_key, &output->tun_key)) - goto nla_put_failure; + if ((swkey->tun_key.ipv4_dst || is_mask)) { + const struct geneve_opt *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = GENEVE_OPTS(output, swkey->tun_opts_len); + + if (ipv4_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len)) + goto nla_put_failure; + } if (swkey->phy.in_port == DP_MAX_PORTS) { if (is_mask && (output->phy.in_port == 0xffff)) @@ -1290,17 +1374,55 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (err) return err; + if (key.tun_opts_len) { + struct geneve_opt *option = GENEVE_OPTS(&key, + key.tun_opts_len); + int opts_len = key.tun_opts_len; + bool crit_opt = false; + + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + }; + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); if (start < 0) return start; a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info)); + sizeof(*tun_info) + key.tun_opts_len); if (IS_ERR(a)) return PTR_ERR(a); tun_info = nla_data(a); tun_info->tunnel = key.tun_key; + tun_info->options_len = key.tun_opts_len; + + if (tun_info->options_len) { + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + memcpy((tun_info + 1), GENEVE_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); + tun_info->options = (struct geneve_opt *)(tun_info + 1); + } else { + tun_info->options = NULL; + } add_nested_action_end(*sfa, start); @@ -1592,7 +1714,9 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) return -EMSGSIZE; err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, - nla_data(ovs_key)); + tun_info->options_len ? + tun_info->options : NULL, + tun_info->options_len); if (err) return err; nla_nest_end(skb, start); diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000000..5572d482f285 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" + +/** + * struct geneve_port - Keeps track of open UDP ports + * @sock: The socket created for this port number. + * @name: vport name. + */ +struct geneve_port { + struct geneve_sock *gs; + char name[IFNAMSIZ]; +}; + +static LIST_HEAD(geneve_ports); + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + +/* Convert 64 bit tunnel ID to 24 bit VNI. */ +static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) +{ +#ifdef __BIG_ENDIAN + vni[0] = (__force __u8)(tun_id >> 16); + vni[1] = (__force __u8)(tun_id >> 8); + vni[2] = (__force __u8)tun_id; +#else + vni[0] = (__force __u8)((__force u64)tun_id >> 40); + vni[1] = (__force __u8)((__force u64)tun_id >> 48); + vni[2] = (__force __u8)((__force u64)tun_id >> 56); +#endif +} + +/* Convert 24 bit VNI to 64 bit tunnel ID. */ +static __be64 vni_to_tunnel_id(__u8 *vni) +{ +#ifdef __BIG_ENDIAN + return (vni[0] << 16) | (vni[1] << 8) | vni[2]; +#else + return (__force __be64)(((__force u64)vni[0] << 40) | + ((__force u64)vni[1] << 48) | + ((__force u64)vni[2] << 56)); +#endif +} + +static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) +{ + struct vport *vport = gs->rcv_data; + struct genevehdr *geneveh = geneve_hdr(skb); + int opts_len; + struct ovs_tunnel_info tun_info; + __be64 key; + __be16 flags; + + opts_len = geneveh->opt_len * 4; + + flags = TUNNEL_KEY | TUNNEL_OPTIONS_PRESENT | + (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | + (geneveh->oam ? TUNNEL_OAM : 0) | + (geneveh->critical ? TUNNEL_CRIT_OPT : 0); + + key = vni_to_tunnel_id(geneveh->vni); + + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + geneveh->options, opts_len); + + ovs_vport_receive(vport, skb, &tun_info); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 sport; + + sport = ntohs(inet_sk(geneve_port->gs->sock->sk)->inet_sport); + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, sport)) + return -EMSGSIZE; + return 0; +} + +static void geneve_tnl_destroy(struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + geneve_sock_release(geneve_port->gs); + + ovs_vport_deferred_free(vport); +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct geneve_sock *gs; + struct vport *vport; + struct nlattr *a; + int err; + u16 dst_port; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + strncpy(geneve_port->name, parms->name, IFNAMSIZ); + + gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); + if (IS_ERR(gs)) { + ovs_vport_free(vport); + return (void *)gs; + } + geneve_port->gs = gs; + + return vport; +error: + return ERR_PTR(err); +} + +static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +{ + struct ovs_key_ipv4_tunnel *tun_key; + struct ovs_tunnel_info *tun_info; + struct net *net = ovs_dp_get_net(vport->dp); + struct geneve_port *geneve_port = geneve_vport(vport); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport; + struct rtable *rt; + struct flowi4 fl; + u8 vni[3]; + __be16 df; + int err; + + tun_info = OVS_CB(skb)->egress_tun_info; + if (unlikely(!tun_info)) { + err = -EINVAL; + goto error; + } + + tun_key = &tun_info->tunnel; + + /* Route lookup */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_UDP; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto error; + } + + df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + tunnel_id_to_vni(tun_key->tun_id, vni); + skb->ignore_df = 1; + + err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, + tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->ipv4_ttl, df, sport, dport, + tun_key->tun_flags, vni, + tun_info->options_len, (u8 *)tun_info->options, + false); + if (err < 0) + ip_rt_put(rt); +error: + return err; +} + +static const char *geneve_get_name(const struct vport *vport) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + return geneve_port->name; +} + +const struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_tnl_create, + .destroy = geneve_tnl_destroy, + .get_name = geneve_get_name, + .get_options = geneve_get_options, + .send = geneve_tnl_send, +}; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index fe768bd600eb..108b82da2fd9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -106,7 +106,7 @@ static int gre_rcv(struct sk_buff *skb, key = key_to_tunnel_id(tpi->key, tpi->seq); ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 5fbff2c1ee49..2735e01dca73 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -66,7 +66,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY); + ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); ovs_vport_receive(vport, skb, &tun_info); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 3e50ee8a218c..53001b020ca7 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -48,6 +48,9 @@ static const struct vport_ops *vport_ops_list[] = { #ifdef CONFIG_OPENVSWITCH_VXLAN &ovs_vxlan_vport_ops, #endif +#ifdef CONFIG_OPENVSWITCH_GENEVE + &ovs_geneve_vport_ops, +#endif }; /* Protected by RCU read lock for reading, ovs_mutex for writing. */ diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e28964aba021..8942125de3a6 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -215,6 +215,7 @@ extern const struct vport_ops ovs_netdev_vport_ops; extern const struct vport_ops ovs_internal_vport_ops; extern const struct vport_ops ovs_gre_vport_ops; extern const struct vport_ops ovs_vxlan_vport_ops; +extern const struct vport_ops ovs_geneve_vport_ops; static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) -- cgit v1.2.3 From 1255a5055449781a92076fc5429952f2b33cf309 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 5 Oct 2014 12:35:21 +0300 Subject: ethtool: Ethtool parameter to dynamically change tx_copybreak Use new ethtool [sg]et_tunable() to set tx_copybread (inline threshold) Signed-off-by: Eric Dumazet Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 1 + net/core/ethtool.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7a364f2f3d3f..99b43056a6fe 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -212,6 +212,7 @@ struct ethtool_value { enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, + ETHTOOL_TX_COPYBREAK, }; enum tunable_type_id { diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 27e61b886520..1600aa24d36b 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1625,6 +1625,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: + case ETHTOOL_TX_COPYBREAK: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; -- cgit v1.2.3 From f0d1f04f0a2f662b6b617e24d115fddcf6ef8723 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 7 Oct 2014 19:02:11 +0200 Subject: netfilter: fix wrong arithmetics regarding NFT_REJECT_ICMPX_MAX NFT_REJECT_ICMPX_MAX should be __NFT_REJECT_ICMPX_MAX - 1. nft_reject_icmp_code() and nft_reject_icmpv6_code() are called from the packet path, so BUG_ON in case we try to access an unknown abstracted ICMP code. This should not happen since we already validate this from nft_reject_{inet,bridge}_init(). Fixes: 51b0a5d ("netfilter: nft_reject: introduce icmp code abstraction for inet and bridge") Reported-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nft_reject.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index c26df6787fb0..f31fe7b660a5 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -774,7 +774,7 @@ enum nft_reject_inet_code { NFT_REJECT_ICMPX_ADMIN_PROHIBITED, __NFT_REJECT_ICMPX_MAX }; -#define NFT_REJECT_ICMPX_MAX (__NFT_REJECT_ICMPX_MAX + 1) +#define NFT_REJECT_ICMPX_MAX (__NFT_REJECT_ICMPX_MAX - 1) /** * enum nft_reject_attributes - nf_tables reject expression netlink attributes diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index ec8a456092a7..57d3e1af5630 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -72,7 +72,7 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_reject_dump); -static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX] = { +static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, @@ -81,8 +81,7 @@ static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX] = { int nft_reject_icmp_code(u8 code) { - if (code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; + BUG_ON(code > NFT_REJECT_ICMPX_MAX); return icmp_code_v4[code]; } @@ -90,7 +89,7 @@ int nft_reject_icmp_code(u8 code) EXPORT_SYMBOL_GPL(nft_reject_icmp_code); -static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX] = { +static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, @@ -99,8 +98,7 @@ static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX] = { int nft_reject_icmpv6_code(u8 code) { - if (code > NFT_REJECT_ICMPX_MAX) - return -EINVAL; + BUG_ON(code > NFT_REJECT_ICMPX_MAX); return icmp_code_v6[code]; } -- cgit v1.2.3 From 66b43081c0bde3171208a7cb52f5807dce4a79e4 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:55:03 +1100 Subject: cxl: Add userspace header file This adds a header file for use by userspace programs wanting to interact with the kernel cxl driver. It defines structs and magic numbers required for userspace to interact with devices in /dev/cxl/afuM.N. Further documentation on this interface is added in a subsequent patch in Documentation/powerpc/cxl.txt. It also adds this new userspace header file to Kbuild so it's exported when doing "make headers_installs". Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- include/uapi/Kbuild | 1 + include/uapi/misc/Kbuild | 2 ++ include/uapi/misc/cxl.h | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 include/uapi/misc/Kbuild create mode 100644 include/uapi/misc/cxl.h (limited to 'include/uapi') diff --git a/include/uapi/Kbuild b/include/uapi/Kbuild index 81d2106287fe..245aa6e05e6a 100644 --- a/include/uapi/Kbuild +++ b/include/uapi/Kbuild @@ -12,3 +12,4 @@ header-y += video/ header-y += drm/ header-y += xen/ header-y += scsi/ +header-y += misc/ diff --git a/include/uapi/misc/Kbuild b/include/uapi/misc/Kbuild new file mode 100644 index 000000000000..e96cae7d58c9 --- /dev/null +++ b/include/uapi/misc/Kbuild @@ -0,0 +1,2 @@ +# misc Header export list +header-y += cxl.h diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h new file mode 100644 index 000000000000..c232be6ae21f --- /dev/null +++ b/include/uapi/misc/cxl.h @@ -0,0 +1,87 @@ +/* + * Copyright 2014 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_MISC_CXL_H +#define _UAPI_MISC_CXL_H + +#include +#include + +/* Structs for IOCTLS for userspace to talk to the kernel */ +struct cxl_ioctl_start_work { + __u64 flags; + __u64 work_element_descriptor; + __u64 amr; + __s16 num_interrupts; + __s16 reserved1; + __s32 reserved2; + __u64 reserved3; + __u64 reserved4; + __u64 reserved5; + __u64 reserved6; +}; +#define CXL_START_WORK_AMR 0x0000000000000001ULL +#define CXL_START_WORK_NUM_IRQS 0x0000000000000002ULL +#define CXL_START_WORK_ALL (CXL_START_WORK_AMR |\ + CXL_START_WORK_NUM_IRQS) + +/* IOCTL numbers */ +#define CXL_MAGIC 0xCA +#define CXL_IOCTL_START_WORK _IOW(CXL_MAGIC, 0x00, struct cxl_ioctl_start_work) +#define CXL_IOCTL_GET_PROCESS_ELEMENT _IOR(CXL_MAGIC, 0x01, __u32) + +/* Events from read() */ +#define CXL_READ_MIN_SIZE 0x1000 /* 4K */ + +enum cxl_event_type { + CXL_EVENT_RESERVED = 0, + CXL_EVENT_AFU_INTERRUPT = 1, + CXL_EVENT_DATA_STORAGE = 2, + CXL_EVENT_AFU_ERROR = 3, +}; + +struct cxl_event_header { + __u16 type; + __u16 size; + __u16 process_element; + __u16 reserved1; +}; + +struct cxl_event_afu_interrupt { + __u16 flags; + __u16 irq; /* Raised AFU interrupt number */ + __u32 reserved1; +}; + +struct cxl_event_data_storage { + __u16 flags; + __u16 reserved1; + __u32 reserved2; + __u64 addr; + __u64 dsisr; + __u64 reserved3; +}; + +struct cxl_event_afu_error { + __u16 flags; + __u16 reserved1; + __u32 reserved2; + __u64 error; +}; + +struct cxl_event { + struct cxl_event_header header; + union { + struct cxl_event_afu_interrupt irq; + struct cxl_event_data_storage fault; + struct cxl_event_afu_error afu_error; + }; +}; + +#endif /* _UAPI_MISC_CXL_H */ -- cgit v1.2.3 From a9282d01cf357379ce29103cec5e7651a53c634d Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:55:05 +1100 Subject: cxl: Add documentation for userspace APIs This documentation gives an overview of the hardware architecture, userspace APIs via /dev/cxl/afuM.N and the syfs files. It also adds a MAINTAINERS file entry for cxl. Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- Documentation/ABI/testing/sysfs-class-cxl | 129 ++++++++++ Documentation/ioctl/ioctl-number.txt | 1 + Documentation/powerpc/00-INDEX | 2 + Documentation/powerpc/cxl.txt | 379 ++++++++++++++++++++++++++++++ MAINTAINERS | 12 + include/uapi/misc/cxl.h | 7 +- 6 files changed, 527 insertions(+), 3 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-cxl create mode 100644 Documentation/powerpc/cxl.txt (limited to 'include/uapi') diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl new file mode 100644 index 000000000000..554405ec1955 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-cxl @@ -0,0 +1,129 @@ +Slave contexts (eg. /sys/class/cxl/afu0.0s): + +What: /sys/class/cxl//irqs_max +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + Decimal value of maximum number of interrupts that can be + requested by userspace. The default on probe is the maximum + that hardware can support (eg. 2037). Write values will limit + userspace applications to that many userspace interrupts. Must + be >= irqs_min. + +What: /sys/class/cxl//irqs_min +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the minimum number of interrupts that + userspace must request on a CXL_START_WORK ioctl. Userspace may + omit the num_interrupts field in the START_WORK IOCTL to get + this minimum automatically. + +What: /sys/class/cxl//mmio_size +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the size of the MMIO space that may be mmaped + by userspace. + +What: /sys/class/cxl//modes_supported +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + List of the modes this AFU supports. One per line. + Valid entries are: "dedicated_process" and "afu_directed" + +What: /sys/class/cxl//mode +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + The current mode the AFU is using. Will be one of the modes + given in modes_supported. Writing will change the mode + provided that no user contexts are attached. + + +What: /sys/class/cxl//prefault_mode +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + Set the mode for prefaulting in segments into the segment table + when performing the START_WORK ioctl. Possible values: + none: No prefaulting (default) + work_element_descriptor: Treat the work element + descriptor as an effective address and + prefault what it points to. + all: all segments process calling START_WORK maps. + +What: /sys/class/cxl//reset +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: write only + Writing 1 here will reset the AFU provided there are not + contexts active on the AFU. + +What: /sys/class/cxl//api_version +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the current version of the kernel/user API. + +What: /sys/class/cxl//api_version_com +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the the lowest version of the userspace API + this this kernel supports. + + + +Master contexts (eg. /sys/class/cxl/afu0.0m) + +What: /sys/class/cxl/m/mmio_size +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the size of the MMIO space that may be mmaped + by userspace. This includes all slave contexts space also. + +What: /sys/class/cxl/m/pp_mmio_len +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the Per Process MMIO space length. + +What: /sys/class/cxl/m/pp_mmio_off +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Decimal value of the Per Process MMIO space offset. + + +Card info (eg. /sys/class/cxl/card0) + +What: /sys/class/cxl//caia_version +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Identifies the CAIA Version the card implements. + +What: /sys/class/cxl//psl_version +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Identifies the revision level of the PSL. + +What: /sys/class/cxl//base_image +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Identifies the revision level of the base image for devices + that support loadable PSLs. For FPGAs this field identifies + the image contained in the on-adapter flash which is loaded + during the initial program load. + +What: /sys/class/cxl//image_loaded +Date: September 2014 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Will return "user" or "factory" depending on the image loaded + onto the card. diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 7e240a7c9ab1..8136e1fd30fd 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -313,6 +313,7 @@ Code Seq#(hex) Include File Comments 0xB1 00-1F PPPoX 0xB3 00 linux/mmc/ioctl.h 0xC0 00-0F linux/usb/iowarrior.h +0xCA 00-0F uapi/misc/cxl.h 0xCB 00-1F CBM serial IEC bus in development: 0xCD 01 linux/reiserfs_fs.h diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX index a68784d0a1ee..6fd0e8bb8140 100644 --- a/Documentation/powerpc/00-INDEX +++ b/Documentation/powerpc/00-INDEX @@ -11,6 +11,8 @@ bootwrapper.txt cpu_features.txt - info on how we support a variety of CPUs with minimal compile-time options. +cxl.txt + - Overview of the CXL driver. eeh-pci-error-recovery.txt - info on PCI Bus EEH Error Recovery firmware-assisted-dump.txt diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt new file mode 100644 index 000000000000..2c71ecc519d9 --- /dev/null +++ b/Documentation/powerpc/cxl.txt @@ -0,0 +1,379 @@ +Coherent Accelerator Interface (CXL) +==================================== + +Introduction +============ + + The coherent accelerator interface is designed to allow the + coherent connection of accelerators (FPGAs and other devices) to a + POWER system. These devices need to adhere to the Coherent + Accelerator Interface Architecture (CAIA). + + IBM refers to this as the Coherent Accelerator Processor Interface + or CAPI. In the kernel it's referred to by the name CXL to avoid + confusion with the ISDN CAPI subsystem. + + Coherent in this context means that the accelerator and CPUs can + both access system memory directly and with the same effective + addresses. + + +Hardware overview +================= + + POWER8 FPGA + +----------+ +---------+ + | | | | + | CPU | | AFU | + | | | | + | | | | + | | | | + +----------+ +---------+ + | PHB | | | + | +------+ | PSL | + | | CAPP |<------>| | + +---+------+ PCIE +---------+ + + The POWER8 chip has a Coherently Attached Processor Proxy (CAPP) + unit which is part of the PCIe Host Bridge (PHB). This is managed + by Linux by calls into OPAL. Linux doesn't directly program the + CAPP. + + The FPGA (or coherently attached device) consists of two parts. + The POWER Service Layer (PSL) and the Accelerator Function Unit + (AFU). The AFU is used to implement specific functionality behind + the PSL. The PSL, among other things, provides memory address + translation services to allow each AFU direct access to userspace + memory. + + The AFU is the core part of the accelerator (eg. the compression, + crypto etc function). The kernel has no knowledge of the function + of the AFU. Only userspace interacts directly with the AFU. + + The PSL provides the translation and interrupt services that the + AFU needs. This is what the kernel interacts with. For example, if + the AFU needs to read a particular effective address, it sends + that address to the PSL, the PSL then translates it, fetches the + data from memory and returns it to the AFU. If the PSL has a + translation miss, it interrupts the kernel and the kernel services + the fault. The context to which this fault is serviced is based on + who owns that acceleration function. + + +AFU Modes +========= + + There are two programming modes supported by the AFU. Dedicated + and AFU directed. AFU may support one or both modes. + + When using dedicated mode only one MMU context is supported. In + this mode, only one userspace process can use the accelerator at + time. + + When using AFU directed mode, up to 16K simultaneous contexts can + be supported. This means up to 16K simultaneous userspace + applications may use the accelerator (although specific AFUs may + support fewer). In this mode, the AFU sends a 16 bit context ID + with each of its requests. This tells the PSL which context is + associated with each operation. If the PSL can't translate an + operation, the ID can also be accessed by the kernel so it can + determine the userspace context associated with an operation. + + +MMIO space +========== + + A portion of the accelerator MMIO space can be directly mapped + from the AFU to userspace. Either the whole space can be mapped or + just a per context portion. The hardware is self describing, hence + the kernel can determine the offset and size of the per context + portion. + + +Interrupts +========== + + AFUs may generate interrupts that are destined for userspace. These + are received by the kernel as hardware interrupts and passed onto + userspace by a read syscall documented below. + + Data storage faults and error interrupts are handled by the kernel + driver. + + +Work Element Descriptor (WED) +============================= + + The WED is a 64-bit parameter passed to the AFU when a context is + started. Its format is up to the AFU hence the kernel has no + knowledge of what it represents. Typically it will be the + effective address of a work queue or status block where the AFU + and userspace can share control and status information. + + + + +User API +======== + + For AFUs operating in AFU directed mode, two character device + files will be created. /dev/cxl/afu0.0m will correspond to a + master context and /dev/cxl/afu0.0s will correspond to a slave + context. Master contexts have access to the full MMIO space an + AFU provides. Slave contexts have access to only the per process + MMIO space an AFU provides. + + For AFUs operating in dedicated process mode, the driver will + only create a single character device per AFU called + /dev/cxl/afu0.0d. This will have access to the entire MMIO space + that the AFU provides (like master contexts in AFU directed). + + The types described below are defined in include/uapi/misc/cxl.h + + The following file operations are supported on both slave and + master devices. + + +open +---- + + Opens the device and allocates a file descriptor to be used with + the rest of the API. + + A dedicated mode AFU only has one context and only allows the + device to be opened once. + + An AFU directed mode AFU can have many contexts, the device can be + opened once for each context that is available. + + When all available contexts are allocated the open call will fail + and return -ENOSPC. + + Note: IRQs need to be allocated for each context, which may limit + the number of contexts that can be created, and therefore + how many times the device can be opened. The POWER8 CAPP + supports 2040 IRQs and 3 are used by the kernel, so 2037 are + left. If 1 IRQ is needed per context, then only 2037 + contexts can be allocated. If 4 IRQs are needed per context, + then only 2037/4 = 509 contexts can be allocated. + + +ioctl +----- + + CXL_IOCTL_START_WORK: + Starts the AFU context and associates it with the current + process. Once this ioctl is successfully executed, all memory + mapped into this process is accessible to this AFU context + using the same effective addresses. No additional calls are + required to map/unmap memory. The AFU memory context will be + updated as userspace allocates and frees memory. This ioctl + returns once the AFU context is started. + + Takes a pointer to a struct cxl_ioctl_start_work: + + struct cxl_ioctl_start_work { + __u64 flags; + __u64 work_element_descriptor; + __u64 amr; + __s16 num_interrupts; + __s16 reserved1; + __s32 reserved2; + __u64 reserved3; + __u64 reserved4; + __u64 reserved5; + __u64 reserved6; + }; + + flags: + Indicates which optional fields in the structure are + valid. + + work_element_descriptor: + The Work Element Descriptor (WED) is a 64-bit argument + defined by the AFU. Typically this is an effective + address pointing to an AFU specific structure + describing what work to perform. + + amr: + Authority Mask Register (AMR), same as the powerpc + AMR. This field is only used by the kernel when the + corresponding CXL_START_WORK_AMR value is specified in + flags. If not specified the kernel will use a default + value of 0. + + num_interrupts: + Number of userspace interrupts to request. This field + is only used by the kernel when the corresponding + CXL_START_WORK_NUM_IRQS value is specified in flags. + If not specified the minimum number required by the + AFU will be allocated. The min and max number can be + obtained from sysfs. + + reserved fields: + For ABI padding and future extensions + + CXL_IOCTL_GET_PROCESS_ELEMENT: + Get the current context id, also known as the process element. + The value is returned from the kernel as a __u32. + + +mmap +---- + + An AFU may have an MMIO space to facilitate communication with the + AFU. If it does, the MMIO space can be accessed via mmap. The size + and contents of this area are specific to the particular AFU. The + size can be discovered via sysfs. + + In AFU directed mode, master contexts are allowed to map all of + the MMIO space and slave contexts are allowed to only map the per + process MMIO space associated with the context. In dedicated + process mode the entire MMIO space can always be mapped. + + This mmap call must be done after the START_WORK ioctl. + + Care should be taken when accessing MMIO space. Only 32 and 64-bit + accesses are supported by POWER8. Also, the AFU will be designed + with a specific endianness, so all MMIO accesses should consider + endianness (recommend endian(3) variants like: le64toh(), + be64toh() etc). These endian issues equally apply to shared memory + queues the WED may describe. + + +read +---- + + Reads events from the AFU. Blocks if no events are pending + (unless O_NONBLOCK is supplied). Returns -EIO in the case of an + unrecoverable error or if the card is removed. + + read() will always return an integral number of events. + + The buffer passed to read() must be at least 4K bytes. + + The result of the read will be a buffer of one or more events, + each event is of type struct cxl_event, of varying size. + + struct cxl_event { + struct cxl_event_header header; + union { + struct cxl_event_afu_interrupt irq; + struct cxl_event_data_storage fault; + struct cxl_event_afu_error afu_error; + }; + }; + + The struct cxl_event_header is defined as: + + struct cxl_event_header { + __u16 type; + __u16 size; + __u16 process_element; + __u16 reserved1; + }; + + type: + This defines the type of event. The type determines how + the rest of the event is structured. These types are + described below and defined by enum cxl_event_type. + + size: + This is the size of the event in bytes including the + struct cxl_event_header. The start of the next event can + be found at this offset from the start of the current + event. + + process_element: + Context ID of the event. + + reserved field: + For future extensions and padding. + + If the event type is CXL_EVENT_AFU_INTERRUPT then the event + structure is defined as: + + struct cxl_event_afu_interrupt { + __u16 flags; + __u16 irq; /* Raised AFU interrupt number */ + __u32 reserved1; + }; + + flags: + These flags indicate which optional fields are present + in this struct. Currently all fields are mandatory. + + irq: + The IRQ number sent by the AFU. + + reserved field: + For future extensions and padding. + + If the event type is CXL_EVENT_DATA_STORAGE then the event + structure is defined as: + + struct cxl_event_data_storage { + __u16 flags; + __u16 reserved1; + __u32 reserved2; + __u64 addr; + __u64 dsisr; + __u64 reserved3; + }; + + flags: + These flags indicate which optional fields are present in + this struct. Currently all fields are mandatory. + + address: + The address that the AFU unsuccessfully attempted to + access. Valid accesses will be handled transparently by the + kernel but invalid accesses will generate this event. + + dsisr: + This field gives information on the type of fault. It is a + copy of the DSISR from the PSL hardware when the address + fault occurred. The form of the DSISR is as defined in the + CAIA. + + reserved fields: + For future extensions + + If the event type is CXL_EVENT_AFU_ERROR then the event structure + is defined as: + + struct cxl_event_afu_error { + __u16 flags; + __u16 reserved1; + __u32 reserved2; + __u64 error; + }; + + flags: + These flags indicate which optional fields are present in + this struct. Currently all fields are Mandatory. + + error: + Error status from the AFU. Defined by the AFU. + + reserved fields: + For future extensions and padding + +Sysfs Class +=========== + + A cxl sysfs class is added under /sys/class/cxl to facilitate + enumeration and tuning of the accelerators. Its layout is + described in Documentation/ABI/testing/sysfs-class-cxl + +Udev rules +========== + + The following udev rules could be used to create a symlink to the + most logical chardev to use in any programming mode (afuX.Yd for + dedicated, afuX.Ys for afu directed), since the API is virtually + identical for each: + + SUBSYSTEM=="cxl", ATTRS{mode}=="dedicated_process", SYMLINK="cxl/%b" + SUBSYSTEM=="cxl", ATTRS{mode}=="afu_directed", \ + KERNEL=="afu[0-9]*.[0-9]*s", SYMLINK="cxl/%b" diff --git a/MAINTAINERS b/MAINTAINERS index 809ecd680d88..facc2198eaba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2711,6 +2711,18 @@ W: http://www.chelsio.com S: Supported F: drivers/net/ethernet/chelsio/cxgb4vf/ +CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER +M: Ian Munsie +M: Michael Neuling +L: linuxppc-dev@lists.ozlabs.org +S: Supported +F: drivers/misc/cxl/ +F: include/misc/cxl.h +F: include/uapi/misc/cxl.h +F: Documentation/powerpc/cxl.txt +F: Documentation/powerpc/cxl.txt +F: Documentation/ABI/testing/sysfs-class-cxl + STMMAC ETHERNET DRIVER M: Giuseppe Cavallaro L: netdev@vger.kernel.org diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h index c232be6ae21f..cd6d789b73ec 100644 --- a/include/uapi/misc/cxl.h +++ b/include/uapi/misc/cxl.h @@ -13,7 +13,7 @@ #include #include -/* Structs for IOCTLS for userspace to talk to the kernel */ + struct cxl_ioctl_start_work { __u64 flags; __u64 work_element_descriptor; @@ -26,19 +26,20 @@ struct cxl_ioctl_start_work { __u64 reserved5; __u64 reserved6; }; + #define CXL_START_WORK_AMR 0x0000000000000001ULL #define CXL_START_WORK_NUM_IRQS 0x0000000000000002ULL #define CXL_START_WORK_ALL (CXL_START_WORK_AMR |\ CXL_START_WORK_NUM_IRQS) -/* IOCTL numbers */ +/* ioctl numbers */ #define CXL_MAGIC 0xCA #define CXL_IOCTL_START_WORK _IOW(CXL_MAGIC, 0x00, struct cxl_ioctl_start_work) #define CXL_IOCTL_GET_PROCESS_ELEMENT _IOR(CXL_MAGIC, 0x01, __u32) -/* Events from read() */ #define CXL_READ_MIN_SIZE 0x1000 /* 4K */ +/* Events from read() */ enum cxl_event_type { CXL_EVENT_RESERVED = 0, CXL_EVENT_AFU_INTERRUPT = 1, -- cgit v1.2.3 From 8070361799ae1e3f4ef347bd10f0a508ac10acfb Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 6 Oct 2014 17:53:53 +0200 Subject: s390: add support for vector extension The vector extension introduces 32 128-bit vector registers and a set of instruction to operate on the vector registers. The kernel can control the use of vector registers for the problem state program with a bit in control register 0. Once enabled for a process the kernel needs to retain the content of the vector registers on context switch. The signal frame is extended to include the vector registers. Two new register sets NT_S390_VXRS_LOW and NT_S390_VXRS_HIGH are added to the regset interface for the debugger and core dumps. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/elf.h | 1 + arch/s390/include/asm/lowcore.h | 10 +- arch/s390/include/asm/nmi.h | 2 +- arch/s390/include/asm/processor.h | 1 + arch/s390/include/asm/setup.h | 3 + arch/s390/include/asm/switch_to.h | 48 +++++- arch/s390/include/uapi/asm/sigcontext.h | 20 ++- arch/s390/include/uapi/asm/types.h | 4 + arch/s390/include/uapi/asm/ucontext.h | 15 +- arch/s390/kernel/compat_linux.h | 9 + arch/s390/kernel/compat_signal.c | 212 +++++++++++++++++------ arch/s390/kernel/early.c | 2 + arch/s390/kernel/entry.h | 3 + arch/s390/kernel/nmi.c | 16 ++ arch/s390/kernel/pgm_check.S | 2 +- arch/s390/kernel/processor.c | 2 +- arch/s390/kernel/ptrace.c | 249 +++++++++++++++++++++------ arch/s390/kernel/setup.c | 9 + arch/s390/kernel/signal.c | 296 +++++++++++++++++++++++++------- arch/s390/kernel/smp.c | 3 + arch/s390/kernel/traps.c | 82 +++++++++ include/uapi/linux/elf.h | 2 + 22 files changed, 804 insertions(+), 187 deletions(-) (limited to 'include/uapi') diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 78f4f8711d58..27735ae06a78 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -102,6 +102,7 @@ #define HWCAP_S390_ETF3EH 256 #define HWCAP_S390_HIGH_GPRS 512 #define HWCAP_S390_TE 1024 +#define HWCAP_S390_VXRS 2048 /* * These are used to set parameters in the core dumps. diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 4349197ab9df..d812cf1a8177 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -310,7 +310,10 @@ struct _lowcore { /* Extended facility list */ __u64 stfle_fac_list[32]; /* 0x0f00 */ - __u8 pad_0x1000[0x11b8-0x1000]; /* 0x1000 */ + __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */ + + /* Pointer to vector register save area */ + __u64 vector_save_area_addr; /* 0x11b0 */ /* 64 bit extparam used for pfault/diag 250: defined by architecture */ __u64 ext_params2; /* 0x11B8 */ @@ -334,9 +337,10 @@ struct _lowcore { /* Transaction abort diagnostic block */ __u8 pgm_tdb[256]; /* 0x1800 */ + __u8 pad_0x1900[0x1c00-0x1900]; /* 0x1900 */ - /* align to the top of the prefix area */ - __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ + /* Software defined save area for vector registers */ + __u8 vector_save_area[1024]; /* 0x1c00 */ } __packed; #endif /* CONFIG_32BIT */ diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 35f8ec185616..3027a5a72b74 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -38,7 +38,7 @@ struct mci { __u32 pm : 1; /* 22 psw program mask and cc validity */ __u32 ia : 1; /* 23 psw instruction address validity */ __u32 fa : 1; /* 24 failing storage address validity */ - __u32 : 1; /* 25 */ + __u32 vr : 1; /* 25 vector register validity */ __u32 ec : 1; /* 26 external damage code validity */ __u32 fp : 1; /* 27 floating point register validity */ __u32 gr : 1; /* 28 general register validity */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 3d0871058306..d559bdb03d18 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -117,6 +117,7 @@ struct thread_struct { int ri_signum; #ifdef CONFIG_64BIT unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ + __vector128 *vxrs; /* Vector register save area */ #endif }; diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index dbde7c236577..7736fdd72595 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -56,6 +56,7 @@ extern void detect_memory_memblock(void); #define MACHINE_FLAG_TOPOLOGY (1UL << 14) #define MACHINE_FLAG_TE (1UL << 15) #define MACHINE_FLAG_TLB_LC (1UL << 17) +#define MACHINE_FLAG_VX (1UL << 18) #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -78,6 +79,7 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TOPOLOGY (0) #define MACHINE_HAS_TE (0) #define MACHINE_HAS_TLB_LC (0) +#define MACHINE_HAS_VX (0) #else /* CONFIG_64BIT */ #define MACHINE_HAS_IEEE (1) #define MACHINE_HAS_CSP (1) @@ -90,6 +92,7 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) +#define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) #endif /* CONFIG_64BIT */ /* diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index 18ea9e3f8142..0e0109578021 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -103,6 +103,48 @@ static inline void restore_fp_regs(freg_t *fprs) asm volatile("ld 15,%0" : : "Q" (fprs[15])); } +static inline void save_vx_regs(__vector128 *vxrs) +{ + typedef struct { __vector128 _[__NUM_VXRS]; } addrtype; + + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */ + : "=Q" (*(addrtype *) vxrs) : : "1"); +} + +static inline void restore_vx_regs(__vector128 *vxrs) +{ + typedef struct { __vector128 _[__NUM_VXRS]; } addrtype; + + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ + : : "Q" (*(addrtype *) vxrs) : "1"); +} + +static inline void save_fp_vx_regs(struct task_struct *task) +{ +#ifdef CONFIG_64BIT + if (task->thread.vxrs) + save_vx_regs(task->thread.vxrs); + else +#endif + save_fp_regs(task->thread.fp_regs.fprs); +} + +static inline void restore_fp_vx_regs(struct task_struct *task) +{ +#ifdef CONFIG_64BIT + if (task->thread.vxrs) + restore_vx_regs(task->thread.vxrs); + else +#endif + restore_fp_regs(task->thread.fp_regs.fprs); +} + static inline void save_access_regs(unsigned int *acrs) { typedef struct { int _[NUM_ACRS]; } acrstype; @@ -120,16 +162,16 @@ static inline void restore_access_regs(unsigned int *acrs) #define switch_to(prev,next,last) do { \ if (prev->mm) { \ save_fp_ctl(&prev->thread.fp_regs.fpc); \ - save_fp_regs(prev->thread.fp_regs.fprs); \ + save_fp_vx_regs(prev); \ save_access_regs(&prev->thread.acrs[0]); \ save_ri_cb(prev->thread.ri_cb); \ } \ if (next->mm) { \ + update_cr_regs(next); \ restore_fp_ctl(&next->thread.fp_regs.fpc); \ - restore_fp_regs(next->thread.fp_regs.fprs); \ + restore_fp_vx_regs(next); \ restore_access_regs(&next->thread.acrs[0]); \ restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ - update_cr_regs(next); \ } \ prev = __switch_to(prev,next); \ } while (0) diff --git a/arch/s390/include/uapi/asm/sigcontext.h b/arch/s390/include/uapi/asm/sigcontext.h index b30de9c01bbe..5f0b8d7ddb0b 100644 --- a/arch/s390/include/uapi/asm/sigcontext.h +++ b/arch/s390/include/uapi/asm/sigcontext.h @@ -7,10 +7,14 @@ #define _ASM_S390_SIGCONTEXT_H #include +#include -#define __NUM_GPRS 16 -#define __NUM_FPRS 16 -#define __NUM_ACRS 16 +#define __NUM_GPRS 16 +#define __NUM_FPRS 16 +#define __NUM_ACRS 16 +#define __NUM_VXRS 32 +#define __NUM_VXRS_LOW 16 +#define __NUM_VXRS_HIGH 16 #ifndef __s390x__ @@ -59,6 +63,16 @@ typedef struct _s390_fp_regs fpregs; } _sigregs; +typedef struct +{ +#ifndef __s390x__ + unsigned long gprs_high[__NUM_GPRS]; +#endif + unsigned long long vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + unsigned char __reserved[128]; +} _sigregs_ext; + struct sigcontext { unsigned long oldmask[_SIGCONTEXT_NSIG_WORDS]; diff --git a/arch/s390/include/uapi/asm/types.h b/arch/s390/include/uapi/asm/types.h index 038f2b9178a4..3c3951e3415b 100644 --- a/arch/s390/include/uapi/asm/types.h +++ b/arch/s390/include/uapi/asm/types.h @@ -17,6 +17,10 @@ typedef unsigned long addr_t; typedef __signed__ long saddr_t; +typedef struct { + __u32 u[4]; +} __vector128; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI_S390_TYPES_H */ diff --git a/arch/s390/include/uapi/asm/ucontext.h b/arch/s390/include/uapi/asm/ucontext.h index 3e077b2a4705..64a69aa5dde0 100644 --- a/arch/s390/include/uapi/asm/ucontext.h +++ b/arch/s390/include/uapi/asm/ucontext.h @@ -7,10 +7,15 @@ #ifndef _ASM_S390_UCONTEXT_H #define _ASM_S390_UCONTEXT_H -#define UC_EXTENDED 0x00000001 - -#ifndef __s390x__ +#define UC_GPRS_HIGH 1 /* uc_mcontext_ext has valid high gprs */ +#define UC_VXRS 2 /* uc_mcontext_ext has valid vector regs */ +/* + * The struct ucontext_extended describes how the registers are stored + * on a rt signal frame. Please note that the structure is not fixed, + * if new CPU registers are added to the user state the size of the + * struct ucontext_extended will increase. + */ struct ucontext_extended { unsigned long uc_flags; struct ucontext *uc_link; @@ -19,11 +24,9 @@ struct ucontext_extended { sigset_t uc_sigmask; /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ unsigned char __unused[128 - sizeof(sigset_t)]; - unsigned long uc_gprs_high[16]; + _sigregs_ext uc_mcontext_ext; }; -#endif - struct ucontext { unsigned long uc_flags; struct ucontext *uc_link; diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 70d4b7c4beaa..a0a886c04977 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -50,6 +50,14 @@ typedef struct _s390_fp_regs32 fpregs; } _sigregs32; +typedef struct +{ + __u32 gprs_high[__NUM_GPRS]; + __u64 vxrs_low[__NUM_VXRS_LOW]; + __vector128 vxrs_high[__NUM_VXRS_HIGH]; + __u8 __reserved[128]; +} _sigregs_ext32; + #define _SIGCONTEXT_NSIG32 64 #define _SIGCONTEXT_NSIG_BPW32 32 #define __SIGNAL_FRAMESIZE32 96 @@ -72,6 +80,7 @@ struct ucontext32 { compat_sigset_t uc_sigmask; /* Allow for uc_sigmask growth. Glibc uses a 1024-bit sigset_t. */ unsigned char __unused[128 - sizeof(compat_sigset_t)]; + _sigregs_ext32 uc_mcontext_ext; }; struct stat64_emu31; diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 598b0b42668b..009f5eb11125 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -36,17 +36,16 @@ typedef struct struct sigcontext32 sc; _sigregs32 sregs; int signo; - __u32 gprs_high[NUM_GPRS]; - __u8 retcode[S390_SYSCALL_SIZE]; + _sigregs_ext32 sregs_ext; + __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ } sigframe32; typedef struct { __u8 callee_used_stack[__SIGNAL_FRAMESIZE32]; - __u8 retcode[S390_SYSCALL_SIZE]; + __u16 svc_insn; compat_siginfo_t info; struct ucontext32 uc; - __u32 gprs_high[NUM_GPRS]; } rt_sigframe32; int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) @@ -151,6 +150,38 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) return err ? -EFAULT : 0; } +/* Store registers needed to create the signal frame */ +static void store_sigregs(void) +{ + int i; + + save_access_regs(current->thread.acrs); + save_fp_ctl(¤t->thread.fp_regs.fpc); + if (current->thread.vxrs) { + save_vx_regs(current->thread.vxrs); + for (i = 0; i < __NUM_FPRS; i++) + current->thread.fp_regs.fprs[i] = + *(freg_t *)(current->thread.vxrs + i); + } else + save_fp_regs(current->thread.fp_regs.fprs); +} + +/* Load registers after signal return */ +static void load_sigregs(void) +{ + int i; + + restore_access_regs(current->thread.acrs); + /* restore_fp_ctl is done in restore_sigregs */ + if (current->thread.vxrs) { + for (i = 0; i < __NUM_FPRS; i++) + *(freg_t *)(current->thread.vxrs + i) = + current->thread.fp_regs.fprs[i]; + restore_vx_regs(current->thread.vxrs); + } else + restore_fp_regs(current->thread.fp_regs.fprs); +} + static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) { _sigregs32 user_sregs; @@ -163,11 +194,8 @@ static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) (__u32)(regs->psw.mask & PSW_MASK_BA); for (i = 0; i < NUM_GPRS; i++) user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; - save_access_regs(current->thread.acrs); memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - save_fp_ctl(¤t->thread.fp_regs.fpc); - save_fp_regs(current->thread.fp_regs.fprs); memcpy(&user_sregs.fpregs, ¤t->thread.fp_regs, sizeof(user_sregs.fpregs)); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) @@ -207,37 +235,67 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - restore_access_regs(current->thread.acrs); memcpy(¤t->thread.fp_regs, &user_sregs.fpregs, sizeof(current->thread.fp_regs)); - restore_fp_regs(current->thread.fp_regs.fprs); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; } -static int save_sigregs_gprs_high(struct pt_regs *regs, __u32 __user *uregs) +static int save_sigregs_ext32(struct pt_regs *regs, + _sigregs_ext32 __user *sregs_ext) { __u32 gprs_high[NUM_GPRS]; + __u64 vxrs[__NUM_VXRS_LOW]; int i; + /* Save high gprs to signal stack */ for (i = 0; i < NUM_GPRS; i++) gprs_high[i] = regs->gprs[i] >> 32; - if (__copy_to_user(uregs, &gprs_high, sizeof(gprs_high))) + if (__copy_to_user(&sregs_ext->gprs_high, &gprs_high, + sizeof(sregs_ext->gprs_high))) return -EFAULT; + + /* Save vector registers to signal stack */ + if (current->thread.vxrs) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(current->thread.vxrs + i) + 1); + if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, + sizeof(sregs_ext->vxrs_low)) || + __copy_to_user(&sregs_ext->vxrs_high, + current->thread.vxrs + __NUM_VXRS_LOW, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + } return 0; } -static int restore_sigregs_gprs_high(struct pt_regs *regs, __u32 __user *uregs) +static int restore_sigregs_ext32(struct pt_regs *regs, + _sigregs_ext32 __user *sregs_ext) { __u32 gprs_high[NUM_GPRS]; + __u64 vxrs[__NUM_VXRS_LOW]; int i; - if (__copy_from_user(&gprs_high, uregs, sizeof(gprs_high))) + /* Restore high gprs from signal stack */ + if (__copy_from_user(&gprs_high, &sregs_ext->gprs_high, + sizeof(&sregs_ext->gprs_high))) return -EFAULT; for (i = 0; i < NUM_GPRS; i++) *(__u32 *)®s->gprs[i] = gprs_high[i]; + + /* Restore vector registers from signal stack */ + if (current->thread.vxrs) { + if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, + sizeof(sregs_ext->vxrs_low)) || + __copy_from_user(current->thread.vxrs + __NUM_VXRS_LOW, + &sregs_ext->vxrs_high, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + for (i = 0; i < __NUM_VXRS_LOW; i++) + *((__u64 *)(current->thread.vxrs + i) + 1) = vxrs[i]; + } return 0; } @@ -252,8 +310,9 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) set_current_blocked(&set); if (restore_sigregs32(regs, &frame->sregs)) goto badframe; - if (restore_sigregs_gprs_high(regs, frame->gprs_high)) + if (restore_sigregs_ext32(regs, &frame->sregs_ext)) goto badframe; + load_sigregs(); return regs->gprs[2]; badframe: force_sig(SIGSEGV, current); @@ -269,12 +328,13 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; set_current_blocked(&set); + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) goto badframe; - if (restore_sigregs_gprs_high(regs, frame->gprs_high)) + if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) goto badframe; - if (compat_restore_altstack(&frame->uc.uc_stack)) - goto badframe; + load_sigregs(); return regs->gprs[2]; badframe: force_sig(SIGSEGV, current); @@ -324,37 +384,64 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { int sig = ksig->sig; - sigframe32 __user *frame = get_sigframe(&ksig->ka, regs, sizeof(sigframe32)); - + sigframe32 __user *frame; + struct sigcontext32 sc; + unsigned long restorer; + size_t frame_size; + + /* + * gprs_high are always present for 31-bit compat tasks. + * The space for vector registers is only allocated if + * the machine supports it + */ + frame_size = sizeof(*frame) - sizeof(frame->sregs_ext.__reserved); + if (!MACHINE_HAS_VX) + frame_size -= sizeof(frame->sregs_ext.vxrs_low) + + sizeof(frame->sregs_ext.vxrs_high); + frame = get_sigframe(&ksig->ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; - if (__copy_to_user(&frame->sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE32)) + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) + return -EFAULT; + + /* Create struct sigcontext32 on the signal stack */ + memcpy(&sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE32); + sc.sregs = (__u32)(unsigned long __force) &frame->sregs; + if (__copy_to_user(&frame->sc, &sc, sizeof(frame->sc))) return -EFAULT; + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create _sigregs32 on the signal stack */ if (save_sigregs32(regs, &frame->sregs)) return -EFAULT; - if (save_sigregs_gprs_high(regs, frame->gprs_high)) + + /* Place signal number on stack to allow backtrace from handler. */ + if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) return -EFAULT; - if (__put_user((unsigned long) &frame->sregs, &frame->sc.sregs)) + + /* Create _sigregs_ext32 on the signal stack */ + if (save_sigregs_ext32(regs, &frame->sregs_ext)) return -EFAULT; /* Set up to return from userspace. If provided, use a stub already in userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) { - regs->gprs[14] = (__u64 __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; + restorer = (unsigned long __force) + ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - regs->gprs[14] = (__u64 __force) frame->retcode | PSW32_ADDR_AMODE; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, - (u16 __force __user *)(frame->retcode))) + /* Signal frames without vectors registers are short ! */ + __u16 __user *svc = (void *) frame + frame_size - 2; + if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) return -EFAULT; + restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; } - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __user *) frame)) - return -EFAULT; - /* Set up registers for signal handler */ + regs->gprs[14] = restorer; regs->gprs[15] = (__force __u64) frame; /* Force 31 bit amode and default user address space control. */ regs->psw.mask = PSW_MASK_BA | @@ -375,50 +462,69 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, regs->gprs[6] = task_thread_info(current)->last_break; } - /* Place signal number on stack to allow backtrace from handler. */ - if (__put_user(regs->gprs[2], (int __force __user *) &frame->signo)) - return -EFAULT; return 0; } static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { - int err = 0; - rt_sigframe32 __user *frame = get_sigframe(&ksig->ka, regs, sizeof(rt_sigframe32)); - + rt_sigframe32 __user *frame; + unsigned long restorer; + size_t frame_size; + u32 uc_flags; + + frame_size = sizeof(*frame) - + sizeof(frame->uc.uc_mcontext_ext.__reserved); + /* + * gprs_high are always present for 31-bit compat tasks. + * The space for vector registers is only allocated if + * the machine supports it + */ + uc_flags = UC_GPRS_HIGH; + if (MACHINE_HAS_VX) { + if (current->thread.vxrs) + uc_flags |= UC_VXRS; + } else + frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + + sizeof(frame->uc.uc_mcontext_ext.vxrs_high); + frame = get_sigframe(&ksig->ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; - if (copy_siginfo_to_user32(&frame->info, &ksig->info)) - return -EFAULT; - - /* Create the ucontext. */ - err |= __put_user(UC_EXTENDED, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]); - err |= save_sigregs32(regs, &frame->uc.uc_mcontext); - err |= save_sigregs_gprs_high(regs, frame->gprs_high); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) return -EFAULT; /* Set up to return from userspace. If provided, use a stub already in userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) { - regs->gprs[14] = (__u64 __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; + restorer = (unsigned long __force) + ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - regs->gprs[14] = (__u64 __force) frame->retcode | PSW32_ADDR_AMODE; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, - (u16 __force __user *)(frame->retcode))) + __u16 __user *svc = &frame->svc_insn; + if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) return -EFAULT; + restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; } - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (unsigned int __force __user *) frame)) + /* Create siginfo on the signal stack */ + if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create ucontext on the signal stack. */ + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(0, &frame->uc.uc_link) || + __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs32(regs, &frame->uc.uc_mcontext) || + __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)) || + save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) return -EFAULT; /* Set up registers for signal handler */ + regs->gprs[14] = restorer; regs->gprs[15] = (__force __u64) frame; /* Force 31 bit amode and default user address space control. */ regs->psw.mask = PSW_MASK_BA | diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index f6c66b5d0cfa..cef2879edff3 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -392,6 +392,8 @@ static __init void detect_machine_facilities(void) S390_lowcore.machine_flags |= MACHINE_FLAG_TE; if (test_facility(51)) S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC; + if (test_facility(129)) + S390_lowcore.machine_flags |= MACHINE_FLAG_VX; #endif } diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index cd68869f9504..0554b9771c9f 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -21,6 +21,8 @@ void psw_idle(struct s390_idle_data *, unsigned long); asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); +int alloc_vector_registers(struct task_struct *tsk); + void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); @@ -43,6 +45,7 @@ void special_op_exception(struct pt_regs *regs); void specification_exception(struct pt_regs *regs); void transaction_exception(struct pt_regs *regs); void translation_exception(struct pt_regs *regs); +void vector_exception(struct pt_regs *regs); void do_per_trap(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 210e1285f75a..db96b418160a 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -20,6 +20,7 @@ #include #include #include +#include struct mcck_struct { int kill_task; @@ -163,6 +164,21 @@ static int notrace s390_revalidate_registers(struct mci *mci) " ld 15,120(%0)\n" : : "a" (fpt_save_area)); } + +#ifdef CONFIG_64BIT + /* Revalidate vector registers */ + if (MACHINE_HAS_VX && current->thread.vxrs) { + if (!mci->vr) { + /* + * Vector registers can't be restored and therefore + * the process needs to be terminated. + */ + kill_task = 1; + } + restore_vx_regs((__vector128 *) + S390_lowcore.vector_save_area_addr); + } +#endif /* Revalidate access registers */ asm volatile( " lam 0,15,0(%0)" diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S index 813ec7260878..f6f8886399f6 100644 --- a/arch/s390/kernel/pgm_check.S +++ b/arch/s390/kernel/pgm_check.S @@ -49,7 +49,7 @@ PGM_CHECK_DEFAULT /* 17 */ PGM_CHECK_64BIT(transaction_exception) /* 18 */ PGM_CHECK_DEFAULT /* 19 */ PGM_CHECK_DEFAULT /* 1a */ -PGM_CHECK_DEFAULT /* 1b */ +PGM_CHECK_64BIT(vector_exception) /* 1b */ PGM_CHECK(space_switch_exception) /* 1c */ PGM_CHECK(hfp_sqrt_exception) /* 1d */ PGM_CHECK_DEFAULT /* 1e */ diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 32587cc76306..edefead3b43a 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -39,7 +39,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) { static const char *hwcap_str[] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te" + "edat", "etf3eh", "highgprs", "te", "vx" }; unsigned long n = (unsigned long) v - 1; int i; diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index fe99d6b3f185..0ecfdb3c6f8e 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -38,15 +38,6 @@ #define CREATE_TRACE_POINTS #include -enum s390_regset { - REGSET_GENERAL, - REGSET_FP, - REGSET_LAST_BREAK, - REGSET_TDB, - REGSET_SYSTEM_CALL, - REGSET_GENERAL_EXTENDED, -}; - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); @@ -55,27 +46,39 @@ void update_cr_regs(struct task_struct *task) #ifdef CONFIG_64BIT /* Take care of the enable/disable of transactional execution. */ - if (MACHINE_HAS_TE) { + if (MACHINE_HAS_TE || MACHINE_HAS_VX) { unsigned long cr, cr_new; __ctl_store(cr, 0, 0); - /* Set or clear transaction execution TXC bit 8. */ - cr_new = cr | (1UL << 55); - if (task->thread.per_flags & PER_FLAG_NO_TE) - cr_new &= ~(1UL << 55); + cr_new = cr; + if (MACHINE_HAS_TE) { + /* Set or clear transaction execution TXC bit 8. */ + cr_new |= (1UL << 55); + if (task->thread.per_flags & PER_FLAG_NO_TE) + cr_new &= ~(1UL << 55); + } + if (MACHINE_HAS_VX) { + /* Enable/disable of vector extension */ + cr_new &= ~(1UL << 17); + if (task->thread.vxrs) + cr_new |= (1UL << 17); + } if (cr_new != cr) __ctl_load(cr_new, 0, 0); - /* Set or clear transaction execution TDC bits 62 and 63. */ - __ctl_store(cr, 2, 2); - cr_new = cr & ~3UL; - if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { - if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND) - cr_new |= 1UL; - else - cr_new |= 2UL; + if (MACHINE_HAS_TE) { + /* Set/clear transaction execution TDC bits 62/63. */ + __ctl_store(cr, 2, 2); + cr_new = cr & ~3UL; + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { + if (task->thread.per_flags & + PER_FLAG_TE_ABORT_RAND_TEND) + cr_new |= 1UL; + else + cr_new |= 2UL; + } + if (cr_new != cr) + __ctl_load(cr_new, 2, 2); } - if (cr_new != cr) - __ctl_load(cr_new, 2, 2); } #endif /* Copy user specified PER registers */ @@ -926,7 +929,15 @@ static int s390_fpregs_get(struct task_struct *target, save_fp_ctl(&target->thread.fp_regs.fpc); save_fp_regs(target->thread.fp_regs.fprs); } +#ifdef CONFIG_64BIT + else if (target->thread.vxrs) { + int i; + for (i = 0; i < __NUM_VXRS_LOW; i++) + target->thread.fp_regs.fprs[i] = + *(freg_t *)(target->thread.vxrs + i); + } +#endif return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fp_regs, 0, -1); } @@ -960,9 +971,20 @@ static int s390_fpregs_set(struct task_struct *target, target->thread.fp_regs.fprs, offsetof(s390_fp_regs, fprs), -1); - if (rc == 0 && target == current) { - restore_fp_ctl(&target->thread.fp_regs.fpc); - restore_fp_regs(target->thread.fp_regs.fprs); + if (rc == 0) { + if (target == current) { + restore_fp_ctl(&target->thread.fp_regs.fpc); + restore_fp_regs(target->thread.fp_regs.fprs); + } +#ifdef CONFIG_64BIT + else if (target->thread.vxrs) { + int i; + + for (i = 0; i < __NUM_VXRS_LOW; i++) + *(freg_t *)(target->thread.vxrs + i) = + target->thread.fp_regs.fprs[i]; + } +#endif } return rc; @@ -1018,6 +1040,95 @@ static int s390_tdb_set(struct task_struct *target, return 0; } +static int s390_vxrs_active(struct task_struct *target, + const struct user_regset *regset) +{ + return !!target->thread.vxrs; +} + +static int s390_vxrs_low_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + if (target->thread.vxrs) { + if (target == current) + save_vx_regs(target->thread.vxrs); + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(target->thread.vxrs + i) + 1); + } else + memset(vxrs, 0, sizeof(vxrs)); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); +} + +static int s390_vxrs_low_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + __u64 vxrs[__NUM_VXRS_LOW]; + int i, rc; + + if (!target->thread.vxrs) { + rc = alloc_vector_registers(target); + if (rc) + return rc; + } else if (target == current) + save_vx_regs(target->thread.vxrs); + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); + if (rc == 0) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + *((__u64 *)(target->thread.vxrs + i) + 1) = vxrs[i]; + if (target == current) + restore_vx_regs(target->thread.vxrs); + } + + return rc; +} + +static int s390_vxrs_high_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + __vector128 vxrs[__NUM_VXRS_HIGH]; + + if (target->thread.vxrs) { + if (target == current) + save_vx_regs(target->thread.vxrs); + memcpy(vxrs, target->thread.vxrs + __NUM_VXRS_LOW, + sizeof(vxrs)); + } else + memset(vxrs, 0, sizeof(vxrs)); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); +} + +static int s390_vxrs_high_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int rc; + + if (!target->thread.vxrs) { + rc = alloc_vector_registers(target); + if (rc) + return rc; + } else if (target == current) + save_vx_regs(target->thread.vxrs); + + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + target->thread.vxrs + __NUM_VXRS_LOW, 0, -1); + if (rc == 0 && target == current) + restore_vx_regs(target->thread.vxrs); + + return rc; +} + #endif static int s390_system_call_get(struct task_struct *target, @@ -1041,7 +1152,7 @@ static int s390_system_call_set(struct task_struct *target, } static const struct user_regset s390_regsets[] = { - [REGSET_GENERAL] = { + { .core_note_type = NT_PRSTATUS, .n = sizeof(s390_regs) / sizeof(long), .size = sizeof(long), @@ -1049,7 +1160,7 @@ static const struct user_regset s390_regsets[] = { .get = s390_regs_get, .set = s390_regs_set, }, - [REGSET_FP] = { + { .core_note_type = NT_PRFPREG, .n = sizeof(s390_fp_regs) / sizeof(long), .size = sizeof(long), @@ -1057,8 +1168,16 @@ static const struct user_regset s390_regsets[] = { .get = s390_fpregs_get, .set = s390_fpregs_set, }, + { + .core_note_type = NT_S390_SYSTEM_CALL, + .n = 1, + .size = sizeof(unsigned int), + .align = sizeof(unsigned int), + .get = s390_system_call_get, + .set = s390_system_call_set, + }, #ifdef CONFIG_64BIT - [REGSET_LAST_BREAK] = { + { .core_note_type = NT_S390_LAST_BREAK, .n = 1, .size = sizeof(long), @@ -1066,7 +1185,7 @@ static const struct user_regset s390_regsets[] = { .get = s390_last_break_get, .set = s390_last_break_set, }, - [REGSET_TDB] = { + { .core_note_type = NT_S390_TDB, .n = 1, .size = 256, @@ -1074,15 +1193,25 @@ static const struct user_regset s390_regsets[] = { .get = s390_tdb_get, .set = s390_tdb_set, }, -#endif - [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_S390_SYSTEM_CALL, - .n = 1, - .size = sizeof(unsigned int), - .align = sizeof(unsigned int), - .get = s390_system_call_get, - .set = s390_system_call_set, + { + .core_note_type = NT_S390_VXRS_LOW, + .n = __NUM_VXRS_LOW, + .size = sizeof(__u64), + .align = sizeof(__u64), + .active = s390_vxrs_active, + .get = s390_vxrs_low_get, + .set = s390_vxrs_low_set, + }, + { + .core_note_type = NT_S390_VXRS_HIGH, + .n = __NUM_VXRS_HIGH, + .size = sizeof(__vector128), + .align = sizeof(__vector128), + .active = s390_vxrs_active, + .get = s390_vxrs_high_get, + .set = s390_vxrs_high_set, }, +#endif }; static const struct user_regset_view user_s390_view = { @@ -1247,7 +1376,7 @@ static int s390_compat_last_break_set(struct task_struct *target, } static const struct user_regset s390_compat_regsets[] = { - [REGSET_GENERAL] = { + { .core_note_type = NT_PRSTATUS, .n = sizeof(s390_compat_regs) / sizeof(compat_long_t), .size = sizeof(compat_long_t), @@ -1255,7 +1384,7 @@ static const struct user_regset s390_compat_regsets[] = { .get = s390_compat_regs_get, .set = s390_compat_regs_set, }, - [REGSET_FP] = { + { .core_note_type = NT_PRFPREG, .n = sizeof(s390_fp_regs) / sizeof(compat_long_t), .size = sizeof(compat_long_t), @@ -1263,7 +1392,15 @@ static const struct user_regset s390_compat_regsets[] = { .get = s390_fpregs_get, .set = s390_fpregs_set, }, - [REGSET_LAST_BREAK] = { + { + .core_note_type = NT_S390_SYSTEM_CALL, + .n = 1, + .size = sizeof(compat_uint_t), + .align = sizeof(compat_uint_t), + .get = s390_system_call_get, + .set = s390_system_call_set, + }, + { .core_note_type = NT_S390_LAST_BREAK, .n = 1, .size = sizeof(long), @@ -1271,7 +1408,7 @@ static const struct user_regset s390_compat_regsets[] = { .get = s390_compat_last_break_get, .set = s390_compat_last_break_set, }, - [REGSET_TDB] = { + { .core_note_type = NT_S390_TDB, .n = 1, .size = 256, @@ -1279,15 +1416,25 @@ static const struct user_regset s390_compat_regsets[] = { .get = s390_tdb_get, .set = s390_tdb_set, }, - [REGSET_SYSTEM_CALL] = { - .core_note_type = NT_S390_SYSTEM_CALL, - .n = 1, - .size = sizeof(compat_uint_t), - .align = sizeof(compat_uint_t), - .get = s390_system_call_get, - .set = s390_system_call_set, + { + .core_note_type = NT_S390_VXRS_LOW, + .n = __NUM_VXRS_LOW, + .size = sizeof(__u64), + .align = sizeof(__u64), + .active = s390_vxrs_active, + .get = s390_vxrs_low_get, + .set = s390_vxrs_low_set, + }, + { + .core_note_type = NT_S390_VXRS_HIGH, + .n = __NUM_VXRS_HIGH, + .size = sizeof(__vector128), + .align = sizeof(__vector128), + .active = s390_vxrs_active, + .get = s390_vxrs_high_get, + .set = s390_vxrs_high_set, }, - [REGSET_GENERAL_EXTENDED] = { + { .core_note_type = NT_S390_HIGH_GPRS, .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), .size = sizeof(compat_long_t), diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index cdfc060dd319..e80d9ff9a56d 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -343,6 +343,9 @@ static void __init setup_lowcore(void) __ctl_set_bit(14, 29); } #else + if (MACHINE_HAS_VX) + lc->vector_save_area_addr = + (unsigned long) &lc->vector_save_area; lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0]; #endif lc->sync_enter_timer = S390_lowcore.sync_enter_timer; @@ -765,6 +768,12 @@ static void __init setup_hwcaps(void) */ if (test_facility(50) && test_facility(73)) elf_hwcap |= HWCAP_S390_TE; + + /* + * Vector extension HWCAP_S390_VXRS is bit 11. + */ + if (test_facility(129)) + elf_hwcap |= HWCAP_S390_VXRS; #endif get_cpu_id(&cpu_id); diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 469c4c6d9182..0c1a0ff0a558 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -31,30 +31,117 @@ #include #include "entry.h" -typedef struct +/* + * Layout of an old-style signal-frame: + * ----------------------------------------- + * | save area (_SIGNAL_FRAMESIZE) | + * ----------------------------------------- + * | struct sigcontext | + * | oldmask | + * | _sigregs * | + * ----------------------------------------- + * | _sigregs with | + * | _s390_regs_common | + * | _s390_fp_regs | + * ----------------------------------------- + * | int signo | + * ----------------------------------------- + * | _sigregs_ext with | + * | gprs_high 64 byte (opt) | + * | vxrs_low 128 byte (opt) | + * | vxrs_high 256 byte (opt) | + * | reserved 128 byte (opt) | + * ----------------------------------------- + * | __u16 svc_insn | + * ----------------------------------------- + * The svc_insn entry with the sigreturn system call opcode does not + * have a fixed position and moves if gprs_high or vxrs exist. + * Future extensions will be added to _sigregs_ext. + */ +struct sigframe { __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; struct sigcontext sc; _sigregs sregs; int signo; - __u8 retcode[S390_SYSCALL_SIZE]; -} sigframe; + _sigregs_ext sregs_ext; + __u16 svc_insn; /* Offset of svc_insn is NOT fixed! */ +}; -typedef struct +/* + * Layout of an rt signal-frame: + * ----------------------------------------- + * | save area (_SIGNAL_FRAMESIZE) | + * ----------------------------------------- + * | svc __NR_rt_sigreturn 2 byte | + * ----------------------------------------- + * | struct siginfo | + * ----------------------------------------- + * | struct ucontext_extended with | + * | unsigned long uc_flags | + * | struct ucontext *uc_link | + * | stack_t uc_stack | + * | _sigregs uc_mcontext with | + * | _s390_regs_common | + * | _s390_fp_regs | + * | sigset_t uc_sigmask | + * | _sigregs_ext uc_mcontext_ext | + * | gprs_high 64 byte (opt) | + * | vxrs_low 128 byte (opt) | + * | vxrs_high 256 byte (opt)| + * | reserved 128 byte (opt) | + * ----------------------------------------- + * Future extensions will be added to _sigregs_ext. + */ +struct rt_sigframe { __u8 callee_used_stack[__SIGNAL_FRAMESIZE]; - __u8 retcode[S390_SYSCALL_SIZE]; + __u16 svc_insn; struct siginfo info; - struct ucontext uc; -} rt_sigframe; + struct ucontext_extended uc; +}; + +/* Store registers needed to create the signal frame */ +static void store_sigregs(void) +{ + save_access_regs(current->thread.acrs); + save_fp_ctl(¤t->thread.fp_regs.fpc); +#ifdef CONFIG_64BIT + if (current->thread.vxrs) { + int i; + + save_vx_regs(current->thread.vxrs); + for (i = 0; i < __NUM_FPRS; i++) + current->thread.fp_regs.fprs[i] = + *(freg_t *)(current->thread.vxrs + i); + } else +#endif + save_fp_regs(current->thread.fp_regs.fprs); +} + +/* Load registers after signal return */ +static void load_sigregs(void) +{ + restore_access_regs(current->thread.acrs); + /* restore_fp_ctl is done in restore_sigregs */ +#ifdef CONFIG_64BIT + if (current->thread.vxrs) { + int i; + + for (i = 0; i < __NUM_FPRS; i++) + *(freg_t *)(current->thread.vxrs + i) = + current->thread.fp_regs.fprs[i]; + restore_vx_regs(current->thread.vxrs); + } else +#endif + restore_fp_regs(current->thread.fp_regs.fprs); +} /* Returns non-zero on fault. */ static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) { _sigregs user_sregs; - save_access_regs(current->thread.acrs); - /* Copy a 'clean' PSW mask to the user to avoid leaking information about whether PER is currently on. */ user_sregs.regs.psw.mask = PSW_USER_BITS | @@ -63,12 +150,6 @@ static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - /* - * We have to store the fp registers to current->thread.fp_regs - * to merge them with the emulated registers. - */ - save_fp_ctl(¤t->thread.fp_regs.fpc); - save_fp_regs(current->thread.fp_regs.fprs); memcpy(&user_sregs.fpregs, ¤t->thread.fp_regs, sizeof(user_sregs.fpregs)); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) @@ -107,20 +188,64 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(®s->gprs, &user_sregs.regs.gprs, sizeof(sregs->regs.gprs)); memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - restore_access_regs(current->thread.acrs); memcpy(¤t->thread.fp_regs, &user_sregs.fpregs, sizeof(current->thread.fp_regs)); - restore_fp_regs(current->thread.fp_regs.fprs); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; } +/* Returns non-zero on fault. */ +static int save_sigregs_ext(struct pt_regs *regs, + _sigregs_ext __user *sregs_ext) +{ +#ifdef CONFIG_64BIT + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Save vector registers to signal stack */ + if (current->thread.vxrs) { + for (i = 0; i < __NUM_VXRS_LOW; i++) + vxrs[i] = *((__u64 *)(current->thread.vxrs + i) + 1); + if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, + sizeof(sregs_ext->vxrs_low)) || + __copy_to_user(&sregs_ext->vxrs_high, + current->thread.vxrs + __NUM_VXRS_LOW, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + } +#endif + return 0; +} + +static int restore_sigregs_ext(struct pt_regs *regs, + _sigregs_ext __user *sregs_ext) +{ +#ifdef CONFIG_64BIT + __u64 vxrs[__NUM_VXRS_LOW]; + int i; + + /* Restore vector registers from signal stack */ + if (current->thread.vxrs) { + if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, + sizeof(sregs_ext->vxrs_low)) || + __copy_from_user(current->thread.vxrs + __NUM_VXRS_LOW, + &sregs_ext->vxrs_high, + sizeof(sregs_ext->vxrs_high))) + return -EFAULT; + for (i = 0; i < __NUM_VXRS_LOW; i++) + *((__u64 *)(current->thread.vxrs + i) + 1) = vxrs[i]; + } +#endif + return 0; +} + SYSCALL_DEFINE0(sigreturn) { struct pt_regs *regs = task_pt_regs(current); - sigframe __user *frame = (sigframe __user *)regs->gprs[15]; + struct sigframe __user *frame = + (struct sigframe __user *) regs->gprs[15]; sigset_t set; if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) @@ -128,6 +253,9 @@ SYSCALL_DEFINE0(sigreturn) set_current_blocked(&set); if (restore_sigregs(regs, &frame->sregs)) goto badframe; + if (restore_sigregs_ext(regs, &frame->sregs_ext)) + goto badframe; + load_sigregs(); return regs->gprs[2]; badframe: force_sig(SIGSEGV, current); @@ -137,27 +265,26 @@ badframe: SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = task_pt_regs(current); - rt_sigframe __user *frame = (rt_sigframe __user *)regs->gprs[15]; + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *)regs->gprs[15]; sigset_t set; if (__copy_from_user(&set.sig, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; set_current_blocked(&set); + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; if (restore_sigregs(regs, &frame->uc.uc_mcontext)) goto badframe; - if (restore_altstack(&frame->uc.uc_stack)) + if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) goto badframe; + load_sigregs(); return regs->gprs[2]; badframe: force_sig(SIGSEGV, current); return 0; } -/* - * Set up a signal frame. - */ - - /* * Determine which stack to use.. */ @@ -195,39 +322,63 @@ static inline int map_signal(int sig) static int setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs) { - sigframe __user *frame; - - frame = get_sigframe(ka, regs, sizeof(sigframe)); + struct sigframe __user *frame; + struct sigcontext sc; + unsigned long restorer; + size_t frame_size; + /* + * gprs_high are only present for a 31-bit task running on + * a 64-bit kernel (see compat_signal.c) but the space for + * gprs_high need to be allocated if vector registers are + * included in the signal frame on a 31-bit system. + */ + frame_size = sizeof(*frame) - sizeof(frame->sregs_ext); + if (MACHINE_HAS_VX) + frame_size += sizeof(frame->sregs_ext); + frame = get_sigframe(ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; - if (__copy_to_user(&frame->sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE)) + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (addr_t __user *) frame)) return -EFAULT; + /* Create struct sigcontext on the signal stack */ + memcpy(&sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE); + sc.sregs = (_sigregs __user __force *) &frame->sregs; + if (__copy_to_user(&frame->sc, &sc, sizeof(frame->sc))) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create _sigregs on the signal stack */ if (save_sigregs(regs, &frame->sregs)) return -EFAULT; - if (__put_user(&frame->sregs, &frame->sc.sregs)) + + /* Place signal number on stack to allow backtrace from handler. */ + if (__put_user(regs->gprs[2], (int __user *) &frame->signo)) + return -EFAULT; + + /* Create _sigregs_ext on the signal stack */ + if (save_sigregs_ext(regs, &frame->sregs_ext)) return -EFAULT; /* Set up to return from userspace. If provided, use a stub already in userspace. */ if (ka->sa.sa_flags & SA_RESTORER) { - regs->gprs[14] = (unsigned long) - ka->sa.sa_restorer | PSW_ADDR_AMODE; + restorer = (unsigned long) ka->sa.sa_restorer | PSW_ADDR_AMODE; } else { - regs->gprs[14] = (unsigned long) - frame->retcode | PSW_ADDR_AMODE; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, - (u16 __user *)(frame->retcode))) + /* Signal frame without vector registers are short ! */ + __u16 __user *svc = (void *) frame + frame_size - 2; + if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) return -EFAULT; + restorer = (unsigned long) svc | PSW_ADDR_AMODE; } - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (addr_t __user *) frame)) - return -EFAULT; - /* Set up registers for signal handler */ + regs->gprs[14] = restorer; regs->gprs[15] = (unsigned long) frame; /* Force default amode and default user address space control. */ regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA | @@ -247,54 +398,69 @@ static int setup_frame(int sig, struct k_sigaction *ka, regs->gprs[5] = regs->int_parm_long; regs->gprs[6] = task_thread_info(current)->last_break; } - - /* Place signal number on stack to allow backtrace from handler. */ - if (__put_user(regs->gprs[2], (int __user *) &frame->signo)) - return -EFAULT; return 0; } static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { - int err = 0; - rt_sigframe __user *frame; - - frame = get_sigframe(&ksig->ka, regs, sizeof(rt_sigframe)); + struct rt_sigframe __user *frame; + unsigned long uc_flags, restorer; + size_t frame_size; + frame_size = sizeof(struct rt_sigframe) - sizeof(_sigregs_ext); + /* + * gprs_high are only present for a 31-bit task running on + * a 64-bit kernel (see compat_signal.c) but the space for + * gprs_high need to be allocated if vector registers are + * included in the signal frame on a 31-bit system. + */ + uc_flags = 0; +#ifdef CONFIG_64BIT + if (MACHINE_HAS_VX) { + frame_size += sizeof(_sigregs_ext); + if (current->thread.vxrs) + uc_flags |= UC_VXRS; + } +#endif + frame = get_sigframe(&ksig->ka, regs, frame_size); if (frame == (void __user *) -1UL) return -EFAULT; - if (copy_siginfo_to_user(&frame->info, &ksig->info)) - return -EFAULT; - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(NULL, &frame->uc.uc_link); - err |= __save_altstack(&frame->uc.uc_stack, regs->gprs[15]); - err |= save_sigregs(regs, &frame->uc.uc_mcontext); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) + /* Set up backchain. */ + if (__put_user(regs->gprs[15], (addr_t __user *) frame)) return -EFAULT; /* Set up to return from userspace. If provided, use a stub already in userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) { - regs->gprs[14] = (unsigned long) + restorer = (unsigned long) ksig->ka.sa.sa_restorer | PSW_ADDR_AMODE; } else { - regs->gprs[14] = (unsigned long) - frame->retcode | PSW_ADDR_AMODE; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, - (u16 __user *)(frame->retcode))) + __u16 __user *svc = &frame->svc_insn; + if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) return -EFAULT; + restorer = (unsigned long) svc | PSW_ADDR_AMODE; } - /* Set up backchain. */ - if (__put_user(regs->gprs[15], (addr_t __user *) frame)) + /* Create siginfo on the signal stack */ + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + + /* Store registers needed to create the signal frame */ + store_sigregs(); + + /* Create ucontext on the signal stack. */ + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(NULL, &frame->uc.uc_link) || + __save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs(regs, &frame->uc.uc_mcontext) || + __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)) || + save_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) return -EFAULT; /* Set up registers for signal handler */ + regs->gprs[14] = restorer; regs->gprs[15] = (unsigned long) frame; /* Force default amode and default user address space control. */ regs->psw.mask = PSW_MASK_EA | PSW_MASK_BA | diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index bba0e2469254..13cae5b152d7 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -179,6 +179,9 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) goto out; } #else + if (MACHINE_HAS_VX) + lc->vector_save_area_addr = + (unsigned long) &lc->vector_save_area; if (vdso_alloc_per_cpu(lc)) goto out; #endif diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index e3e06a4fdfce..9ff5ecba26ab 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "entry.h" int show_unhandled_signals = 1; @@ -303,6 +305,74 @@ DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, "specification exception"); #endif +#ifdef CONFIG_64BIT +int alloc_vector_registers(struct task_struct *tsk) +{ + __vector128 *vxrs; + int i; + + /* Allocate vector register save area. */ + vxrs = kzalloc(sizeof(__vector128) * __NUM_VXRS, + GFP_KERNEL|__GFP_REPEAT); + if (!vxrs) + return -ENOMEM; + preempt_disable(); + if (tsk == current) + save_fp_regs(tsk->thread.fp_regs.fprs); + /* Copy the 16 floating point registers */ + for (i = 0; i < 16; i++) + *(freg_t *) &vxrs[i] = tsk->thread.fp_regs.fprs[i]; + tsk->thread.vxrs = vxrs; + if (tsk == current) { + __ctl_set_bit(0, 17); + restore_vx_regs(vxrs); + } + preempt_enable(); + return 0; +} + +void vector_exception(struct pt_regs *regs) +{ + int si_code, vic; + + if (!MACHINE_HAS_VX) { + do_trap(regs, SIGILL, ILL_ILLOPN, "illegal operation"); + return; + } + + /* get vector interrupt code from fpc */ + asm volatile("stfpc %0" : "=m" (current->thread.fp_regs.fpc)); + vic = (current->thread.fp_regs.fpc & 0xf00) >> 8; + switch (vic) { + case 1: /* invalid vector operation */ + si_code = FPE_FLTINV; + break; + case 2: /* division by zero */ + si_code = FPE_FLTDIV; + break; + case 3: /* overflow */ + si_code = FPE_FLTOVF; + break; + case 4: /* underflow */ + si_code = FPE_FLTUND; + break; + case 5: /* inexact */ + si_code = FPE_FLTRES; + break; + default: /* unknown cause */ + si_code = 0; + } + do_trap(regs, SIGFPE, si_code, "vector exception"); +} + +static int __init disable_vector_extension(char *str) +{ + S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; + return 1; +} +__setup("novx", disable_vector_extension); +#endif + void data_exception(struct pt_regs *regs) { __u16 __user *location; @@ -368,6 +438,18 @@ void data_exception(struct pt_regs *regs) } } #endif +#ifdef CONFIG_64BIT + /* Check for vector register enablement */ + if (MACHINE_HAS_VX && !current->thread.vxrs && + (current->thread.fp_regs.fpc & FPC_DXC_MASK) == 0xfe00) { + alloc_vector_registers(current); + /* Vector data exception is suppressing, rewind psw. */ + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); + clear_pt_regs_flag(regs, PIF_PER_TRAP); + return; + } +#endif + if (current->thread.fp_regs.fpc & FPC_DXC_MASK) signal = SIGFPE; else diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ef6103bf1f9b..ea9bf2561b9e 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -391,6 +391,8 @@ typedef struct elf64_shdr { #define NT_S390_LAST_BREAK 0x306 /* s390 breaking event address */ #define NT_S390_SYSTEM_CALL 0x307 /* s390 system call restart data */ #define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ +#define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ +#define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ #define NT_ARM_TLS 0x401 /* ARM TLS register */ #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ -- cgit v1.2.3 From 66be7d2bcd826344894be09dc385f9f805136b84 Mon Sep 17 00:00:00 2001 From: Henning Rogge Date: Fri, 12 Sep 2014 08:58:49 +0200 Subject: cfg80211: add ops to query mesh proxy path table Add two new cfg80211 operations for querying a table with proxied mesh paths. Signed-off-by: Henning Rogge Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 ++++ include/uapi/linux/nl80211.h | 6 +++ net/wireless/nl80211.c | 99 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 27 +++++++++++- net/wireless/trace.h | 45 ++++++++++++++++++++ 5 files changed, 183 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a2ddcf2398fd..3f3aaa06adb5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2146,6 +2146,8 @@ struct cfg80211_qos_map { * @change_mpath: change a given mesh path * @get_mpath: get a mesh path for the given parameters * @dump_mpath: dump mesh path callback -- resume dump at index @idx + * @get_mpp: get a mesh proxy path for the given parameters + * @dump_mpp: dump mesh proxy path callback -- resume dump at index @idx * @join_mesh: join the mesh network with the specified parameters * (invoked with the wireless_dev mutex held) * @leave_mesh: leave the current mesh network @@ -2396,6 +2398,11 @@ struct cfg80211_ops { int (*dump_mpath)(struct wiphy *wiphy, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo); + int (*get_mpp)(struct wiphy *wiphy, struct net_device *dev, + u8 *dst, u8 *mpp, struct mpath_info *pinfo); + int (*dump_mpp)(struct wiphy *wiphy, struct net_device *dev, + int idx, u8 *dst, u8 *mpp, + struct mpath_info *pinfo); int (*get_mesh_config)(struct wiphy *wiphy, struct net_device *dev, struct mesh_config *conf); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4b28dc07bcb1..846071b0cde9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -738,6 +738,10 @@ * before removing a station entry entirely, or before disassociating * or similar, cleanup will happen in the driver/device in this case. * + * @NL80211_CMD_GET_MPP: Get mesh path attributes for mesh proxy path to + * destination %NL80211_ATTR_MAC on the interface identified by + * %NL80211_ATTR_IFINDEX. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -912,6 +916,8 @@ enum nl80211_commands { NL80211_CMD_ADD_TX_TS, NL80211_CMD_DEL_TX_TS, + NL80211_CMD_GET_MPP, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cb9f5a44ffad..d527aa0706c1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4624,6 +4624,96 @@ static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) return rdev_del_mpath(rdev, dev, dst); } +static int nl80211_get_mpp(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + int err; + struct net_device *dev = info->user_ptr[1]; + struct mpath_info pinfo; + struct sk_buff *msg; + u8 *dst = NULL; + u8 mpp[ETH_ALEN]; + + memset(&pinfo, 0, sizeof(pinfo)); + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + dst = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (!rdev->ops->get_mpp) + return -EOPNOTSUPP; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; + + err = rdev_get_mpp(rdev, dev, dst, mpp, &pinfo); + if (err) + return err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + if (nl80211_send_mpath(msg, info->snd_portid, info->snd_seq, 0, + dev, dst, mpp, &pinfo) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } + + return genlmsg_reply(msg, info); +} + +static int nl80211_dump_mpp(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct mpath_info pinfo; + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + u8 dst[ETH_ALEN]; + u8 mpp[ETH_ALEN]; + int path_idx = cb->args[2]; + int err; + + err = nl80211_prepare_wdev_dump(skb, cb, &rdev, &wdev); + if (err) + return err; + + if (!rdev->ops->dump_mpp) { + err = -EOPNOTSUPP; + goto out_err; + } + + if (wdev->iftype != NL80211_IFTYPE_MESH_POINT) { + err = -EOPNOTSUPP; + goto out_err; + } + + while (1) { + err = rdev_dump_mpp(rdev, wdev->netdev, path_idx, dst, + mpp, &pinfo); + if (err == -ENOENT) + break; + if (err) + goto out_err; + + if (nl80211_send_mpath(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + wdev->netdev, dst, mpp, + &pinfo) < 0) + goto out; + + path_idx++; + } + + out: + cb->args[2] = path_idx; + err = skb->len; + out_err: + nl80211_finish_wdev_dump(rdev); + return err; +} + static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -9773,6 +9863,15 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_GET_MPP, + .doit = nl80211_get_mpp, + .dumpit = nl80211_dump_mpp, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, { .cmd = NL80211_CMD_SET_MPATH, .doit = nl80211_set_mpath, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index f6d457d6a558..c09e697bcb15 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -263,6 +263,18 @@ static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev, } +static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev, + struct net_device *dev, u8 *dst, u8 *mpp, + struct mpath_info *pinfo) +{ + int ret; + + trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp); + ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo); + trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); + return ret; +} + static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) @@ -271,7 +283,20 @@ static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, int ret; trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop); ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop, - pinfo); + pinfo); + trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); + return ret; +} + +static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev, + struct net_device *dev, int idx, u8 *dst, + u8 *mpp, struct mpath_info *pinfo) + +{ + int ret; + + trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp); + ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 625a6e6d1168..8e4f8f04332d 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -801,6 +801,51 @@ TRACE_EVENT(rdev_dump_mpath, MAC_PR_ARG(next_hop)) ); +TRACE_EVENT(rdev_get_mpp, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + u8 *dst, u8 *mpp), + TP_ARGS(wiphy, netdev, dst, mpp), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dst) + MAC_ENTRY(mpp) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dst, dst); + MAC_ASSIGN(mpp, mpp); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", destination: " MAC_PR_FMT + ", mpp: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, + MAC_PR_ARG(dst), MAC_PR_ARG(mpp)) +); + +TRACE_EVENT(rdev_dump_mpp, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, int idx, + u8 *dst, u8 *mpp), + TP_ARGS(wiphy, netdev, idx, mpp, dst), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(dst) + MAC_ENTRY(mpp) + __field(int, idx) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(dst, dst); + MAC_ASSIGN(mpp, mpp); + __entry->idx = idx; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", index: %d, destination: " + MAC_PR_FMT ", mpp: " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->idx, MAC_PR_ARG(dst), + MAC_PR_ARG(mpp)) +); + TRACE_EVENT(rdev_return_int_mpath_info, TP_PROTO(struct wiphy *wiphy, int ret, struct mpath_info *pinfo), TP_ARGS(wiphy, ret, pinfo), -- cgit v1.2.3 From f606b77f1a9e362451aca8f81d8f36a3a112139e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:37 -0700 Subject: prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation During development of c/r we've noticed that in case if we need to support user namespaces we face a problem with capabilities in prctl(PR_SET_MM, ...) call, in particular once new user namespace is created capable(CAP_SYS_RESOURCE) no longer passes. A approach is to eliminate CAP_SYS_RESOURCE check but pass all new values in one bundle, which would allow the kernel to make more intensive test for sanity of values and same time allow us to support checkpoint/restore of user namespaces. Thus a new command PR_SET_MM_MAP introduced. It takes a pointer of prctl_mm_map structure which carries all the members to be updated. prctl(PR_SET_MM, PR_SET_MM_MAP, struct prctl_mm_map *, size) struct prctl_mm_map { __u64 start_code; __u64 end_code; __u64 start_data; __u64 end_data; __u64 start_brk; __u64 brk; __u64 start_stack; __u64 arg_start; __u64 arg_end; __u64 env_start; __u64 env_end; __u64 *auxv; __u32 auxv_size; __u32 exe_fd; }; All members except @exe_fd correspond ones of struct mm_struct. To figure out which available values these members may take here are meanings of the members. - start_code, end_code: represent bounds of executable code area - start_data, end_data: represent bounds of data area - start_brk, brk: used to calculate bounds for brk() syscall - start_stack: used when accounting space needed for command line arguments, environment and shmat() syscall - arg_start, arg_end, env_start, env_end: represent memory area supplied for command line arguments and environment variables - auxv, auxv_size: carries auxiliary vector, Elf format specifics - exe_fd: file descriptor number for executable link (/proc/self/exe) Thus we apply the following requirements to the values 1) Any member except @auxv, @auxv_size, @exe_fd is rather an address in user space thus it must be laying inside [mmap_min_addr, mmap_max_addr) interval. 2) While @[start|end]_code and @[start|end]_data may point to an nonexisting VMAs (say a program maps own new .text and .data segments during execution) the rest of members should belong to VMA which must exist. 3) Addresses must be ordered, ie @start_ member must not be greater or equal to appropriate @end_ member. 4) As in regular Elf loading procedure we require that @start_brk and @brk be greater than @end_data. 5) If RLIMIT_DATA rlimit is set to non-infinity new values should not exceed existing limit. Same applies to RLIMIT_STACK. 6) Auxiliary vector size must not exceed existing one (which is predefined as AT_VECTOR_SIZE and depends on architecture). 7) File descriptor passed in @exe_file should be pointing to executable file (because we use existing prctl_set_mm_exe_file_locked helper it ensures that the file we are going to use as exe link has all required permission granted). Now about where these members are involved inside kernel code: - @start_code and @end_code are used in /proc/$pid/[stat|statm] output; - @start_data and @end_data are used in /proc/$pid/[stat|statm] output, also they are considered if there enough space for brk() syscall result if RLIMIT_DATA is set; - @start_brk shown in /proc/$pid/stat output and accounted in brk() syscall if RLIMIT_DATA is set; also this member is tested to find a symbolic name of mmap event for perf system (we choose if event is generated for "heap" area); one more aplication is selinux -- we test if a process has PROCESS__EXECHEAP permission if trying to make heap area being executable with mprotect() syscall; - @brk is a current value for brk() syscall which lays inside heap area, it's shown in /proc/$pid/stat. When syscall brk() succesfully provides new memory area to a user space upon brk() completion the mm::brk is updated to carry new value; Both @start_brk and @brk are actively used in /proc/$pid/maps and /proc/$pid/smaps output to find a symbolic name "heap" for VMA being scanned; - @start_stack is printed out in /proc/$pid/stat and used to find a symbolic name "stack" for task and threads in /proc/$pid/maps and /proc/$pid/smaps output, and as the same as with @start_brk -- perf system uses it for event naming. Also kernel treat this member as a start address of where to map vDSO pages and to check if there is enough space for shmat() syscall; - @arg_start, @arg_end, @env_start and @env_end are printed out in /proc/$pid/stat. Another access to the data these members represent is to read /proc/$pid/environ or /proc/$pid/cmdline. Any attempt to read these areas kernel tests with access_process_vm helper so a user must have enough rights for this action; - @auxv and @auxv_size may be read from /proc/$pid/auxv. Strictly speaking kernel doesn't care much about which exactly data is sitting there because it is solely for userspace; - @exe_fd is referred from /proc/$pid/exe and when generating coredump. We uses prctl_set_mm_exe_file_locked helper to update this member, so exe-file link modification remains one-shot action. Still note that updating exe-file link now doesn't require sys-resource capability anymore, after all there is no much profit in preventing setup own file link (there are a number of ways to execute own code -- ptrace, ld-preload, so that the only reliable way to find which exactly code is executed is to inspect running program memory). Still we require the caller to be at least user-namespace root user. I believe the old interface should be deprecated and ripped off in a couple of kernel releases if no one against. To test if new interface is implemented in the kernel one can pass PR_SET_MM_MAP_SIZE opcode and the kernel returns the size of currently supported struct prctl_mm_map. [akpm@linux-foundation.org: fix 80-col wordwrap in macro definitions] Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Acked-by: Andrew Vagin Tested-by: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/prctl.h | 27 +++++++ kernel/sys.c | 190 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 216 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 58afc04c107e..513df75d0fc9 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PRCTL_H #define _LINUX_PRCTL_H +#include + /* Values to pass as first argument to prctl() */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ @@ -119,6 +121,31 @@ # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 # define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; /* * Set specific pid that is allowed to ptrace the current task. diff --git a/kernel/sys.c b/kernel/sys.c index 14222a1699c0..f7030b060018 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1687,6 +1687,187 @@ exit: return err; } +#ifdef CONFIG_CHECKPOINT_RESTORE +/* + * WARNING: we don't require any capability here so be very careful + * in what is allowed for modification from userspace. + */ +static int validate_prctl_map(struct prctl_mm_map *prctl_map) +{ + unsigned long mmap_max_addr = TASK_SIZE; + struct mm_struct *mm = current->mm; + int error = -EINVAL, i; + + static const unsigned char offsets[] = { + offsetof(struct prctl_mm_map, start_code), + offsetof(struct prctl_mm_map, end_code), + offsetof(struct prctl_mm_map, start_data), + offsetof(struct prctl_mm_map, end_data), + offsetof(struct prctl_mm_map, start_brk), + offsetof(struct prctl_mm_map, brk), + offsetof(struct prctl_mm_map, start_stack), + offsetof(struct prctl_mm_map, arg_start), + offsetof(struct prctl_mm_map, arg_end), + offsetof(struct prctl_mm_map, env_start), + offsetof(struct prctl_mm_map, env_end), + }; + + /* + * Make sure the members are not somewhere outside + * of allowed address space. + */ + for (i = 0; i < ARRAY_SIZE(offsets); i++) { + u64 val = *(u64 *)((char *)prctl_map + offsets[i]); + + if ((unsigned long)val >= mmap_max_addr || + (unsigned long)val < mmap_min_addr) + goto out; + } + + /* + * Make sure the pairs are ordered. + */ +#define __prctl_check_order(__m1, __op, __m2) \ + ((unsigned long)prctl_map->__m1 __op \ + (unsigned long)prctl_map->__m2) ? 0 : -EINVAL + error = __prctl_check_order(start_code, <, end_code); + error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_brk, <=, brk); + error |= __prctl_check_order(arg_start, <=, arg_end); + error |= __prctl_check_order(env_start, <=, env_end); + if (error) + goto out; +#undef __prctl_check_order + + error = -EINVAL; + + /* + * @brk should be after @end_data in traditional maps. + */ + if (prctl_map->start_brk <= prctl_map->end_data || + prctl_map->brk <= prctl_map->end_data) + goto out; + + /* + * Neither we should allow to override limits if they set. + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, + prctl_map->start_brk, prctl_map->end_data, + prctl_map->start_data)) + goto out; + + /* + * Someone is trying to cheat the auxv vector. + */ + if (prctl_map->auxv_size) { + if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) + goto out; + } + + /* + * Finally, make sure the caller has the rights to + * change /proc/pid/exe link: only local root should + * be allowed to. + */ + if (prctl_map->exe_fd != (u32)-1) { + struct user_namespace *ns = current_user_ns(); + const struct cred *cred = current_cred(); + + if (!uid_eq(cred->uid, make_kuid(ns, 0)) || + !gid_eq(cred->gid, make_kgid(ns, 0))) + goto out; + } + + error = 0; +out: + return error; +} + +static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) +{ + struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; + unsigned long user_auxv[AT_VECTOR_SIZE]; + struct mm_struct *mm = current->mm; + int error; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); + + if (opt == PR_SET_MM_MAP_SIZE) + return put_user((unsigned int)sizeof(prctl_map), + (unsigned int __user *)addr); + + if (data_size != sizeof(prctl_map)) + return -EINVAL; + + if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) + return -EFAULT; + + error = validate_prctl_map(&prctl_map); + if (error) + return error; + + if (prctl_map.auxv_size) { + memset(user_auxv, 0, sizeof(user_auxv)); + if (copy_from_user(user_auxv, + (const void __user *)prctl_map.auxv, + prctl_map.auxv_size)) + return -EFAULT; + + /* Last entry must be AT_NULL as specification requires */ + user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; + user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; + } + + down_write(&mm->mmap_sem); + if (prctl_map.exe_fd != (u32)-1) + error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); + downgrade_write(&mm->mmap_sem); + if (error) + goto out; + + /* + * We don't validate if these members are pointing to + * real present VMAs because application may have correspond + * VMAs already unmapped and kernel uses these members for statistics + * output in procfs mostly, except + * + * - @start_brk/@brk which are used in do_brk but kernel lookups + * for VMAs when updating these memvers so anything wrong written + * here cause kernel to swear at userspace program but won't lead + * to any problem in kernel itself + */ + + mm->start_code = prctl_map.start_code; + mm->end_code = prctl_map.end_code; + mm->start_data = prctl_map.start_data; + mm->end_data = prctl_map.end_data; + mm->start_brk = prctl_map.start_brk; + mm->brk = prctl_map.brk; + mm->start_stack = prctl_map.start_stack; + mm->arg_start = prctl_map.arg_start; + mm->arg_end = prctl_map.arg_end; + mm->env_start = prctl_map.env_start; + mm->env_end = prctl_map.env_end; + + /* + * Note this update of @saved_auxv is lockless thus + * if someone reads this member in procfs while we're + * updating -- it may get partly updated results. It's + * known and acceptable trade off: we leave it as is to + * not introduce additional locks here making the kernel + * more complex. + */ + if (prctl_map.auxv_size) + memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); + + error = 0; +out: + up_read(&mm->mmap_sem); + return error; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { @@ -1694,9 +1875,16 @@ static int prctl_set_mm(int opt, unsigned long addr, struct vm_area_struct *vma; int error; - if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) + if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && + opt != PR_SET_MM_MAP && + opt != PR_SET_MM_MAP_SIZE))) return -EINVAL; +#ifdef CONFIG_CHECKPOINT_RESTORE + if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) + return prctl_set_mm_map(opt, (const void __user *)addr, arg4); +#endif + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; -- cgit v1.2.3 From 09316c09dde33aae14f34489d9e3d243ec0d5938 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 9 Oct 2014 15:29:32 -0700 Subject: mm/balloon_compaction: add vmstat counters and kpageflags bit Always mark pages with PageBalloon even if balloon compaction is disabled and expose this mark in /proc/kpageflags as KPF_BALLOON. Also this patch adds three counters into /proc/vmstat: "balloon_inflate", "balloon_deflate" and "balloon_migrate". They accumulate balloon activity. Current size of balloon is (balloon_inflate - balloon_deflate) pages. All generic balloon code now gathered under option CONFIG_MEMORY_BALLOON. It should be selected by ballooning driver which wants use this feature. Currently virtio-balloon is the only user. Signed-off-by: Konstantin Khlebnikov Cc: Rafael Aquini Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/virtio/Kconfig | 1 + drivers/virtio/virtio_balloon.c | 1 + fs/proc/page.c | 3 +++ include/linux/balloon_compaction.h | 2 ++ include/linux/vm_event_item.h | 7 +++++++ include/uapi/linux/kernel-page-flags.h | 1 + mm/Kconfig | 7 ++++++- mm/Makefile | 3 ++- mm/balloon_compaction.c | 2 ++ mm/vmstat.c | 12 +++++++++++- tools/vm/page-types.c | 1 + 11 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index c6683f2e396c..00b228638274 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -25,6 +25,7 @@ config VIRTIO_PCI config VIRTIO_BALLOON tristate "Virtio balloon driver" depends on VIRTIO + select MEMORY_BALLOON ---help--- This driver supports increasing and decreasing the amount of memory within a KVM guest. diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 2bad7f9dd2ac..f893148a107b 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -396,6 +396,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, spin_lock_irqsave(&vb_dev_info->pages_lock, flags); balloon_page_insert(vb_dev_info, newpage); vb_dev_info->isolated_pages--; + __count_vm_event(BALLOON_MIGRATE); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb->pfns, newpage); diff --git a/fs/proc/page.c b/fs/proc/page.c index e647c55275d9..1e3187da1fed 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page) if (PageBuddy(page)) u |= 1 << KPF_BUDDY; + if (PageBalloon(page)) + u |= 1 << KPF_BALLOON; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index bc3d2985cc9a..9b0a15d06a4f 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -166,11 +166,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { + __SetPageBalloon(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { + __ClearPageBalloon(page); list_del(&page->lru); } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ced92345c963..730334cdf037 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -72,6 +72,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_MEMORY_BALLOON + BALLOON_INFLATE, + BALLOON_DEFLATE, +#ifdef CONFIG_BALLOON_COMPACTION + BALLOON_MIGRATE, +#endif +#endif #ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 5116a0e48172..2f96d233c980 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -31,6 +31,7 @@ #define KPF_KSM 21 #define KPF_THP 22 +#define KPF_BALLOON 23 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 0ceb8a567dab..1d1ae6b078fd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -230,12 +230,17 @@ config SPLIT_PTLOCK_CPUS config ARCH_ENABLE_SPLIT_PMD_PTLOCK boolean +# +# support for memory balloon +config MEMORY_BALLOON + boolean + # # support for memory balloon compaction config BALLOON_COMPACTION bool "Allow for balloon memory compaction/migration" def_bool y - depends on COMPACTION && VIRTIO_BALLOON + depends on COMPACTION && MEMORY_BALLOON help Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be diff --git a/mm/Makefile b/mm/Makefile index f8ed7ab417b1..1f534a7f0a71 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o vmacache.o \ + compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ iov_iter.o debug.o $(mmu-y) @@ -67,3 +67,4 @@ obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o +obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 3afdabdbc0a4..b3cbe19f71b5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -36,6 +36,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) BUG_ON(!trylock_page(page)); spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_insert(b_dev_info, page); + __count_vm_event(BALLOON_INFLATE); spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); return page; @@ -74,6 +75,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) } spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_delete(page); + __count_vm_event(BALLOON_DEFLATE); spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); dequeued_page = true; diff --git a/mm/vmstat.c b/mm/vmstat.c index e9ab104b956f..cce7c766da7a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -735,7 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* Zoned VM counters */ + /* enum zone_stat_item countes */ "nr_free_pages", "nr_alloc_batch", "nr_inactive_anon", @@ -778,10 +778,13 @@ const char * const vmstat_text[] = { "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", + + /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS + /* enum vm_event_item counters */ "pgpgin", "pgpgout", "pswpin", @@ -860,6 +863,13 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_MEMORY_BALLOON + "balloon_inflate", + "balloon_deflate", +#ifdef CONFIG_BALLOON_COMPACTION + "balloon_migrate", +#endif +#endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index c4d6d2e20e0d..264fbc297e0b 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -132,6 +132,7 @@ static const char * const page_flag_names[] = { [KPF_NOPAGE] = "n:nopage", [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", + [KPF_BALLOON] = "o:balloon", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", -- cgit v1.2.3 From 2cbbca5e7c38d0c776497f586688464f8cfb1583 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Sep 2014 16:02:19 +1000 Subject: md: discard PRINT_RAID_DEBUG ioctl All the interesting information printed by this ioctl is provided in /proc/mdstat and/or sysfs. So it isn't needed and isn't used and would be best if it didn't exist. Signed-off-by: NeilBrown --- drivers/md/md.c | 138 ----------------------------------------- include/uapi/linux/raid/md_u.h | 1 - 2 files changed, 139 deletions(-) (limited to 'include/uapi') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9c33f052c848..fdd2d3fe1c6c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -66,8 +66,6 @@ static void autostart_arrays(int part); static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); -static void md_print_devices(void); - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; @@ -2171,136 +2169,6 @@ static void export_array(struct mddev *mddev) mddev->major_version = 0; } -static void print_desc(mdp_disk_t *desc) -{ - printk(" DISK\n", desc->number, - desc->major,desc->minor,desc->raid_disk,desc->state); -} - -static void print_sb_90(mdp_super_t *sb) -{ - int i; - - printk(KERN_INFO - "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", - sb->major_version, sb->minor_version, sb->patch_version, - sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, - sb->ctime); - printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", - sb->level, sb->size, sb->nr_disks, sb->raid_disks, - sb->md_minor, sb->layout, sb->chunk_size); - printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" - " FD:%d SD:%d CSUM:%08x E:%08lx\n", - sb->utime, sb->state, sb->active_disks, sb->working_disks, - sb->failed_disks, sb->spare_disks, - sb->sb_csum, (unsigned long)sb->events_lo); - - printk(KERN_INFO); - for (i = 0; i < MD_SB_DISKS; i++) { - mdp_disk_t *desc; - - desc = sb->disks + i; - if (desc->number || desc->major || desc->minor || - desc->raid_disk || (desc->state && (desc->state != 4))) { - printk(" D %2d: ", i); - print_desc(desc); - } - } - printk(KERN_INFO "md: THIS: "); - print_desc(&sb->this_disk); -} - -static void print_sb_1(struct mdp_superblock_1 *sb) -{ - __u8 *uuid; - - uuid = sb->set_uuid; - printk(KERN_INFO - "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" - "md: Name: \"%s\" CT:%llu\n", - le32_to_cpu(sb->major_version), - le32_to_cpu(sb->feature_map), - uuid, - sb->set_name, - (unsigned long long)le64_to_cpu(sb->ctime) - & MD_SUPERBLOCK_1_TIME_SEC_MASK); - - uuid = sb->device_uuid; - printk(KERN_INFO - "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" - " RO:%llu\n" - "md: Dev:%08x UUID: %pU\n" - "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" - "md: (MaxDev:%u) \n", - le32_to_cpu(sb->level), - (unsigned long long)le64_to_cpu(sb->size), - le32_to_cpu(sb->raid_disks), - le32_to_cpu(sb->layout), - le32_to_cpu(sb->chunksize), - (unsigned long long)le64_to_cpu(sb->data_offset), - (unsigned long long)le64_to_cpu(sb->data_size), - (unsigned long long)le64_to_cpu(sb->super_offset), - (unsigned long long)le64_to_cpu(sb->recovery_offset), - le32_to_cpu(sb->dev_number), - uuid, - sb->devflags, - (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, - (unsigned long long)le64_to_cpu(sb->events), - (unsigned long long)le64_to_cpu(sb->resync_offset), - le32_to_cpu(sb->sb_csum), - le32_to_cpu(sb->max_dev) - ); -} - -static void print_rdev(struct md_rdev *rdev, int major_version) -{ - char b[BDEVNAME_SIZE]; - printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", - bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, - test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), - rdev->desc_nr); - if (rdev->sb_loaded) { - printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); - switch (major_version) { - case 0: - print_sb_90(page_address(rdev->sb_page)); - break; - case 1: - print_sb_1(page_address(rdev->sb_page)); - break; - } - } else - printk(KERN_INFO "md: no rdev superblock!\n"); -} - -static void md_print_devices(void) -{ - struct list_head *tmp; - struct md_rdev *rdev; - struct mddev *mddev; - char b[BDEVNAME_SIZE]; - - printk("\n"); - printk("md: **********************************\n"); - printk("md: * *\n"); - printk("md: **********************************\n"); - for_each_mddev(mddev, tmp) { - - if (mddev->bitmap) - bitmap_print_sb(mddev->bitmap); - else - printk("%s: ", mdname(mddev)); - rdev_for_each(rdev, mddev) - printk("<%s>", bdevname(rdev->bdev,b)); - printk("\n"); - - rdev_for_each(rdev, mddev) - print_rdev(rdev, mddev->major_version); - } - printk("md: **********************************\n"); - printk("\n"); -} - static void sync_sbs(struct mddev *mddev, int nospares) { /* Update each superblock (in-memory image), but @@ -6285,7 +6153,6 @@ static inline bool md_ioctl_valid(unsigned int cmd) case GET_DISK_INFO: case HOT_ADD_DISK: case HOT_REMOVE_DISK: - case PRINT_RAID_DEBUG: case RAID_AUTORUN: case RAID_VERSION: case RESTART_ARRAY_RW: @@ -6331,11 +6198,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = get_version(argp); goto out; - case PRINT_RAID_DEBUG: - err = 0; - md_print_devices(); - goto out; - #ifndef MODULE case RAID_AUTORUN: err = 0; diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 4133e744e4e6..74e7c60c4716 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -39,7 +39,6 @@ #define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) #define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) #define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) -#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) #define RAID_AUTORUN _IO (MD_MAJOR, 0x14) #define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t) -- cgit v1.2.3 From c15952dc18d8a293d976ac6c06d44d9d98023b45 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 14 Oct 2014 02:08:54 -0700 Subject: net: filter: move common defines into bpf_common.h userspace programs that use eBPF instruction macros need to include two files: uapi/linux/filter.h and uapi/linux/bpf.h Move common macro definitions that are shared between classic BPF and eBPF into uapi/linux/bpf_common.h, so that user app can include only one bpf.h file Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/bpf.h | 1 + include/uapi/linux/bpf_common.h | 55 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/filter.h | 56 +---------------------------------------- 4 files changed, 58 insertions(+), 55 deletions(-) create mode 100644 include/uapi/linux/bpf_common.h (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 70e150ebc6c9..1115d68b150f 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -68,6 +68,7 @@ header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h header-y += bpf.h +header-y += bpf_common.h header-y += bpqether.h header-y += bsg.h header-y += btrfs.h diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 31b0ac208a52..d18316f9e9c4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -8,6 +8,7 @@ #define _UAPI__LINUX_BPF_H__ #include +#include /* Extended instruction set based on top of classic BPF */ diff --git a/include/uapi/linux/bpf_common.h b/include/uapi/linux/bpf_common.h new file mode 100644 index 000000000000..a5c220e0828f --- /dev/null +++ b/include/uapi/linux/bpf_common.h @@ -0,0 +1,55 @@ +#ifndef _UAPI__LINUX_BPF_COMMON_H__ +#define _UAPI__LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 +#define BPF_H 0x08 +#define BPF_B 0x10 +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* _UAPI__LINUX_BPF_COMMON_H__ */ diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 253b4d42cf2b..47785d5ecf17 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -7,7 +7,7 @@ #include #include - +#include /* * Current version of the filter code architecture. @@ -32,56 +32,6 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ struct sock_filter __user *filter; }; -/* - * Instruction classes - */ - -#define BPF_CLASS(code) ((code) & 0x07) -#define BPF_LD 0x00 -#define BPF_LDX 0x01 -#define BPF_ST 0x02 -#define BPF_STX 0x03 -#define BPF_ALU 0x04 -#define BPF_JMP 0x05 -#define BPF_RET 0x06 -#define BPF_MISC 0x07 - -/* ld/ldx fields */ -#define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 -#define BPF_H 0x08 -#define BPF_B 0x10 -#define BPF_MODE(code) ((code) & 0xe0) -#define BPF_IMM 0x00 -#define BPF_ABS 0x20 -#define BPF_IND 0x40 -#define BPF_MEM 0x60 -#define BPF_LEN 0x80 -#define BPF_MSH 0xa0 - -/* alu/jmp fields */ -#define BPF_OP(code) ((code) & 0xf0) -#define BPF_ADD 0x00 -#define BPF_SUB 0x10 -#define BPF_MUL 0x20 -#define BPF_DIV 0x30 -#define BPF_OR 0x40 -#define BPF_AND 0x50 -#define BPF_LSH 0x60 -#define BPF_RSH 0x70 -#define BPF_NEG 0x80 -#define BPF_MOD 0x90 -#define BPF_XOR 0xa0 - -#define BPF_JA 0x00 -#define BPF_JEQ 0x10 -#define BPF_JGT 0x20 -#define BPF_JGE 0x30 -#define BPF_JSET 0x40 -#define BPF_SRC(code) ((code) & 0x08) -#define BPF_K 0x00 -#define BPF_X 0x08 - /* ret - BPF_K and BPF_X also apply */ #define BPF_RVAL(code) ((code) & 0x18) #define BPF_A 0x10 @@ -91,10 +41,6 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ #define BPF_TAX 0x00 #define BPF_TXA 0x80 -#ifndef BPF_MAXINSNS -#define BPF_MAXINSNS 4096 -#endif - /* * Macros for filter block array initializers. */ -- cgit v1.2.3 From 777783e0abae3cab7555bb182776f9ffaa35631a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 16 Oct 2014 14:40:38 +0200 Subject: staging: android: binder: move to the "real" part of the kernel The Android binder code has been "stable" for many years now. No matter what comes in the future, we are going to have to support this API, so might as well move it to the "real" part of the kernel as there's no real work that needs to be done to the existing code. Signed-off-by: Greg Kroah-Hartman --- drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/android/Kconfig | 37 + drivers/android/Makefile | 3 + drivers/android/binder.c | 3673 ++++++++++++++++++++++++++++++++ drivers/android/binder.h | 30 + drivers/android/binder_trace.h | 329 +++ drivers/staging/android/Kconfig | 30 - drivers/staging/android/Makefile | 1 - drivers/staging/android/binder.c | 3673 -------------------------------- drivers/staging/android/binder.h | 30 - drivers/staging/android/binder_trace.h | 329 --- drivers/staging/android/uapi/binder.h | 351 --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/android/Kbuild | 2 + include/uapi/linux/android/binder.h | 351 +++ 16 files changed, 4429 insertions(+), 4414 deletions(-) create mode 100644 drivers/android/Kconfig create mode 100644 drivers/android/Makefile create mode 100644 drivers/android/binder.c create mode 100644 drivers/android/binder.h create mode 100644 drivers/android/binder_trace.h delete mode 100644 drivers/staging/android/binder.c delete mode 100644 drivers/staging/android/binder.h delete mode 100644 drivers/staging/android/binder_trace.h delete mode 100644 drivers/staging/android/uapi/binder.h create mode 100644 include/uapi/linux/android/Kbuild create mode 100644 include/uapi/linux/android/binder.h (limited to 'include/uapi') diff --git a/drivers/Kconfig b/drivers/Kconfig index 1a693d3f9d51..569ff7886dc3 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -182,4 +182,6 @@ source "drivers/ras/Kconfig" source "drivers/thunderbolt/Kconfig" +source "drivers/android/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index ebee55537a05..60d19820a4d4 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -161,3 +161,4 @@ obj-$(CONFIG_POWERCAP) += powercap/ obj-$(CONFIG_MCB) += mcb/ obj-$(CONFIG_RAS) += ras/ obj-$(CONFIG_THUNDERBOLT) += thunderbolt/ +obj-$(CONFIG_ANDROID) += android/ diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig new file mode 100644 index 000000000000..bdfc6c6f4f5a --- /dev/null +++ b/drivers/android/Kconfig @@ -0,0 +1,37 @@ +menu "Android" + +config ANDROID + bool "Android Drivers" + ---help--- + Enable support for various drivers needed on the Android platform + +if ANDROID + +config ANDROID_BINDER_IPC + bool "Android Binder IPC Driver" + depends on MMU + default n + ---help--- + Binder is used in Android for both communication between processes, + and remote method invocation. + + This means one Android process can call a method/routine in another + Android process, using Binder to identify, invoke and pass arguments + between said processes. + +config ANDROID_BINDER_IPC_32BIT + bool + depends on !64BIT && ANDROID_BINDER_IPC + default y + ---help--- + The Binder API has been changed to support both 32 and 64bit + applications in a mixed environment. + + Enable this to support an old 32-bit Android user-space (v4.4 and + earlier). + + Note that enabling this will break newer Android user-space. + +endif # if ANDROID + +endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile new file mode 100644 index 000000000000..3b7e4b072c58 --- /dev/null +++ b/drivers/android/Makefile @@ -0,0 +1,3 @@ +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o diff --git a/drivers/android/binder.c b/drivers/android/binder.c new file mode 100644 index 000000000000..c69c40d69d5c --- /dev/null +++ b/drivers/android/binder.c @@ -0,0 +1,3673 @@ +/* binder.c + * + * Android IPC Subsystem + * + * Copyright (C) 2007-2008 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "binder.h" +#include "binder_trace.h" + +static DEFINE_MUTEX(binder_main_lock); +static DEFINE_MUTEX(binder_deferred_lock); +static DEFINE_MUTEX(binder_mmap_lock); + +static HLIST_HEAD(binder_procs); +static HLIST_HEAD(binder_deferred_list); +static HLIST_HEAD(binder_dead_nodes); + +static struct dentry *binder_debugfs_dir_entry_root; +static struct dentry *binder_debugfs_dir_entry_proc; +static struct binder_node *binder_context_mgr_node; +static kuid_t binder_context_mgr_uid = INVALID_UID; +static int binder_last_id; +static struct workqueue_struct *binder_deferred_workqueue; + +#define BINDER_DEBUG_ENTRY(name) \ +static int binder_##name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, binder_##name##_show, inode->i_private); \ +} \ +\ +static const struct file_operations binder_##name##_fops = { \ + .owner = THIS_MODULE, \ + .open = binder_##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +static int binder_proc_show(struct seq_file *m, void *unused); +BINDER_DEBUG_ENTRY(proc); + +/* This is only defined in include/asm-arm/sizes.h */ +#ifndef SZ_1K +#define SZ_1K 0x400 +#endif + +#ifndef SZ_4M +#define SZ_4M 0x400000 +#endif + +#define FORBIDDEN_MMAP_FLAGS (VM_WRITE) + +#define BINDER_SMALL_BUF_SIZE (PAGE_SIZE * 64) + +enum { + BINDER_DEBUG_USER_ERROR = 1U << 0, + BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, + BINDER_DEBUG_DEAD_TRANSACTION = 1U << 2, + BINDER_DEBUG_OPEN_CLOSE = 1U << 3, + BINDER_DEBUG_DEAD_BINDER = 1U << 4, + BINDER_DEBUG_DEATH_NOTIFICATION = 1U << 5, + BINDER_DEBUG_READ_WRITE = 1U << 6, + BINDER_DEBUG_USER_REFS = 1U << 7, + BINDER_DEBUG_THREADS = 1U << 8, + BINDER_DEBUG_TRANSACTION = 1U << 9, + BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, + BINDER_DEBUG_FREE_BUFFER = 1U << 11, + BINDER_DEBUG_INTERNAL_REFS = 1U << 12, + BINDER_DEBUG_BUFFER_ALLOC = 1U << 13, + BINDER_DEBUG_PRIORITY_CAP = 1U << 14, + BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15, +}; +static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | + BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; +module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); + +static bool binder_debug_no_lock; +module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); + +static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); +static int binder_stop_on_user_error; + +static int binder_set_stop_on_user_error(const char *val, + struct kernel_param *kp) +{ + int ret; + + ret = param_set_int(val, kp); + if (binder_stop_on_user_error < 2) + wake_up(&binder_user_error_wait); + return ret; +} +module_param_call(stop_on_user_error, binder_set_stop_on_user_error, + param_get_int, &binder_stop_on_user_error, S_IWUSR | S_IRUGO); + +#define binder_debug(mask, x...) \ + do { \ + if (binder_debug_mask & mask) \ + pr_info(x); \ + } while (0) + +#define binder_user_error(x...) \ + do { \ + if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \ + pr_info(x); \ + if (binder_stop_on_user_error) \ + binder_stop_on_user_error = 2; \ + } while (0) + +enum binder_stat_types { + BINDER_STAT_PROC, + BINDER_STAT_THREAD, + BINDER_STAT_NODE, + BINDER_STAT_REF, + BINDER_STAT_DEATH, + BINDER_STAT_TRANSACTION, + BINDER_STAT_TRANSACTION_COMPLETE, + BINDER_STAT_COUNT +}; + +struct binder_stats { + int br[_IOC_NR(BR_FAILED_REPLY) + 1]; + int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1]; + int obj_created[BINDER_STAT_COUNT]; + int obj_deleted[BINDER_STAT_COUNT]; +}; + +static struct binder_stats binder_stats; + +static inline void binder_stats_deleted(enum binder_stat_types type) +{ + binder_stats.obj_deleted[type]++; +} + +static inline void binder_stats_created(enum binder_stat_types type) +{ + binder_stats.obj_created[type]++; +} + +struct binder_transaction_log_entry { + int debug_id; + int call_type; + int from_proc; + int from_thread; + int target_handle; + int to_proc; + int to_thread; + int to_node; + int data_size; + int offsets_size; +}; +struct binder_transaction_log { + int next; + int full; + struct binder_transaction_log_entry entry[32]; +}; +static struct binder_transaction_log binder_transaction_log; +static struct binder_transaction_log binder_transaction_log_failed; + +static struct binder_transaction_log_entry *binder_transaction_log_add( + struct binder_transaction_log *log) +{ + struct binder_transaction_log_entry *e; + + e = &log->entry[log->next]; + memset(e, 0, sizeof(*e)); + log->next++; + if (log->next == ARRAY_SIZE(log->entry)) { + log->next = 0; + log->full = 1; + } + return e; +} + +struct binder_work { + struct list_head entry; + enum { + BINDER_WORK_TRANSACTION = 1, + BINDER_WORK_TRANSACTION_COMPLETE, + BINDER_WORK_NODE, + BINDER_WORK_DEAD_BINDER, + BINDER_WORK_DEAD_BINDER_AND_CLEAR, + BINDER_WORK_CLEAR_DEATH_NOTIFICATION, + } type; +}; + +struct binder_node { + int debug_id; + struct binder_work work; + union { + struct rb_node rb_node; + struct hlist_node dead_node; + }; + struct binder_proc *proc; + struct hlist_head refs; + int internal_strong_refs; + int local_weak_refs; + int local_strong_refs; + binder_uintptr_t ptr; + binder_uintptr_t cookie; + unsigned has_strong_ref:1; + unsigned pending_strong_ref:1; + unsigned has_weak_ref:1; + unsigned pending_weak_ref:1; + unsigned has_async_transaction:1; + unsigned accept_fds:1; + unsigned min_priority:8; + struct list_head async_todo; +}; + +struct binder_ref_death { + struct binder_work work; + binder_uintptr_t cookie; +}; + +struct binder_ref { + /* Lookups needed: */ + /* node + proc => ref (transaction) */ + /* desc + proc => ref (transaction, inc/dec ref) */ + /* node => refs + procs (proc exit) */ + int debug_id; + struct rb_node rb_node_desc; + struct rb_node rb_node_node; + struct hlist_node node_entry; + struct binder_proc *proc; + struct binder_node *node; + uint32_t desc; + int strong; + int weak; + struct binder_ref_death *death; +}; + +struct binder_buffer { + struct list_head entry; /* free and allocated entries by address */ + struct rb_node rb_node; /* free entry by size or allocated entry */ + /* by address */ + unsigned free:1; + unsigned allow_user_free:1; + unsigned async_transaction:1; + unsigned debug_id:29; + + struct binder_transaction *transaction; + + struct binder_node *target_node; + size_t data_size; + size_t offsets_size; + uint8_t data[0]; +}; + +enum binder_deferred_state { + BINDER_DEFERRED_PUT_FILES = 0x01, + BINDER_DEFERRED_FLUSH = 0x02, + BINDER_DEFERRED_RELEASE = 0x04, +}; + +struct binder_proc { + struct hlist_node proc_node; + struct rb_root threads; + struct rb_root nodes; + struct rb_root refs_by_desc; + struct rb_root refs_by_node; + int pid; + struct vm_area_struct *vma; + struct mm_struct *vma_vm_mm; + struct task_struct *tsk; + struct files_struct *files; + struct hlist_node deferred_work_node; + int deferred_work; + void *buffer; + ptrdiff_t user_buffer_offset; + + struct list_head buffers; + struct rb_root free_buffers; + struct rb_root allocated_buffers; + size_t free_async_space; + + struct page **pages; + size_t buffer_size; + uint32_t buffer_free; + struct list_head todo; + wait_queue_head_t wait; + struct binder_stats stats; + struct list_head delivered_death; + int max_threads; + int requested_threads; + int requested_threads_started; + int ready_threads; + long default_priority; + struct dentry *debugfs_entry; +}; + +enum { + BINDER_LOOPER_STATE_REGISTERED = 0x01, + BINDER_LOOPER_STATE_ENTERED = 0x02, + BINDER_LOOPER_STATE_EXITED = 0x04, + BINDER_LOOPER_STATE_INVALID = 0x08, + BINDER_LOOPER_STATE_WAITING = 0x10, + BINDER_LOOPER_STATE_NEED_RETURN = 0x20 +}; + +struct binder_thread { + struct binder_proc *proc; + struct rb_node rb_node; + int pid; + int looper; + struct binder_transaction *transaction_stack; + struct list_head todo; + uint32_t return_error; /* Write failed, return error code in read buf */ + uint32_t return_error2; /* Write failed, return error code in read */ + /* buffer. Used when sending a reply to a dead process that */ + /* we are also waiting on */ + wait_queue_head_t wait; + struct binder_stats stats; +}; + +struct binder_transaction { + int debug_id; + struct binder_work work; + struct binder_thread *from; + struct binder_transaction *from_parent; + struct binder_proc *to_proc; + struct binder_thread *to_thread; + struct binder_transaction *to_parent; + unsigned need_reply:1; + /* unsigned is_dead:1; */ /* not used at the moment */ + + struct binder_buffer *buffer; + unsigned int code; + unsigned int flags; + long priority; + long saved_priority; + kuid_t sender_euid; +}; + +static void +binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); + +static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) +{ + struct files_struct *files = proc->files; + unsigned long rlim_cur; + unsigned long irqs; + + if (files == NULL) + return -ESRCH; + + if (!lock_task_sighand(proc->tsk, &irqs)) + return -EMFILE; + + rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); + unlock_task_sighand(proc->tsk, &irqs); + + return __alloc_fd(files, 0, rlim_cur, flags); +} + +/* + * copied from fd_install + */ +static void task_fd_install( + struct binder_proc *proc, unsigned int fd, struct file *file) +{ + if (proc->files) + __fd_install(proc->files, fd, file); +} + +/* + * copied from sys_close + */ +static long task_close_fd(struct binder_proc *proc, unsigned int fd) +{ + int retval; + + if (proc->files == NULL) + return -ESRCH; + + retval = __close_fd(proc->files, fd); + /* can't restart close syscall because file table entry was cleared */ + if (unlikely(retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK)) + retval = -EINTR; + + return retval; +} + +static inline void binder_lock(const char *tag) +{ + trace_binder_lock(tag); + mutex_lock(&binder_main_lock); + trace_binder_locked(tag); +} + +static inline void binder_unlock(const char *tag) +{ + trace_binder_unlock(tag); + mutex_unlock(&binder_main_lock); +} + +static void binder_set_nice(long nice) +{ + long min_nice; + + if (can_nice(current, nice)) { + set_user_nice(current, nice); + return; + } + min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); + binder_debug(BINDER_DEBUG_PRIORITY_CAP, + "%d: nice value %ld not allowed use %ld instead\n", + current->pid, nice, min_nice); + set_user_nice(current, min_nice); + if (min_nice <= MAX_NICE) + return; + binder_user_error("%d RLIMIT_NICE not set\n", current->pid); +} + +static size_t binder_buffer_size(struct binder_proc *proc, + struct binder_buffer *buffer) +{ + if (list_is_last(&buffer->entry, &proc->buffers)) + return proc->buffer + proc->buffer_size - (void *)buffer->data; + return (size_t)list_entry(buffer->entry.next, + struct binder_buffer, entry) - (size_t)buffer->data; +} + +static void binder_insert_free_buffer(struct binder_proc *proc, + struct binder_buffer *new_buffer) +{ + struct rb_node **p = &proc->free_buffers.rb_node; + struct rb_node *parent = NULL; + struct binder_buffer *buffer; + size_t buffer_size; + size_t new_buffer_size; + + BUG_ON(!new_buffer->free); + + new_buffer_size = binder_buffer_size(proc, new_buffer); + + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: add free buffer, size %zd, at %p\n", + proc->pid, new_buffer_size, new_buffer); + + while (*p) { + parent = *p; + buffer = rb_entry(parent, struct binder_buffer, rb_node); + BUG_ON(!buffer->free); + + buffer_size = binder_buffer_size(proc, buffer); + + if (new_buffer_size < buffer_size) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&new_buffer->rb_node, parent, p); + rb_insert_color(&new_buffer->rb_node, &proc->free_buffers); +} + +static void binder_insert_allocated_buffer(struct binder_proc *proc, + struct binder_buffer *new_buffer) +{ + struct rb_node **p = &proc->allocated_buffers.rb_node; + struct rb_node *parent = NULL; + struct binder_buffer *buffer; + + BUG_ON(new_buffer->free); + + while (*p) { + parent = *p; + buffer = rb_entry(parent, struct binder_buffer, rb_node); + BUG_ON(buffer->free); + + if (new_buffer < buffer) + p = &parent->rb_left; + else if (new_buffer > buffer) + p = &parent->rb_right; + else + BUG(); + } + rb_link_node(&new_buffer->rb_node, parent, p); + rb_insert_color(&new_buffer->rb_node, &proc->allocated_buffers); +} + +static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, + uintptr_t user_ptr) +{ + struct rb_node *n = proc->allocated_buffers.rb_node; + struct binder_buffer *buffer; + struct binder_buffer *kern_ptr; + + kern_ptr = (struct binder_buffer *)(user_ptr - proc->user_buffer_offset + - offsetof(struct binder_buffer, data)); + + while (n) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + BUG_ON(buffer->free); + + if (kern_ptr < buffer) + n = n->rb_left; + else if (kern_ptr > buffer) + n = n->rb_right; + else + return buffer; + } + return NULL; +} + +static int binder_update_page_range(struct binder_proc *proc, int allocate, + void *start, void *end, + struct vm_area_struct *vma) +{ + void *page_addr; + unsigned long user_page_addr; + struct vm_struct tmp_area; + struct page **page; + struct mm_struct *mm; + + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: %s pages %p-%p\n", proc->pid, + allocate ? "allocate" : "free", start, end); + + if (end <= start) + return 0; + + trace_binder_update_page_range(proc, allocate, start, end); + + if (vma) + mm = NULL; + else + mm = get_task_mm(proc->tsk); + + if (mm) { + down_write(&mm->mmap_sem); + vma = proc->vma; + if (vma && mm != proc->vma_vm_mm) { + pr_err("%d: vma mm and task mm mismatch\n", + proc->pid); + vma = NULL; + } + } + + if (allocate == 0) + goto free_range; + + if (vma == NULL) { + pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", + proc->pid); + goto err_no_vma; + } + + for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { + int ret; + + page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; + + BUG_ON(*page); + *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + if (*page == NULL) { + pr_err("%d: binder_alloc_buf failed for page at %p\n", + proc->pid, page_addr); + goto err_alloc_page_failed; + } + tmp_area.addr = page_addr; + tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); + if (ret) { + pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", + proc->pid, page_addr); + goto err_map_kernel_failed; + } + user_page_addr = + (uintptr_t)page_addr + proc->user_buffer_offset; + ret = vm_insert_page(vma, user_page_addr, page[0]); + if (ret) { + pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", + proc->pid, user_page_addr); + goto err_vm_insert_page_failed; + } + /* vm_insert_page does not seem to increment the refcount */ + } + if (mm) { + up_write(&mm->mmap_sem); + mmput(mm); + } + return 0; + +free_range: + for (page_addr = end - PAGE_SIZE; page_addr >= start; + page_addr -= PAGE_SIZE) { + page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; + if (vma) + zap_page_range(vma, (uintptr_t)page_addr + + proc->user_buffer_offset, PAGE_SIZE, NULL); +err_vm_insert_page_failed: + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); +err_map_kernel_failed: + __free_page(*page); + *page = NULL; +err_alloc_page_failed: + ; + } +err_no_vma: + if (mm) { + up_write(&mm->mmap_sem); + mmput(mm); + } + return -ENOMEM; +} + +static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, + size_t data_size, + size_t offsets_size, int is_async) +{ + struct rb_node *n = proc->free_buffers.rb_node; + struct binder_buffer *buffer; + size_t buffer_size; + struct rb_node *best_fit = NULL; + void *has_page_addr; + void *end_page_addr; + size_t size; + + if (proc->vma == NULL) { + pr_err("%d: binder_alloc_buf, no vma\n", + proc->pid); + return NULL; + } + + size = ALIGN(data_size, sizeof(void *)) + + ALIGN(offsets_size, sizeof(void *)); + + if (size < data_size || size < offsets_size) { + binder_user_error("%d: got transaction with invalid size %zd-%zd\n", + proc->pid, data_size, offsets_size); + return NULL; + } + + if (is_async && + proc->free_async_space < size + sizeof(struct binder_buffer)) { + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd failed, no async space left\n", + proc->pid, size); + return NULL; + } + + while (n) { + buffer = rb_entry(n, struct binder_buffer, rb_node); + BUG_ON(!buffer->free); + buffer_size = binder_buffer_size(proc, buffer); + + if (size < buffer_size) { + best_fit = n; + n = n->rb_left; + } else if (size > buffer_size) + n = n->rb_right; + else { + best_fit = n; + break; + } + } + if (best_fit == NULL) { + pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", + proc->pid, size); + return NULL; + } + if (n == NULL) { + buffer = rb_entry(best_fit, struct binder_buffer, rb_node); + buffer_size = binder_buffer_size(proc, buffer); + } + + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got buffer %p size %zd\n", + proc->pid, size, buffer, buffer_size); + + has_page_addr = + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); + if (n == NULL) { + if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) + buffer_size = size; /* no room for other buffers */ + else + buffer_size = size + sizeof(struct binder_buffer); + } + end_page_addr = + (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); + if (end_page_addr > has_page_addr) + end_page_addr = has_page_addr; + if (binder_update_page_range(proc, 1, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) + return NULL; + + rb_erase(best_fit, &proc->free_buffers); + buffer->free = 0; + binder_insert_allocated_buffer(proc, buffer); + if (buffer_size != size) { + struct binder_buffer *new_buffer = (void *)buffer->data + size; + + list_add(&new_buffer->entry, &buffer->entry); + new_buffer->free = 1; + binder_insert_free_buffer(proc, new_buffer); + } + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_alloc_buf size %zd got %p\n", + proc->pid, size, buffer); + buffer->data_size = data_size; + buffer->offsets_size = offsets_size; + buffer->async_transaction = is_async; + if (is_async) { + proc->free_async_space -= size + sizeof(struct binder_buffer); + binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + "%d: binder_alloc_buf size %zd async free %zd\n", + proc->pid, size, proc->free_async_space); + } + + return buffer; +} + +static void *buffer_start_page(struct binder_buffer *buffer) +{ + return (void *)((uintptr_t)buffer & PAGE_MASK); +} + +static void *buffer_end_page(struct binder_buffer *buffer) +{ + return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); +} + +static void binder_delete_free_buffer(struct binder_proc *proc, + struct binder_buffer *buffer) +{ + struct binder_buffer *prev, *next = NULL; + int free_page_end = 1; + int free_page_start = 1; + + BUG_ON(proc->buffers.next == &buffer->entry); + prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); + BUG_ON(!prev->free); + if (buffer_end_page(prev) == buffer_start_page(buffer)) { + free_page_start = 0; + if (buffer_end_page(prev) == buffer_end_page(buffer)) + free_page_end = 0; + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %p share page with %p\n", + proc->pid, buffer, prev); + } + + if (!list_is_last(&buffer->entry, &proc->buffers)) { + next = list_entry(buffer->entry.next, + struct binder_buffer, entry); + if (buffer_start_page(next) == buffer_end_page(buffer)) { + free_page_end = 0; + if (buffer_start_page(next) == + buffer_start_page(buffer)) + free_page_start = 0; + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %p share page with %p\n", + proc->pid, buffer, prev); + } + } + list_del(&buffer->entry); + if (free_page_start || free_page_end) { + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %p do not share page%s%s with %p or %p\n", + proc->pid, buffer, free_page_start ? "" : " end", + free_page_end ? "" : " start", prev, next); + binder_update_page_range(proc, 0, free_page_start ? + buffer_start_page(buffer) : buffer_end_page(buffer), + (free_page_end ? buffer_end_page(buffer) : + buffer_start_page(buffer)) + PAGE_SIZE, NULL); + } +} + +static void binder_free_buf(struct binder_proc *proc, + struct binder_buffer *buffer) +{ + size_t size, buffer_size; + + buffer_size = binder_buffer_size(proc, buffer); + + size = ALIGN(buffer->data_size, sizeof(void *)) + + ALIGN(buffer->offsets_size, sizeof(void *)); + + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: binder_free_buf %p size %zd buffer_size %zd\n", + proc->pid, buffer, size, buffer_size); + + BUG_ON(buffer->free); + BUG_ON(size > buffer_size); + BUG_ON(buffer->transaction != NULL); + BUG_ON((void *)buffer < proc->buffer); + BUG_ON((void *)buffer > proc->buffer + proc->buffer_size); + + if (buffer->async_transaction) { + proc->free_async_space += size + sizeof(struct binder_buffer); + + binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, + "%d: binder_free_buf size %zd async free %zd\n", + proc->pid, size, proc->free_async_space); + } + + binder_update_page_range(proc, 0, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), + NULL); + rb_erase(&buffer->rb_node, &proc->allocated_buffers); + buffer->free = 1; + if (!list_is_last(&buffer->entry, &proc->buffers)) { + struct binder_buffer *next = list_entry(buffer->entry.next, + struct binder_buffer, entry); + + if (next->free) { + rb_erase(&next->rb_node, &proc->free_buffers); + binder_delete_free_buffer(proc, next); + } + } + if (proc->buffers.next != &buffer->entry) { + struct binder_buffer *prev = list_entry(buffer->entry.prev, + struct binder_buffer, entry); + + if (prev->free) { + binder_delete_free_buffer(proc, buffer); + rb_erase(&prev->rb_node, &proc->free_buffers); + buffer = prev; + } + } + binder_insert_free_buffer(proc, buffer); +} + +static struct binder_node *binder_get_node(struct binder_proc *proc, + binder_uintptr_t ptr) +{ + struct rb_node *n = proc->nodes.rb_node; + struct binder_node *node; + + while (n) { + node = rb_entry(n, struct binder_node, rb_node); + + if (ptr < node->ptr) + n = n->rb_left; + else if (ptr > node->ptr) + n = n->rb_right; + else + return node; + } + return NULL; +} + +static struct binder_node *binder_new_node(struct binder_proc *proc, + binder_uintptr_t ptr, + binder_uintptr_t cookie) +{ + struct rb_node **p = &proc->nodes.rb_node; + struct rb_node *parent = NULL; + struct binder_node *node; + + while (*p) { + parent = *p; + node = rb_entry(parent, struct binder_node, rb_node); + + if (ptr < node->ptr) + p = &(*p)->rb_left; + else if (ptr > node->ptr) + p = &(*p)->rb_right; + else + return NULL; + } + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (node == NULL) + return NULL; + binder_stats_created(BINDER_STAT_NODE); + rb_link_node(&node->rb_node, parent, p); + rb_insert_color(&node->rb_node, &proc->nodes); + node->debug_id = ++binder_last_id; + node->proc = proc; + node->ptr = ptr; + node->cookie = cookie; + node->work.type = BINDER_WORK_NODE; + INIT_LIST_HEAD(&node->work.entry); + INIT_LIST_HEAD(&node->async_todo); + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx created\n", + proc->pid, current->pid, node->debug_id, + (u64)node->ptr, (u64)node->cookie); + return node; +} + +static int binder_inc_node(struct binder_node *node, int strong, int internal, + struct list_head *target_list) +{ + if (strong) { + if (internal) { + if (target_list == NULL && + node->internal_strong_refs == 0 && + !(node == binder_context_mgr_node && + node->has_strong_ref)) { + pr_err("invalid inc strong node for %d\n", + node->debug_id); + return -EINVAL; + } + node->internal_strong_refs++; + } else + node->local_strong_refs++; + if (!node->has_strong_ref && target_list) { + list_del_init(&node->work.entry); + list_add_tail(&node->work.entry, target_list); + } + } else { + if (!internal) + node->local_weak_refs++; + if (!node->has_weak_ref && list_empty(&node->work.entry)) { + if (target_list == NULL) { + pr_err("invalid inc weak node for %d\n", + node->debug_id); + return -EINVAL; + } + list_add_tail(&node->work.entry, target_list); + } + } + return 0; +} + +static int binder_dec_node(struct binder_node *node, int strong, int internal) +{ + if (strong) { + if (internal) + node->internal_strong_refs--; + else + node->local_strong_refs--; + if (node->local_strong_refs || node->internal_strong_refs) + return 0; + } else { + if (!internal) + node->local_weak_refs--; + if (node->local_weak_refs || !hlist_empty(&node->refs)) + return 0; + } + if (node->proc && (node->has_strong_ref || node->has_weak_ref)) { + if (list_empty(&node->work.entry)) { + list_add_tail(&node->work.entry, &node->proc->todo); + wake_up_interruptible(&node->proc->wait); + } + } else { + if (hlist_empty(&node->refs) && !node->local_strong_refs && + !node->local_weak_refs) { + list_del_init(&node->work.entry); + if (node->proc) { + rb_erase(&node->rb_node, &node->proc->nodes); + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "refless node %d deleted\n", + node->debug_id); + } else { + hlist_del(&node->dead_node); + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "dead node %d deleted\n", + node->debug_id); + } + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); + } + } + + return 0; +} + + +static struct binder_ref *binder_get_ref(struct binder_proc *proc, + uint32_t desc) +{ + struct rb_node *n = proc->refs_by_desc.rb_node; + struct binder_ref *ref; + + while (n) { + ref = rb_entry(n, struct binder_ref, rb_node_desc); + + if (desc < ref->desc) + n = n->rb_left; + else if (desc > ref->desc) + n = n->rb_right; + else + return ref; + } + return NULL; +} + +static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, + struct binder_node *node) +{ + struct rb_node *n; + struct rb_node **p = &proc->refs_by_node.rb_node; + struct rb_node *parent = NULL; + struct binder_ref *ref, *new_ref; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct binder_ref, rb_node_node); + + if (node < ref->node) + p = &(*p)->rb_left; + else if (node > ref->node) + p = &(*p)->rb_right; + else + return ref; + } + new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (new_ref == NULL) + return NULL; + binder_stats_created(BINDER_STAT_REF); + new_ref->debug_id = ++binder_last_id; + new_ref->proc = proc; + new_ref->node = node; + rb_link_node(&new_ref->rb_node_node, parent, p); + rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); + + new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1; + for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { + ref = rb_entry(n, struct binder_ref, rb_node_desc); + if (ref->desc > new_ref->desc) + break; + new_ref->desc = ref->desc + 1; + } + + p = &proc->refs_by_desc.rb_node; + while (*p) { + parent = *p; + ref = rb_entry(parent, struct binder_ref, rb_node_desc); + + if (new_ref->desc < ref->desc) + p = &(*p)->rb_left; + else if (new_ref->desc > ref->desc) + p = &(*p)->rb_right; + else + BUG(); + } + rb_link_node(&new_ref->rb_node_desc, parent, p); + rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); + if (node) { + hlist_add_head(&new_ref->node_entry, &node->refs); + + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d new ref %d desc %d for node %d\n", + proc->pid, new_ref->debug_id, new_ref->desc, + node->debug_id); + } else { + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d new ref %d desc %d for dead node\n", + proc->pid, new_ref->debug_id, new_ref->desc); + } + return new_ref; +} + +static void binder_delete_ref(struct binder_ref *ref) +{ + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d delete ref %d desc %d for node %d\n", + ref->proc->pid, ref->debug_id, ref->desc, + ref->node->debug_id); + + rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); + rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); + if (ref->strong) + binder_dec_node(ref->node, 1, 1); + hlist_del(&ref->node_entry); + binder_dec_node(ref->node, 0, 1); + if (ref->death) { + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "%d delete ref %d desc %d has death notification\n", + ref->proc->pid, ref->debug_id, ref->desc); + list_del(&ref->death->work.entry); + kfree(ref->death); + binder_stats_deleted(BINDER_STAT_DEATH); + } + kfree(ref); + binder_stats_deleted(BINDER_STAT_REF); +} + +static int binder_inc_ref(struct binder_ref *ref, int strong, + struct list_head *target_list) +{ + int ret; + + if (strong) { + if (ref->strong == 0) { + ret = binder_inc_node(ref->node, 1, 1, target_list); + if (ret) + return ret; + } + ref->strong++; + } else { + if (ref->weak == 0) { + ret = binder_inc_node(ref->node, 0, 1, target_list); + if (ret) + return ret; + } + ref->weak++; + } + return 0; +} + + +static int binder_dec_ref(struct binder_ref *ref, int strong) +{ + if (strong) { + if (ref->strong == 0) { + binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", + ref->proc->pid, ref->debug_id, + ref->desc, ref->strong, ref->weak); + return -EINVAL; + } + ref->strong--; + if (ref->strong == 0) { + int ret; + + ret = binder_dec_node(ref->node, strong, 1); + if (ret) + return ret; + } + } else { + if (ref->weak == 0) { + binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", + ref->proc->pid, ref->debug_id, + ref->desc, ref->strong, ref->weak); + return -EINVAL; + } + ref->weak--; + } + if (ref->strong == 0 && ref->weak == 0) + binder_delete_ref(ref); + return 0; +} + +static void binder_pop_transaction(struct binder_thread *target_thread, + struct binder_transaction *t) +{ + if (target_thread) { + BUG_ON(target_thread->transaction_stack != t); + BUG_ON(target_thread->transaction_stack->from != target_thread); + target_thread->transaction_stack = + target_thread->transaction_stack->from_parent; + t->from = NULL; + } + t->need_reply = 0; + if (t->buffer) + t->buffer->transaction = NULL; + kfree(t); + binder_stats_deleted(BINDER_STAT_TRANSACTION); +} + +static void binder_send_failed_reply(struct binder_transaction *t, + uint32_t error_code) +{ + struct binder_thread *target_thread; + struct binder_transaction *next; + + BUG_ON(t->flags & TF_ONE_WAY); + while (1) { + target_thread = t->from; + if (target_thread) { + if (target_thread->return_error != BR_OK && + target_thread->return_error2 == BR_OK) { + target_thread->return_error2 = + target_thread->return_error; + target_thread->return_error = BR_OK; + } + if (target_thread->return_error == BR_OK) { + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d to %d:%d\n", + t->debug_id, + target_thread->proc->pid, + target_thread->pid); + + binder_pop_transaction(target_thread, t); + target_thread->return_error = error_code; + wake_up_interruptible(&target_thread->wait); + } else { + pr_err("reply failed, target thread, %d:%d, has error code %d already\n", + target_thread->proc->pid, + target_thread->pid, + target_thread->return_error); + } + return; + } + next = t->from_parent; + + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "send failed reply for transaction %d, target dead\n", + t->debug_id); + + binder_pop_transaction(target_thread, t); + if (next == NULL) { + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "reply failed, no target thread at root\n"); + return; + } + t = next; + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "reply failed, no target thread -- retry %d\n", + t->debug_id); + } +} + +static void binder_transaction_buffer_release(struct binder_proc *proc, + struct binder_buffer *buffer, + binder_size_t *failed_at) +{ + binder_size_t *offp, *off_end; + int debug_id = buffer->debug_id; + + binder_debug(BINDER_DEBUG_TRANSACTION, + "%d buffer release %d, size %zd-%zd, failed at %p\n", + proc->pid, buffer->debug_id, + buffer->data_size, buffer->offsets_size, failed_at); + + if (buffer->target_node) + binder_dec_node(buffer->target_node, 1, 0); + + offp = (binder_size_t *)(buffer->data + + ALIGN(buffer->data_size, sizeof(void *))); + if (failed_at) + off_end = failed_at; + else + off_end = (void *)offp + buffer->offsets_size; + for (; offp < off_end; offp++) { + struct flat_binder_object *fp; + + if (*offp > buffer->data_size - sizeof(*fp) || + buffer->data_size < sizeof(*fp) || + !IS_ALIGNED(*offp, sizeof(u32))) { + pr_err("transaction release %d bad offset %lld, size %zd\n", + debug_id, (u64)*offp, buffer->data_size); + continue; + } + fp = (struct flat_binder_object *)(buffer->data + *offp); + switch (fp->type) { + case BINDER_TYPE_BINDER: + case BINDER_TYPE_WEAK_BINDER: { + struct binder_node *node = binder_get_node(proc, fp->binder); + + if (node == NULL) { + pr_err("transaction release %d bad node %016llx\n", + debug_id, (u64)fp->binder); + break; + } + binder_debug(BINDER_DEBUG_TRANSACTION, + " node %d u%016llx\n", + node->debug_id, (u64)node->ptr); + binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); + } break; + case BINDER_TYPE_HANDLE: + case BINDER_TYPE_WEAK_HANDLE: { + struct binder_ref *ref = binder_get_ref(proc, fp->handle); + + if (ref == NULL) { + pr_err("transaction release %d bad handle %d\n", + debug_id, fp->handle); + break; + } + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d (node %d)\n", + ref->debug_id, ref->desc, ref->node->debug_id); + binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); + } break; + + case BINDER_TYPE_FD: + binder_debug(BINDER_DEBUG_TRANSACTION, + " fd %d\n", fp->handle); + if (failed_at) + task_close_fd(proc, fp->handle); + break; + + default: + pr_err("transaction release %d bad object type %x\n", + debug_id, fp->type); + break; + } + } +} + +static void binder_transaction(struct binder_proc *proc, + struct binder_thread *thread, + struct binder_transaction_data *tr, int reply) +{ + struct binder_transaction *t; + struct binder_work *tcomplete; + binder_size_t *offp, *off_end; + struct binder_proc *target_proc; + struct binder_thread *target_thread = NULL; + struct binder_node *target_node = NULL; + struct list_head *target_list; + wait_queue_head_t *target_wait; + struct binder_transaction *in_reply_to = NULL; + struct binder_transaction_log_entry *e; + uint32_t return_error; + + e = binder_transaction_log_add(&binder_transaction_log); + e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); + e->from_proc = proc->pid; + e->from_thread = thread->pid; + e->target_handle = tr->target.handle; + e->data_size = tr->data_size; + e->offsets_size = tr->offsets_size; + + if (reply) { + in_reply_to = thread->transaction_stack; + if (in_reply_to == NULL) { + binder_user_error("%d:%d got reply transaction with no transaction stack\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_empty_call_stack; + } + binder_set_nice(in_reply_to->saved_priority); + if (in_reply_to->to_thread != thread) { + binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", + proc->pid, thread->pid, in_reply_to->debug_id, + in_reply_to->to_proc ? + in_reply_to->to_proc->pid : 0, + in_reply_to->to_thread ? + in_reply_to->to_thread->pid : 0); + return_error = BR_FAILED_REPLY; + in_reply_to = NULL; + goto err_bad_call_stack; + } + thread->transaction_stack = in_reply_to->to_parent; + target_thread = in_reply_to->from; + if (target_thread == NULL) { + return_error = BR_DEAD_REPLY; + goto err_dead_binder; + } + if (target_thread->transaction_stack != in_reply_to) { + binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n", + proc->pid, thread->pid, + target_thread->transaction_stack ? + target_thread->transaction_stack->debug_id : 0, + in_reply_to->debug_id); + return_error = BR_FAILED_REPLY; + in_reply_to = NULL; + target_thread = NULL; + goto err_dead_binder; + } + target_proc = target_thread->proc; + } else { + if (tr->target.handle) { + struct binder_ref *ref; + + ref = binder_get_ref(proc, tr->target.handle); + if (ref == NULL) { + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_invalid_target_handle; + } + target_node = ref->node; + } else { + target_node = binder_context_mgr_node; + if (target_node == NULL) { + return_error = BR_DEAD_REPLY; + goto err_no_context_mgr_node; + } + } + e->to_node = target_node->debug_id; + target_proc = target_node->proc; + if (target_proc == NULL) { + return_error = BR_DEAD_REPLY; + goto err_dead_binder; + } + if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { + struct binder_transaction *tmp; + + tmp = thread->transaction_stack; + if (tmp->to_thread != thread) { + binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", + proc->pid, thread->pid, tmp->debug_id, + tmp->to_proc ? tmp->to_proc->pid : 0, + tmp->to_thread ? + tmp->to_thread->pid : 0); + return_error = BR_FAILED_REPLY; + goto err_bad_call_stack; + } + while (tmp) { + if (tmp->from && tmp->from->proc == target_proc) + target_thread = tmp->from; + tmp = tmp->from_parent; + } + } + } + if (target_thread) { + e->to_thread = target_thread->pid; + target_list = &target_thread->todo; + target_wait = &target_thread->wait; + } else { + target_list = &target_proc->todo; + target_wait = &target_proc->wait; + } + e->to_proc = target_proc->pid; + + /* TODO: reuse incoming transaction for reply */ + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) { + return_error = BR_FAILED_REPLY; + goto err_alloc_t_failed; + } + binder_stats_created(BINDER_STAT_TRANSACTION); + + tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); + if (tcomplete == NULL) { + return_error = BR_FAILED_REPLY; + goto err_alloc_tcomplete_failed; + } + binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); + + t->debug_id = ++binder_last_id; + e->debug_id = t->debug_id; + + if (reply) + binder_debug(BINDER_DEBUG_TRANSACTION, + "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n", + proc->pid, thread->pid, t->debug_id, + target_proc->pid, target_thread->pid, + (u64)tr->data.ptr.buffer, + (u64)tr->data.ptr.offsets, + (u64)tr->data_size, (u64)tr->offsets_size); + else + binder_debug(BINDER_DEBUG_TRANSACTION, + "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n", + proc->pid, thread->pid, t->debug_id, + target_proc->pid, target_node->debug_id, + (u64)tr->data.ptr.buffer, + (u64)tr->data.ptr.offsets, + (u64)tr->data_size, (u64)tr->offsets_size); + + if (!reply && !(tr->flags & TF_ONE_WAY)) + t->from = thread; + else + t->from = NULL; + t->sender_euid = task_euid(proc->tsk); + t->to_proc = target_proc; + t->to_thread = target_thread; + t->code = tr->code; + t->flags = tr->flags; + t->priority = task_nice(current); + + trace_binder_transaction(reply, t, target_node); + + t->buffer = binder_alloc_buf(target_proc, tr->data_size, + tr->offsets_size, !reply && (t->flags & TF_ONE_WAY)); + if (t->buffer == NULL) { + return_error = BR_FAILED_REPLY; + goto err_binder_alloc_buf_failed; + } + t->buffer->allow_user_free = 0; + t->buffer->debug_id = t->debug_id; + t->buffer->transaction = t; + t->buffer->target_node = target_node; + trace_binder_transaction_alloc_buf(t->buffer); + if (target_node) + binder_inc_node(target_node, 1, 0, NULL); + + offp = (binder_size_t *)(t->buffer->data + + ALIGN(tr->data_size, sizeof(void *))); + + if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) + tr->data.ptr.buffer, tr->data_size)) { + binder_user_error("%d:%d got transaction with invalid data ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_copy_data_failed; + } + if (copy_from_user(offp, (const void __user *)(uintptr_t) + tr->data.ptr.offsets, tr->offsets_size)) { + binder_user_error("%d:%d got transaction with invalid offsets ptr\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; + goto err_copy_data_failed; + } + if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { + binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", + proc->pid, thread->pid, (u64)tr->offsets_size); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + off_end = (void *)offp + tr->offsets_size; + for (; offp < off_end; offp++) { + struct flat_binder_object *fp; + + if (*offp > t->buffer->data_size - sizeof(*fp) || + t->buffer->data_size < sizeof(*fp) || + !IS_ALIGNED(*offp, sizeof(u32))) { + binder_user_error("%d:%d got transaction with invalid offset, %lld\n", + proc->pid, thread->pid, (u64)*offp); + return_error = BR_FAILED_REPLY; + goto err_bad_offset; + } + fp = (struct flat_binder_object *)(t->buffer->data + *offp); + switch (fp->type) { + case BINDER_TYPE_BINDER: + case BINDER_TYPE_WEAK_BINDER: { + struct binder_ref *ref; + struct binder_node *node = binder_get_node(proc, fp->binder); + + if (node == NULL) { + node = binder_new_node(proc, fp->binder, fp->cookie); + if (node == NULL) { + return_error = BR_FAILED_REPLY; + goto err_binder_new_node_failed; + } + node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; + node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + } + if (fp->cookie != node->cookie) { + binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", + proc->pid, thread->pid, + (u64)fp->binder, node->debug_id, + (u64)fp->cookie, (u64)node->cookie); + return_error = BR_FAILED_REPLY; + goto err_binder_get_ref_for_node_failed; + } + ref = binder_get_ref_for_node(target_proc, node); + if (ref == NULL) { + return_error = BR_FAILED_REPLY; + goto err_binder_get_ref_for_node_failed; + } + if (fp->type == BINDER_TYPE_BINDER) + fp->type = BINDER_TYPE_HANDLE; + else + fp->type = BINDER_TYPE_WEAK_HANDLE; + fp->handle = ref->desc; + binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, + &thread->todo); + + trace_binder_transaction_node_to_ref(t, node, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " node %d u%016llx -> ref %d desc %d\n", + node->debug_id, (u64)node->ptr, + ref->debug_id, ref->desc); + } break; + case BINDER_TYPE_HANDLE: + case BINDER_TYPE_WEAK_HANDLE: { + struct binder_ref *ref = binder_get_ref(proc, fp->handle); + + if (ref == NULL) { + binder_user_error("%d:%d got transaction with invalid handle, %d\n", + proc->pid, + thread->pid, fp->handle); + return_error = BR_FAILED_REPLY; + goto err_binder_get_ref_failed; + } + if (ref->node->proc == target_proc) { + if (fp->type == BINDER_TYPE_HANDLE) + fp->type = BINDER_TYPE_BINDER; + else + fp->type = BINDER_TYPE_WEAK_BINDER; + fp->binder = ref->node->ptr; + fp->cookie = ref->node->cookie; + binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); + trace_binder_transaction_ref_to_node(t, ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> node %d u%016llx\n", + ref->debug_id, ref->desc, ref->node->debug_id, + (u64)ref->node->ptr); + } else { + struct binder_ref *new_ref; + + new_ref = binder_get_ref_for_node(target_proc, ref->node); + if (new_ref == NULL) { + return_error = BR_FAILED_REPLY; + goto err_binder_get_ref_for_node_failed; + } + fp->handle = new_ref->desc; + binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); + trace_binder_transaction_ref_to_ref(t, ref, + new_ref); + binder_debug(BINDER_DEBUG_TRANSACTION, + " ref %d desc %d -> ref %d desc %d (node %d)\n", + ref->debug_id, ref->desc, new_ref->debug_id, + new_ref->desc, ref->node->debug_id); + } + } break; + + case BINDER_TYPE_FD: { + int target_fd; + struct file *file; + + if (reply) { + if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { + binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", + proc->pid, thread->pid, fp->handle); + return_error = BR_FAILED_REPLY; + goto err_fd_not_allowed; + } + } else if (!target_node->accept_fds) { + binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", + proc->pid, thread->pid, fp->handle); + return_error = BR_FAILED_REPLY; + goto err_fd_not_allowed; + } + + file = fget(fp->handle); + if (file == NULL) { + binder_user_error("%d:%d got transaction with invalid fd, %d\n", + proc->pid, thread->pid, fp->handle); + return_error = BR_FAILED_REPLY; + goto err_fget_failed; + } + target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); + if (target_fd < 0) { + fput(file); + return_error = BR_FAILED_REPLY; + goto err_get_unused_fd_failed; + } + task_fd_install(target_proc, target_fd, file); + trace_binder_transaction_fd(t, fp->handle, target_fd); + binder_debug(BINDER_DEBUG_TRANSACTION, + " fd %d -> %d\n", fp->handle, target_fd); + /* TODO: fput? */ + fp->handle = target_fd; + } break; + + default: + binder_user_error("%d:%d got transaction with invalid object type, %x\n", + proc->pid, thread->pid, fp->type); + return_error = BR_FAILED_REPLY; + goto err_bad_object_type; + } + } + if (reply) { + BUG_ON(t->buffer->async_transaction != 0); + binder_pop_transaction(target_thread, in_reply_to); + } else if (!(t->flags & TF_ONE_WAY)) { + BUG_ON(t->buffer->async_transaction != 0); + t->need_reply = 1; + t->from_parent = thread->transaction_stack; + thread->transaction_stack = t; + } else { + BUG_ON(target_node == NULL); + BUG_ON(t->buffer->async_transaction != 1); + if (target_node->has_async_transaction) { + target_list = &target_node->async_todo; + target_wait = NULL; + } else + target_node->has_async_transaction = 1; + } + t->work.type = BINDER_WORK_TRANSACTION; + list_add_tail(&t->work.entry, target_list); + tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + list_add_tail(&tcomplete->entry, &thread->todo); + if (target_wait) + wake_up_interruptible(target_wait); + return; + +err_get_unused_fd_failed: +err_fget_failed: +err_fd_not_allowed: +err_binder_get_ref_for_node_failed: +err_binder_get_ref_failed: +err_binder_new_node_failed: +err_bad_object_type: +err_bad_offset: +err_copy_data_failed: + trace_binder_transaction_failed_buffer_release(t->buffer); + binder_transaction_buffer_release(target_proc, t->buffer, offp); + t->buffer->transaction = NULL; + binder_free_buf(target_proc, t->buffer); +err_binder_alloc_buf_failed: + kfree(tcomplete); + binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); +err_alloc_tcomplete_failed: + kfree(t); + binder_stats_deleted(BINDER_STAT_TRANSACTION); +err_alloc_t_failed: +err_bad_call_stack: +err_empty_call_stack: +err_dead_binder: +err_invalid_target_handle: +err_no_context_mgr_node: + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "%d:%d transaction failed %d, size %lld-%lld\n", + proc->pid, thread->pid, return_error, + (u64)tr->data_size, (u64)tr->offsets_size); + + { + struct binder_transaction_log_entry *fe; + + fe = binder_transaction_log_add(&binder_transaction_log_failed); + *fe = *e; + } + + BUG_ON(thread->return_error != BR_OK); + if (in_reply_to) { + thread->return_error = BR_TRANSACTION_COMPLETE; + binder_send_failed_reply(in_reply_to, return_error); + } else + thread->return_error = return_error; +} + +static int binder_thread_write(struct binder_proc *proc, + struct binder_thread *thread, + binder_uintptr_t binder_buffer, size_t size, + binder_size_t *consumed) +{ + uint32_t cmd; + void __user *buffer = (void __user *)(uintptr_t)binder_buffer; + void __user *ptr = buffer + *consumed; + void __user *end = buffer + size; + + while (ptr < end && thread->return_error == BR_OK) { + if (get_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + trace_binder_command(cmd); + if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { + binder_stats.bc[_IOC_NR(cmd)]++; + proc->stats.bc[_IOC_NR(cmd)]++; + thread->stats.bc[_IOC_NR(cmd)]++; + } + switch (cmd) { + case BC_INCREFS: + case BC_ACQUIRE: + case BC_RELEASE: + case BC_DECREFS: { + uint32_t target; + struct binder_ref *ref; + const char *debug_string; + + if (get_user(target, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (target == 0 && binder_context_mgr_node && + (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { + ref = binder_get_ref_for_node(proc, + binder_context_mgr_node); + if (ref->desc != target) { + binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", + proc->pid, thread->pid, + ref->desc); + } + } else + ref = binder_get_ref(proc, target); + if (ref == NULL) { + binder_user_error("%d:%d refcount change on invalid ref %d\n", + proc->pid, thread->pid, target); + break; + } + switch (cmd) { + case BC_INCREFS: + debug_string = "IncRefs"; + binder_inc_ref(ref, 0, NULL); + break; + case BC_ACQUIRE: + debug_string = "Acquire"; + binder_inc_ref(ref, 1, NULL); + break; + case BC_RELEASE: + debug_string = "Release"; + binder_dec_ref(ref, 1); + break; + case BC_DECREFS: + default: + debug_string = "DecRefs"; + binder_dec_ref(ref, 0); + break; + } + binder_debug(BINDER_DEBUG_USER_REFS, + "%d:%d %s ref %d desc %d s %d w %d for node %d\n", + proc->pid, thread->pid, debug_string, ref->debug_id, + ref->desc, ref->strong, ref->weak, ref->node->debug_id); + break; + } + case BC_INCREFS_DONE: + case BC_ACQUIRE_DONE: { + binder_uintptr_t node_ptr; + binder_uintptr_t cookie; + struct binder_node *node; + + if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + if (get_user(cookie, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + node = binder_get_node(proc, node_ptr); + if (node == NULL) { + binder_user_error("%d:%d %s u%016llx no match\n", + proc->pid, thread->pid, + cmd == BC_INCREFS_DONE ? + "BC_INCREFS_DONE" : + "BC_ACQUIRE_DONE", + (u64)node_ptr); + break; + } + if (cookie != node->cookie) { + binder_user_error("%d:%d %s u%016llx node %d cookie mismatch %016llx != %016llx\n", + proc->pid, thread->pid, + cmd == BC_INCREFS_DONE ? + "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", + (u64)node_ptr, node->debug_id, + (u64)cookie, (u64)node->cookie); + break; + } + if (cmd == BC_ACQUIRE_DONE) { + if (node->pending_strong_ref == 0) { + binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", + proc->pid, thread->pid, + node->debug_id); + break; + } + node->pending_strong_ref = 0; + } else { + if (node->pending_weak_ref == 0) { + binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", + proc->pid, thread->pid, + node->debug_id); + break; + } + node->pending_weak_ref = 0; + } + binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); + binder_debug(BINDER_DEBUG_USER_REFS, + "%d:%d %s node %d ls %d lw %d\n", + proc->pid, thread->pid, + cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", + node->debug_id, node->local_strong_refs, node->local_weak_refs); + break; + } + case BC_ATTEMPT_ACQUIRE: + pr_err("BC_ATTEMPT_ACQUIRE not supported\n"); + return -EINVAL; + case BC_ACQUIRE_RESULT: + pr_err("BC_ACQUIRE_RESULT not supported\n"); + return -EINVAL; + + case BC_FREE_BUFFER: { + binder_uintptr_t data_ptr; + struct binder_buffer *buffer; + + if (get_user(data_ptr, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + + buffer = binder_buffer_lookup(proc, data_ptr); + if (buffer == NULL) { + binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", + proc->pid, thread->pid, (u64)data_ptr); + break; + } + if (!buffer->allow_user_free) { + binder_user_error("%d:%d BC_FREE_BUFFER u%016llx matched unreturned buffer\n", + proc->pid, thread->pid, (u64)data_ptr); + break; + } + binder_debug(BINDER_DEBUG_FREE_BUFFER, + "%d:%d BC_FREE_BUFFER u%016llx found buffer %d for %s transaction\n", + proc->pid, thread->pid, (u64)data_ptr, + buffer->debug_id, + buffer->transaction ? "active" : "finished"); + + if (buffer->transaction) { + buffer->transaction->buffer = NULL; + buffer->transaction = NULL; + } + if (buffer->async_transaction && buffer->target_node) { + BUG_ON(!buffer->target_node->has_async_transaction); + if (list_empty(&buffer->target_node->async_todo)) + buffer->target_node->has_async_transaction = 0; + else + list_move_tail(buffer->target_node->async_todo.next, &thread->todo); + } + trace_binder_transaction_buffer_release(buffer); + binder_transaction_buffer_release(proc, buffer, NULL); + binder_free_buf(proc, buffer); + break; + } + + case BC_TRANSACTION: + case BC_REPLY: { + struct binder_transaction_data tr; + + if (copy_from_user(&tr, ptr, sizeof(tr))) + return -EFAULT; + ptr += sizeof(tr); + binder_transaction(proc, thread, &tr, cmd == BC_REPLY); + break; + } + + case BC_REGISTER_LOOPER: + binder_debug(BINDER_DEBUG_THREADS, + "%d:%d BC_REGISTER_LOOPER\n", + proc->pid, thread->pid); + if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { + thread->looper |= BINDER_LOOPER_STATE_INVALID; + binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", + proc->pid, thread->pid); + } else if (proc->requested_threads == 0) { + thread->looper |= BINDER_LOOPER_STATE_INVALID; + binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n", + proc->pid, thread->pid); + } else { + proc->requested_threads--; + proc->requested_threads_started++; + } + thread->looper |= BINDER_LOOPER_STATE_REGISTERED; + break; + case BC_ENTER_LOOPER: + binder_debug(BINDER_DEBUG_THREADS, + "%d:%d BC_ENTER_LOOPER\n", + proc->pid, thread->pid); + if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) { + thread->looper |= BINDER_LOOPER_STATE_INVALID; + binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n", + proc->pid, thread->pid); + } + thread->looper |= BINDER_LOOPER_STATE_ENTERED; + break; + case BC_EXIT_LOOPER: + binder_debug(BINDER_DEBUG_THREADS, + "%d:%d BC_EXIT_LOOPER\n", + proc->pid, thread->pid); + thread->looper |= BINDER_LOOPER_STATE_EXITED; + break; + + case BC_REQUEST_DEATH_NOTIFICATION: + case BC_CLEAR_DEATH_NOTIFICATION: { + uint32_t target; + binder_uintptr_t cookie; + struct binder_ref *ref; + struct binder_ref_death *death; + + if (get_user(target, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (get_user(cookie, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + ref = binder_get_ref(proc, target); + if (ref == NULL) { + binder_user_error("%d:%d %s invalid ref %d\n", + proc->pid, thread->pid, + cmd == BC_REQUEST_DEATH_NOTIFICATION ? + "BC_REQUEST_DEATH_NOTIFICATION" : + "BC_CLEAR_DEATH_NOTIFICATION", + target); + break; + } + + binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, + "%d:%d %s %016llx ref %d desc %d s %d w %d for node %d\n", + proc->pid, thread->pid, + cmd == BC_REQUEST_DEATH_NOTIFICATION ? + "BC_REQUEST_DEATH_NOTIFICATION" : + "BC_CLEAR_DEATH_NOTIFICATION", + (u64)cookie, ref->debug_id, ref->desc, + ref->strong, ref->weak, ref->node->debug_id); + + if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { + if (ref->death) { + binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", + proc->pid, thread->pid); + break; + } + death = kzalloc(sizeof(*death), GFP_KERNEL); + if (death == NULL) { + thread->return_error = BR_ERROR; + binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, + "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", + proc->pid, thread->pid); + break; + } + binder_stats_created(BINDER_STAT_DEATH); + INIT_LIST_HEAD(&death->work.entry); + death->cookie = cookie; + ref->death = death; + if (ref->node->proc == NULL) { + ref->death->work.type = BINDER_WORK_DEAD_BINDER; + if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { + list_add_tail(&ref->death->work.entry, &thread->todo); + } else { + list_add_tail(&ref->death->work.entry, &proc->todo); + wake_up_interruptible(&proc->wait); + } + } + } else { + if (ref->death == NULL) { + binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", + proc->pid, thread->pid); + break; + } + death = ref->death; + if (death->cookie != cookie) { + binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %016llx != %016llx\n", + proc->pid, thread->pid, + (u64)death->cookie, + (u64)cookie); + break; + } + ref->death = NULL; + if (list_empty(&death->work.entry)) { + death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; + if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { + list_add_tail(&death->work.entry, &thread->todo); + } else { + list_add_tail(&death->work.entry, &proc->todo); + wake_up_interruptible(&proc->wait); + } + } else { + BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); + death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; + } + } + } break; + case BC_DEAD_BINDER_DONE: { + struct binder_work *w; + binder_uintptr_t cookie; + struct binder_ref_death *death = NULL; + + if (get_user(cookie, (binder_uintptr_t __user *)ptr)) + return -EFAULT; + + ptr += sizeof(void *); + list_for_each_entry(w, &proc->delivered_death, entry) { + struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); + + if (tmp_death->cookie == cookie) { + death = tmp_death; + break; + } + } + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "%d:%d BC_DEAD_BINDER_DONE %016llx found %p\n", + proc->pid, thread->pid, (u64)cookie, + death); + if (death == NULL) { + binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", + proc->pid, thread->pid, (u64)cookie); + break; + } + + list_del_init(&death->work.entry); + if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { + death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; + if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { + list_add_tail(&death->work.entry, &thread->todo); + } else { + list_add_tail(&death->work.entry, &proc->todo); + wake_up_interruptible(&proc->wait); + } + } + } break; + + default: + pr_err("%d:%d unknown command %d\n", + proc->pid, thread->pid, cmd); + return -EINVAL; + } + *consumed = ptr - buffer; + } + return 0; +} + +static void binder_stat_br(struct binder_proc *proc, + struct binder_thread *thread, uint32_t cmd) +{ + trace_binder_return(cmd); + if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { + binder_stats.br[_IOC_NR(cmd)]++; + proc->stats.br[_IOC_NR(cmd)]++; + thread->stats.br[_IOC_NR(cmd)]++; + } +} + +static int binder_has_proc_work(struct binder_proc *proc, + struct binder_thread *thread) +{ + return !list_empty(&proc->todo) || + (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); +} + +static int binder_has_thread_work(struct binder_thread *thread) +{ + return !list_empty(&thread->todo) || thread->return_error != BR_OK || + (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); +} + +static int binder_thread_read(struct binder_proc *proc, + struct binder_thread *thread, + binder_uintptr_t binder_buffer, size_t size, + binder_size_t *consumed, int non_block) +{ + void __user *buffer = (void __user *)(uintptr_t)binder_buffer; + void __user *ptr = buffer + *consumed; + void __user *end = buffer + size; + + int ret = 0; + int wait_for_proc_work; + + if (*consumed == 0) { + if (put_user(BR_NOOP, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + } + +retry: + wait_for_proc_work = thread->transaction_stack == NULL && + list_empty(&thread->todo); + + if (thread->return_error != BR_OK && ptr < end) { + if (thread->return_error2 != BR_OK) { + if (put_user(thread->return_error2, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + binder_stat_br(proc, thread, thread->return_error2); + if (ptr == end) + goto done; + thread->return_error2 = BR_OK; + } + if (put_user(thread->return_error, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + binder_stat_br(proc, thread, thread->return_error); + thread->return_error = BR_OK; + goto done; + } + + + thread->looper |= BINDER_LOOPER_STATE_WAITING; + if (wait_for_proc_work) + proc->ready_threads++; + + binder_unlock(__func__); + + trace_binder_wait_for_work(wait_for_proc_work, + !!thread->transaction_stack, + !list_empty(&thread->todo)); + if (wait_for_proc_work) { + if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED))) { + binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n", + proc->pid, thread->pid, thread->looper); + wait_event_interruptible(binder_user_error_wait, + binder_stop_on_user_error < 2); + } + binder_set_nice(proc->default_priority); + if (non_block) { + if (!binder_has_proc_work(proc, thread)) + ret = -EAGAIN; + } else + ret = wait_event_freezable_exclusive(proc->wait, binder_has_proc_work(proc, thread)); + } else { + if (non_block) { + if (!binder_has_thread_work(thread)) + ret = -EAGAIN; + } else + ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); + } + + binder_lock(__func__); + + if (wait_for_proc_work) + proc->ready_threads--; + thread->looper &= ~BINDER_LOOPER_STATE_WAITING; + + if (ret) + return ret; + + while (1) { + uint32_t cmd; + struct binder_transaction_data tr; + struct binder_work *w; + struct binder_transaction *t = NULL; + + if (!list_empty(&thread->todo)) { + w = list_first_entry(&thread->todo, struct binder_work, + entry); + } else if (!list_empty(&proc->todo) && wait_for_proc_work) { + w = list_first_entry(&proc->todo, struct binder_work, + entry); + } else { + /* no data added */ + if (ptr - buffer == 4 && + !(thread->looper & BINDER_LOOPER_STATE_NEED_RETURN)) + goto retry; + break; + } + + if (end - ptr < sizeof(tr) + 4) + break; + + switch (w->type) { + case BINDER_WORK_TRANSACTION: { + t = container_of(w, struct binder_transaction, work); + } break; + case BINDER_WORK_TRANSACTION_COMPLETE: { + cmd = BR_TRANSACTION_COMPLETE; + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + + binder_stat_br(proc, thread, cmd); + binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, + "%d:%d BR_TRANSACTION_COMPLETE\n", + proc->pid, thread->pid); + + list_del(&w->entry); + kfree(w); + binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); + } break; + case BINDER_WORK_NODE: { + struct binder_node *node = container_of(w, struct binder_node, work); + uint32_t cmd = BR_NOOP; + const char *cmd_name; + int strong = node->internal_strong_refs || node->local_strong_refs; + int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong; + + if (weak && !node->has_weak_ref) { + cmd = BR_INCREFS; + cmd_name = "BR_INCREFS"; + node->has_weak_ref = 1; + node->pending_weak_ref = 1; + node->local_weak_refs++; + } else if (strong && !node->has_strong_ref) { + cmd = BR_ACQUIRE; + cmd_name = "BR_ACQUIRE"; + node->has_strong_ref = 1; + node->pending_strong_ref = 1; + node->local_strong_refs++; + } else if (!strong && node->has_strong_ref) { + cmd = BR_RELEASE; + cmd_name = "BR_RELEASE"; + node->has_strong_ref = 0; + } else if (!weak && node->has_weak_ref) { + cmd = BR_DECREFS; + cmd_name = "BR_DECREFS"; + node->has_weak_ref = 0; + } + if (cmd != BR_NOOP) { + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (put_user(node->ptr, + (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + if (put_user(node->cookie, + (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + + binder_stat_br(proc, thread, cmd); + binder_debug(BINDER_DEBUG_USER_REFS, + "%d:%d %s %d u%016llx c%016llx\n", + proc->pid, thread->pid, cmd_name, + node->debug_id, + (u64)node->ptr, (u64)node->cookie); + } else { + list_del_init(&w->entry); + if (!weak && !strong) { + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx deleted\n", + proc->pid, thread->pid, + node->debug_id, + (u64)node->ptr, + (u64)node->cookie); + rb_erase(&node->rb_node, &proc->nodes); + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); + } else { + binder_debug(BINDER_DEBUG_INTERNAL_REFS, + "%d:%d node %d u%016llx c%016llx state unchanged\n", + proc->pid, thread->pid, + node->debug_id, + (u64)node->ptr, + (u64)node->cookie); + } + } + } break; + case BINDER_WORK_DEAD_BINDER: + case BINDER_WORK_DEAD_BINDER_AND_CLEAR: + case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { + struct binder_ref_death *death; + uint32_t cmd; + + death = container_of(w, struct binder_ref_death, work); + if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) + cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; + else + cmd = BR_DEAD_BINDER; + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (put_user(death->cookie, + (binder_uintptr_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(binder_uintptr_t); + binder_stat_br(proc, thread, cmd); + binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, + "%d:%d %s %016llx\n", + proc->pid, thread->pid, + cmd == BR_DEAD_BINDER ? + "BR_DEAD_BINDER" : + "BR_CLEAR_DEATH_NOTIFICATION_DONE", + (u64)death->cookie); + + if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { + list_del(&w->entry); + kfree(death); + binder_stats_deleted(BINDER_STAT_DEATH); + } else + list_move(&w->entry, &proc->delivered_death); + if (cmd == BR_DEAD_BINDER) + goto done; /* DEAD_BINDER notifications can cause transactions */ + } break; + } + + if (!t) + continue; + + BUG_ON(t->buffer == NULL); + if (t->buffer->target_node) { + struct binder_node *target_node = t->buffer->target_node; + + tr.target.ptr = target_node->ptr; + tr.cookie = target_node->cookie; + t->saved_priority = task_nice(current); + if (t->priority < target_node->min_priority && + !(t->flags & TF_ONE_WAY)) + binder_set_nice(t->priority); + else if (!(t->flags & TF_ONE_WAY) || + t->saved_priority > target_node->min_priority) + binder_set_nice(target_node->min_priority); + cmd = BR_TRANSACTION; + } else { + tr.target.ptr = 0; + tr.cookie = 0; + cmd = BR_REPLY; + } + tr.code = t->code; + tr.flags = t->flags; + tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid); + + if (t->from) { + struct task_struct *sender = t->from->proc->tsk; + + tr.sender_pid = task_tgid_nr_ns(sender, + task_active_pid_ns(current)); + } else { + tr.sender_pid = 0; + } + + tr.data_size = t->buffer->data_size; + tr.offsets_size = t->buffer->offsets_size; + tr.data.ptr.buffer = (binder_uintptr_t)( + (uintptr_t)t->buffer->data + + proc->user_buffer_offset); + tr.data.ptr.offsets = tr.data.ptr.buffer + + ALIGN(t->buffer->data_size, + sizeof(void *)); + + if (put_user(cmd, (uint32_t __user *)ptr)) + return -EFAULT; + ptr += sizeof(uint32_t); + if (copy_to_user(ptr, &tr, sizeof(tr))) + return -EFAULT; + ptr += sizeof(tr); + + trace_binder_transaction_received(t); + binder_stat_br(proc, thread, cmd); + binder_debug(BINDER_DEBUG_TRANSACTION, + "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %016llx-%016llx\n", + proc->pid, thread->pid, + (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : + "BR_REPLY", + t->debug_id, t->from ? t->from->proc->pid : 0, + t->from ? t->from->pid : 0, cmd, + t->buffer->data_size, t->buffer->offsets_size, + (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets); + + list_del(&t->work.entry); + t->buffer->allow_user_free = 1; + if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { + t->to_parent = thread->transaction_stack; + t->to_thread = thread; + thread->transaction_stack = t; + } else { + t->buffer->transaction = NULL; + kfree(t); + binder_stats_deleted(BINDER_STAT_TRANSACTION); + } + break; + } + +done: + + *consumed = ptr - buffer; + if (proc->requested_threads + proc->ready_threads == 0 && + proc->requested_threads_started < proc->max_threads && + (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | + BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ + /*spawn a new thread if we leave this out */) { + proc->requested_threads++; + binder_debug(BINDER_DEBUG_THREADS, + "%d:%d BR_SPAWN_LOOPER\n", + proc->pid, thread->pid); + if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) + return -EFAULT; + binder_stat_br(proc, thread, BR_SPAWN_LOOPER); + } + return 0; +} + +static void binder_release_work(struct list_head *list) +{ + struct binder_work *w; + + while (!list_empty(list)) { + w = list_first_entry(list, struct binder_work, entry); + list_del_init(&w->entry); + switch (w->type) { + case BINDER_WORK_TRANSACTION: { + struct binder_transaction *t; + + t = container_of(w, struct binder_transaction, work); + if (t->buffer->target_node && + !(t->flags & TF_ONE_WAY)) { + binder_send_failed_reply(t, BR_DEAD_REPLY); + } else { + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered transaction %d\n", + t->debug_id); + t->buffer->transaction = NULL; + kfree(t); + binder_stats_deleted(BINDER_STAT_TRANSACTION); + } + } break; + case BINDER_WORK_TRANSACTION_COMPLETE: { + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered TRANSACTION_COMPLETE\n"); + kfree(w); + binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); + } break; + case BINDER_WORK_DEAD_BINDER_AND_CLEAR: + case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { + struct binder_ref_death *death; + + death = container_of(w, struct binder_ref_death, work); + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered death notification, %016llx\n", + (u64)death->cookie); + kfree(death); + binder_stats_deleted(BINDER_STAT_DEATH); + } break; + default: + pr_err("unexpected work type, %d, not freed\n", + w->type); + break; + } + } + +} + +static struct binder_thread *binder_get_thread(struct binder_proc *proc) +{ + struct binder_thread *thread = NULL; + struct rb_node *parent = NULL; + struct rb_node **p = &proc->threads.rb_node; + + while (*p) { + parent = *p; + thread = rb_entry(parent, struct binder_thread, rb_node); + + if (current->pid < thread->pid) + p = &(*p)->rb_left; + else if (current->pid > thread->pid) + p = &(*p)->rb_right; + else + break; + } + if (*p == NULL) { + thread = kzalloc(sizeof(*thread), GFP_KERNEL); + if (thread == NULL) + return NULL; + binder_stats_created(BINDER_STAT_THREAD); + thread->proc = proc; + thread->pid = current->pid; + init_waitqueue_head(&thread->wait); + INIT_LIST_HEAD(&thread->todo); + rb_link_node(&thread->rb_node, parent, p); + rb_insert_color(&thread->rb_node, &proc->threads); + thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; + thread->return_error = BR_OK; + thread->return_error2 = BR_OK; + } + return thread; +} + +static int binder_free_thread(struct binder_proc *proc, + struct binder_thread *thread) +{ + struct binder_transaction *t; + struct binder_transaction *send_reply = NULL; + int active_transactions = 0; + + rb_erase(&thread->rb_node, &proc->threads); + t = thread->transaction_stack; + if (t && t->to_thread == thread) + send_reply = t; + while (t) { + active_transactions++; + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "release %d:%d transaction %d %s, still active\n", + proc->pid, thread->pid, + t->debug_id, + (t->to_thread == thread) ? "in" : "out"); + + if (t->to_thread == thread) { + t->to_proc = NULL; + t->to_thread = NULL; + if (t->buffer) { + t->buffer->transaction = NULL; + t->buffer = NULL; + } + t = t->to_parent; + } else if (t->from == thread) { + t->from = NULL; + t = t->from_parent; + } else + BUG(); + } + if (send_reply) + binder_send_failed_reply(send_reply, BR_DEAD_REPLY); + binder_release_work(&thread->todo); + kfree(thread); + binder_stats_deleted(BINDER_STAT_THREAD); + return active_transactions; +} + +static unsigned int binder_poll(struct file *filp, + struct poll_table_struct *wait) +{ + struct binder_proc *proc = filp->private_data; + struct binder_thread *thread = NULL; + int wait_for_proc_work; + + binder_lock(__func__); + + thread = binder_get_thread(proc); + + wait_for_proc_work = thread->transaction_stack == NULL && + list_empty(&thread->todo) && thread->return_error == BR_OK; + + binder_unlock(__func__); + + if (wait_for_proc_work) { + if (binder_has_proc_work(proc, thread)) + return POLLIN; + poll_wait(filp, &proc->wait, wait); + if (binder_has_proc_work(proc, thread)) + return POLLIN; + } else { + if (binder_has_thread_work(thread)) + return POLLIN; + poll_wait(filp, &thread->wait, wait); + if (binder_has_thread_work(thread)) + return POLLIN; + } + return 0; +} + +static int binder_ioctl_write_read(struct file *filp, + unsigned int cmd, unsigned long arg, + struct binder_thread *thread) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + unsigned int size = _IOC_SIZE(cmd); + void __user *ubuf = (void __user *)arg; + struct binder_write_read bwr; + + if (size != sizeof(struct binder_write_read)) { + ret = -EINVAL; + goto out; + } + if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d write %lld at %016llx, read %lld at %016llx\n", + proc->pid, thread->pid, + (u64)bwr.write_size, (u64)bwr.write_buffer, + (u64)bwr.read_size, (u64)bwr.read_buffer); + + if (bwr.write_size > 0) { + ret = binder_thread_write(proc, thread, + bwr.write_buffer, + bwr.write_size, + &bwr.write_consumed); + trace_binder_write_done(ret); + if (ret < 0) { + bwr.read_consumed = 0; + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + if (bwr.read_size > 0) { + ret = binder_thread_read(proc, thread, bwr.read_buffer, + bwr.read_size, + &bwr.read_consumed, + filp->f_flags & O_NONBLOCK); + trace_binder_read_done(ret); + if (!list_empty(&proc->todo)) + wake_up_interruptible(&proc->wait); + if (ret < 0) { + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) + ret = -EFAULT; + goto out; + } + } + binder_debug(BINDER_DEBUG_READ_WRITE, + "%d:%d wrote %lld of %lld, read return %lld of %lld\n", + proc->pid, thread->pid, + (u64)bwr.write_consumed, (u64)bwr.write_size, + (u64)bwr.read_consumed, (u64)bwr.read_size); + if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { + ret = -EFAULT; + goto out; + } +out: + return ret; +} + +static int binder_ioctl_set_ctx_mgr(struct file *filp) +{ + int ret = 0; + struct binder_proc *proc = filp->private_data; + kuid_t curr_euid = current_euid(); + + if (binder_context_mgr_node != NULL) { + pr_err("BINDER_SET_CONTEXT_MGR already set\n"); + ret = -EBUSY; + goto out; + } + if (uid_valid(binder_context_mgr_uid)) { + if (!uid_eq(binder_context_mgr_uid, curr_euid)) { + pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", + from_kuid(&init_user_ns, curr_euid), + from_kuid(&init_user_ns, + binder_context_mgr_uid)); + ret = -EPERM; + goto out; + } + } else { + binder_context_mgr_uid = curr_euid; + } + binder_context_mgr_node = binder_new_node(proc, 0, 0); + if (binder_context_mgr_node == NULL) { + ret = -ENOMEM; + goto out; + } + binder_context_mgr_node->local_weak_refs++; + binder_context_mgr_node->local_strong_refs++; + binder_context_mgr_node->has_strong_ref = 1; + binder_context_mgr_node->has_weak_ref = 1; +out: + return ret; +} + +static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + int ret; + struct binder_proc *proc = filp->private_data; + struct binder_thread *thread; + unsigned int size = _IOC_SIZE(cmd); + void __user *ubuf = (void __user *)arg; + + /*pr_info("binder_ioctl: %d:%d %x %lx\n", + proc->pid, current->pid, cmd, arg);*/ + + trace_binder_ioctl(cmd, arg); + + ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); + if (ret) + goto err_unlocked; + + binder_lock(__func__); + thread = binder_get_thread(proc); + if (thread == NULL) { + ret = -ENOMEM; + goto err; + } + + switch (cmd) { + case BINDER_WRITE_READ: + ret = binder_ioctl_write_read(filp, cmd, arg, thread); + if (ret) + goto err; + break; + case BINDER_SET_MAX_THREADS: + if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { + ret = -EINVAL; + goto err; + } + break; + case BINDER_SET_CONTEXT_MGR: + ret = binder_ioctl_set_ctx_mgr(filp); + if (ret) + goto err; + break; + case BINDER_THREAD_EXIT: + binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", + proc->pid, thread->pid); + binder_free_thread(proc, thread); + thread = NULL; + break; + case BINDER_VERSION: { + struct binder_version __user *ver = ubuf; + + if (size != sizeof(struct binder_version)) { + ret = -EINVAL; + goto err; + } + if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, + &ver->protocol_version)) { + ret = -EINVAL; + goto err; + } + break; + } + default: + ret = -EINVAL; + goto err; + } + ret = 0; +err: + if (thread) + thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; + binder_unlock(__func__); + wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); + if (ret && ret != -ERESTARTSYS) + pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); +err_unlocked: + trace_binder_ioctl_done(ret); + return ret; +} + +static void binder_vma_open(struct vm_area_struct *vma) +{ + struct binder_proc *proc = vma->vm_private_data; + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", + proc->pid, vma->vm_start, vma->vm_end, + (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, + (unsigned long)pgprot_val(vma->vm_page_prot)); +} + +static void binder_vma_close(struct vm_area_struct *vma) +{ + struct binder_proc *proc = vma->vm_private_data; + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", + proc->pid, vma->vm_start, vma->vm_end, + (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, + (unsigned long)pgprot_val(vma->vm_page_prot)); + proc->vma = NULL; + proc->vma_vm_mm = NULL; + binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); +} + +static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct binder_vm_ops = { + .open = binder_vma_open, + .close = binder_vma_close, + .fault = binder_vm_fault, +}; + +static int binder_mmap(struct file *filp, struct vm_area_struct *vma) +{ + int ret; + struct vm_struct *area; + struct binder_proc *proc = filp->private_data; + const char *failure_string; + struct binder_buffer *buffer; + + if (proc->tsk != current) + return -EINVAL; + + if ((vma->vm_end - vma->vm_start) > SZ_4M) + vma->vm_end = vma->vm_start + SZ_4M; + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", + proc->pid, vma->vm_start, vma->vm_end, + (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, + (unsigned long)pgprot_val(vma->vm_page_prot)); + + if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { + ret = -EPERM; + failure_string = "bad vm_flags"; + goto err_bad_arg; + } + vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; + + mutex_lock(&binder_mmap_lock); + if (proc->buffer) { + ret = -EBUSY; + failure_string = "already mapped"; + goto err_already_mapped; + } + + area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); + if (area == NULL) { + ret = -ENOMEM; + failure_string = "get_vm_area"; + goto err_get_vm_area_failed; + } + proc->buffer = area->addr; + proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; + mutex_unlock(&binder_mmap_lock); + +#ifdef CONFIG_CPU_CACHE_VIPT + if (cache_is_vipt_aliasing()) { + while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) { + pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer); + vma->vm_start += PAGE_SIZE; + } + } +#endif + proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); + if (proc->pages == NULL) { + ret = -ENOMEM; + failure_string = "alloc page array"; + goto err_alloc_pages_failed; + } + proc->buffer_size = vma->vm_end - vma->vm_start; + + vma->vm_ops = &binder_vm_ops; + vma->vm_private_data = proc; + + if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) { + ret = -ENOMEM; + failure_string = "alloc small buf"; + goto err_alloc_small_buf_failed; + } + buffer = proc->buffer; + INIT_LIST_HEAD(&proc->buffers); + list_add(&buffer->entry, &proc->buffers); + buffer->free = 1; + binder_insert_free_buffer(proc, buffer); + proc->free_async_space = proc->buffer_size / 2; + barrier(); + proc->files = get_files_struct(current); + proc->vma = vma; + proc->vma_vm_mm = vma->vm_mm; + + /*pr_info("binder_mmap: %d %lx-%lx maps %p\n", + proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/ + return 0; + +err_alloc_small_buf_failed: + kfree(proc->pages); + proc->pages = NULL; +err_alloc_pages_failed: + mutex_lock(&binder_mmap_lock); + vfree(proc->buffer); + proc->buffer = NULL; +err_get_vm_area_failed: +err_already_mapped: + mutex_unlock(&binder_mmap_lock); +err_bad_arg: + pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", + proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); + return ret; +} + +static int binder_open(struct inode *nodp, struct file *filp) +{ + struct binder_proc *proc; + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n", + current->group_leader->pid, current->pid); + + proc = kzalloc(sizeof(*proc), GFP_KERNEL); + if (proc == NULL) + return -ENOMEM; + get_task_struct(current); + proc->tsk = current; + INIT_LIST_HEAD(&proc->todo); + init_waitqueue_head(&proc->wait); + proc->default_priority = task_nice(current); + + binder_lock(__func__); + + binder_stats_created(BINDER_STAT_PROC); + hlist_add_head(&proc->proc_node, &binder_procs); + proc->pid = current->group_leader->pid; + INIT_LIST_HEAD(&proc->delivered_death); + filp->private_data = proc; + + binder_unlock(__func__); + + if (binder_debugfs_dir_entry_proc) { + char strbuf[11]; + + snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); + proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO, + binder_debugfs_dir_entry_proc, proc, &binder_proc_fops); + } + + return 0; +} + +static int binder_flush(struct file *filp, fl_owner_t id) +{ + struct binder_proc *proc = filp->private_data; + + binder_defer_work(proc, BINDER_DEFERRED_FLUSH); + + return 0; +} + +static void binder_deferred_flush(struct binder_proc *proc) +{ + struct rb_node *n; + int wake_count = 0; + + for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { + struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); + + thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; + if (thread->looper & BINDER_LOOPER_STATE_WAITING) { + wake_up_interruptible(&thread->wait); + wake_count++; + } + } + wake_up_interruptible_all(&proc->wait); + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "binder_flush: %d woke %d threads\n", proc->pid, + wake_count); +} + +static int binder_release(struct inode *nodp, struct file *filp) +{ + struct binder_proc *proc = filp->private_data; + + debugfs_remove(proc->debugfs_entry); + binder_defer_work(proc, BINDER_DEFERRED_RELEASE); + + return 0; +} + +static int binder_node_release(struct binder_node *node, int refs) +{ + struct binder_ref *ref; + int death = 0; + + list_del_init(&node->work.entry); + binder_release_work(&node->async_todo); + + if (hlist_empty(&node->refs)) { + kfree(node); + binder_stats_deleted(BINDER_STAT_NODE); + + return refs; + } + + node->proc = NULL; + node->local_strong_refs = 0; + node->local_weak_refs = 0; + hlist_add_head(&node->dead_node, &binder_dead_nodes); + + hlist_for_each_entry(ref, &node->refs, node_entry) { + refs++; + + if (!ref->death) + continue; + + death++; + + if (list_empty(&ref->death->work.entry)) { + ref->death->work.type = BINDER_WORK_DEAD_BINDER; + list_add_tail(&ref->death->work.entry, + &ref->proc->todo); + wake_up_interruptible(&ref->proc->wait); + } else + BUG(); + } + + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "node %d now dead, refs %d, death %d\n", + node->debug_id, refs, death); + + return refs; +} + +static void binder_deferred_release(struct binder_proc *proc) +{ + struct binder_transaction *t; + struct rb_node *n; + int threads, nodes, incoming_refs, outgoing_refs, buffers, + active_transactions, page_count; + + BUG_ON(proc->vma); + BUG_ON(proc->files); + + hlist_del(&proc->proc_node); + + if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) { + binder_debug(BINDER_DEBUG_DEAD_BINDER, + "%s: %d context_mgr_node gone\n", + __func__, proc->pid); + binder_context_mgr_node = NULL; + } + + threads = 0; + active_transactions = 0; + while ((n = rb_first(&proc->threads))) { + struct binder_thread *thread; + + thread = rb_entry(n, struct binder_thread, rb_node); + threads++; + active_transactions += binder_free_thread(proc, thread); + } + + nodes = 0; + incoming_refs = 0; + while ((n = rb_first(&proc->nodes))) { + struct binder_node *node; + + node = rb_entry(n, struct binder_node, rb_node); + nodes++; + rb_erase(&node->rb_node, &proc->nodes); + incoming_refs = binder_node_release(node, incoming_refs); + } + + outgoing_refs = 0; + while ((n = rb_first(&proc->refs_by_desc))) { + struct binder_ref *ref; + + ref = rb_entry(n, struct binder_ref, rb_node_desc); + outgoing_refs++; + binder_delete_ref(ref); + } + + binder_release_work(&proc->todo); + binder_release_work(&proc->delivered_death); + + buffers = 0; + while ((n = rb_first(&proc->allocated_buffers))) { + struct binder_buffer *buffer; + + buffer = rb_entry(n, struct binder_buffer, rb_node); + + t = buffer->transaction; + if (t) { + t->buffer = NULL; + buffer->transaction = NULL; + pr_err("release proc %d, transaction %d, not freed\n", + proc->pid, t->debug_id); + /*BUG();*/ + } + + binder_free_buf(proc, buffer); + buffers++; + } + + binder_stats_deleted(BINDER_STAT_PROC); + + page_count = 0; + if (proc->pages) { + int i; + + for (i = 0; i < proc->buffer_size / PAGE_SIZE; i++) { + void *page_addr; + + if (!proc->pages[i]) + continue; + + page_addr = proc->buffer + i * PAGE_SIZE; + binder_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%s: %d: page %d at %p not freed\n", + __func__, proc->pid, i, page_addr); + unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); + __free_page(proc->pages[i]); + page_count++; + } + kfree(proc->pages); + vfree(proc->buffer); + } + + put_task_struct(proc->tsk); + + binder_debug(BINDER_DEBUG_OPEN_CLOSE, + "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n", + __func__, proc->pid, threads, nodes, incoming_refs, + outgoing_refs, active_transactions, buffers, page_count); + + kfree(proc); +} + +static void binder_deferred_func(struct work_struct *work) +{ + struct binder_proc *proc; + struct files_struct *files; + + int defer; + + do { + binder_lock(__func__); + mutex_lock(&binder_deferred_lock); + if (!hlist_empty(&binder_deferred_list)) { + proc = hlist_entry(binder_deferred_list.first, + struct binder_proc, deferred_work_node); + hlist_del_init(&proc->deferred_work_node); + defer = proc->deferred_work; + proc->deferred_work = 0; + } else { + proc = NULL; + defer = 0; + } + mutex_unlock(&binder_deferred_lock); + + files = NULL; + if (defer & BINDER_DEFERRED_PUT_FILES) { + files = proc->files; + if (files) + proc->files = NULL; + } + + if (defer & BINDER_DEFERRED_FLUSH) + binder_deferred_flush(proc); + + if (defer & BINDER_DEFERRED_RELEASE) + binder_deferred_release(proc); /* frees proc */ + + binder_unlock(__func__); + if (files) + put_files_struct(files); + } while (proc); +} +static DECLARE_WORK(binder_deferred_work, binder_deferred_func); + +static void +binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) +{ + mutex_lock(&binder_deferred_lock); + proc->deferred_work |= defer; + if (hlist_unhashed(&proc->deferred_work_node)) { + hlist_add_head(&proc->deferred_work_node, + &binder_deferred_list); + queue_work(binder_deferred_workqueue, &binder_deferred_work); + } + mutex_unlock(&binder_deferred_lock); +} + +static void print_binder_transaction(struct seq_file *m, const char *prefix, + struct binder_transaction *t) +{ + seq_printf(m, + "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", + prefix, t->debug_id, t, + t->from ? t->from->proc->pid : 0, + t->from ? t->from->pid : 0, + t->to_proc ? t->to_proc->pid : 0, + t->to_thread ? t->to_thread->pid : 0, + t->code, t->flags, t->priority, t->need_reply); + if (t->buffer == NULL) { + seq_puts(m, " buffer free\n"); + return; + } + if (t->buffer->target_node) + seq_printf(m, " node %d", + t->buffer->target_node->debug_id); + seq_printf(m, " size %zd:%zd data %p\n", + t->buffer->data_size, t->buffer->offsets_size, + t->buffer->data); +} + +static void print_binder_buffer(struct seq_file *m, const char *prefix, + struct binder_buffer *buffer) +{ + seq_printf(m, "%s %d: %p size %zd:%zd %s\n", + prefix, buffer->debug_id, buffer->data, + buffer->data_size, buffer->offsets_size, + buffer->transaction ? "active" : "delivered"); +} + +static void print_binder_work(struct seq_file *m, const char *prefix, + const char *transaction_prefix, + struct binder_work *w) +{ + struct binder_node *node; + struct binder_transaction *t; + + switch (w->type) { + case BINDER_WORK_TRANSACTION: + t = container_of(w, struct binder_transaction, work); + print_binder_transaction(m, transaction_prefix, t); + break; + case BINDER_WORK_TRANSACTION_COMPLETE: + seq_printf(m, "%stransaction complete\n", prefix); + break; + case BINDER_WORK_NODE: + node = container_of(w, struct binder_node, work); + seq_printf(m, "%snode work %d: u%016llx c%016llx\n", + prefix, node->debug_id, + (u64)node->ptr, (u64)node->cookie); + break; + case BINDER_WORK_DEAD_BINDER: + seq_printf(m, "%shas dead binder\n", prefix); + break; + case BINDER_WORK_DEAD_BINDER_AND_CLEAR: + seq_printf(m, "%shas cleared dead binder\n", prefix); + break; + case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: + seq_printf(m, "%shas cleared death notification\n", prefix); + break; + default: + seq_printf(m, "%sunknown work: type %d\n", prefix, w->type); + break; + } +} + +static void print_binder_thread(struct seq_file *m, + struct binder_thread *thread, + int print_always) +{ + struct binder_transaction *t; + struct binder_work *w; + size_t start_pos = m->count; + size_t header_pos; + + seq_printf(m, " thread %d: l %02x\n", thread->pid, thread->looper); + header_pos = m->count; + t = thread->transaction_stack; + while (t) { + if (t->from == thread) { + print_binder_transaction(m, + " outgoing transaction", t); + t = t->from_parent; + } else if (t->to_thread == thread) { + print_binder_transaction(m, + " incoming transaction", t); + t = t->to_parent; + } else { + print_binder_transaction(m, " bad transaction", t); + t = NULL; + } + } + list_for_each_entry(w, &thread->todo, entry) { + print_binder_work(m, " ", " pending transaction", w); + } + if (!print_always && m->count == header_pos) + m->count = start_pos; +} + +static void print_binder_node(struct seq_file *m, struct binder_node *node) +{ + struct binder_ref *ref; + struct binder_work *w; + int count; + + count = 0; + hlist_for_each_entry(ref, &node->refs, node_entry) + count++; + + seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d", + node->debug_id, (u64)node->ptr, (u64)node->cookie, + node->has_strong_ref, node->has_weak_ref, + node->local_strong_refs, node->local_weak_refs, + node->internal_strong_refs, count); + if (count) { + seq_puts(m, " proc"); + hlist_for_each_entry(ref, &node->refs, node_entry) + seq_printf(m, " %d", ref->proc->pid); + } + seq_puts(m, "\n"); + list_for_each_entry(w, &node->async_todo, entry) + print_binder_work(m, " ", + " pending async transaction", w); +} + +static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) +{ + seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %p\n", + ref->debug_id, ref->desc, ref->node->proc ? "" : "dead ", + ref->node->debug_id, ref->strong, ref->weak, ref->death); +} + +static void print_binder_proc(struct seq_file *m, + struct binder_proc *proc, int print_all) +{ + struct binder_work *w; + struct rb_node *n; + size_t start_pos = m->count; + size_t header_pos; + + seq_printf(m, "proc %d\n", proc->pid); + header_pos = m->count; + + for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) + print_binder_thread(m, rb_entry(n, struct binder_thread, + rb_node), print_all); + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { + struct binder_node *node = rb_entry(n, struct binder_node, + rb_node); + if (print_all || node->has_async_transaction) + print_binder_node(m, node); + } + if (print_all) { + for (n = rb_first(&proc->refs_by_desc); + n != NULL; + n = rb_next(n)) + print_binder_ref(m, rb_entry(n, struct binder_ref, + rb_node_desc)); + } + for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) + print_binder_buffer(m, " buffer", + rb_entry(n, struct binder_buffer, rb_node)); + list_for_each_entry(w, &proc->todo, entry) + print_binder_work(m, " ", " pending transaction", w); + list_for_each_entry(w, &proc->delivered_death, entry) { + seq_puts(m, " has delivered dead binder\n"); + break; + } + if (!print_all && m->count == header_pos) + m->count = start_pos; +} + +static const char * const binder_return_strings[] = { + "BR_ERROR", + "BR_OK", + "BR_TRANSACTION", + "BR_REPLY", + "BR_ACQUIRE_RESULT", + "BR_DEAD_REPLY", + "BR_TRANSACTION_COMPLETE", + "BR_INCREFS", + "BR_ACQUIRE", + "BR_RELEASE", + "BR_DECREFS", + "BR_ATTEMPT_ACQUIRE", + "BR_NOOP", + "BR_SPAWN_LOOPER", + "BR_FINISHED", + "BR_DEAD_BINDER", + "BR_CLEAR_DEATH_NOTIFICATION_DONE", + "BR_FAILED_REPLY" +}; + +static const char * const binder_command_strings[] = { + "BC_TRANSACTION", + "BC_REPLY", + "BC_ACQUIRE_RESULT", + "BC_FREE_BUFFER", + "BC_INCREFS", + "BC_ACQUIRE", + "BC_RELEASE", + "BC_DECREFS", + "BC_INCREFS_DONE", + "BC_ACQUIRE_DONE", + "BC_ATTEMPT_ACQUIRE", + "BC_REGISTER_LOOPER", + "BC_ENTER_LOOPER", + "BC_EXIT_LOOPER", + "BC_REQUEST_DEATH_NOTIFICATION", + "BC_CLEAR_DEATH_NOTIFICATION", + "BC_DEAD_BINDER_DONE" +}; + +static const char * const binder_objstat_strings[] = { + "proc", + "thread", + "node", + "ref", + "death", + "transaction", + "transaction_complete" +}; + +static void print_binder_stats(struct seq_file *m, const char *prefix, + struct binder_stats *stats) +{ + int i; + + BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != + ARRAY_SIZE(binder_command_strings)); + for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { + if (stats->bc[i]) + seq_printf(m, "%s%s: %d\n", prefix, + binder_command_strings[i], stats->bc[i]); + } + + BUILD_BUG_ON(ARRAY_SIZE(stats->br) != + ARRAY_SIZE(binder_return_strings)); + for (i = 0; i < ARRAY_SIZE(stats->br); i++) { + if (stats->br[i]) + seq_printf(m, "%s%s: %d\n", prefix, + binder_return_strings[i], stats->br[i]); + } + + BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != + ARRAY_SIZE(binder_objstat_strings)); + BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != + ARRAY_SIZE(stats->obj_deleted)); + for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { + if (stats->obj_created[i] || stats->obj_deleted[i]) + seq_printf(m, "%s%s: active %d total %d\n", prefix, + binder_objstat_strings[i], + stats->obj_created[i] - stats->obj_deleted[i], + stats->obj_created[i]); + } +} + +static void print_binder_proc_stats(struct seq_file *m, + struct binder_proc *proc) +{ + struct binder_work *w; + struct rb_node *n; + int count, strong, weak; + + seq_printf(m, "proc %d\n", proc->pid); + count = 0; + for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) + count++; + seq_printf(m, " threads: %d\n", count); + seq_printf(m, " requested threads: %d+%d/%d\n" + " ready threads %d\n" + " free async space %zd\n", proc->requested_threads, + proc->requested_threads_started, proc->max_threads, + proc->ready_threads, proc->free_async_space); + count = 0; + for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) + count++; + seq_printf(m, " nodes: %d\n", count); + count = 0; + strong = 0; + weak = 0; + for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { + struct binder_ref *ref = rb_entry(n, struct binder_ref, + rb_node_desc); + count++; + strong += ref->strong; + weak += ref->weak; + } + seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); + + count = 0; + for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) + count++; + seq_printf(m, " buffers: %d\n", count); + + count = 0; + list_for_each_entry(w, &proc->todo, entry) { + switch (w->type) { + case BINDER_WORK_TRANSACTION: + count++; + break; + default: + break; + } + } + seq_printf(m, " pending transactions: %d\n", count); + + print_binder_stats(m, " ", &proc->stats); +} + + +static int binder_state_show(struct seq_file *m, void *unused) +{ + struct binder_proc *proc; + struct binder_node *node; + int do_lock = !binder_debug_no_lock; + + if (do_lock) + binder_lock(__func__); + + seq_puts(m, "binder state:\n"); + + if (!hlist_empty(&binder_dead_nodes)) + seq_puts(m, "dead nodes:\n"); + hlist_for_each_entry(node, &binder_dead_nodes, dead_node) + print_binder_node(m, node); + + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, 1); + if (do_lock) + binder_unlock(__func__); + return 0; +} + +static int binder_stats_show(struct seq_file *m, void *unused) +{ + struct binder_proc *proc; + int do_lock = !binder_debug_no_lock; + + if (do_lock) + binder_lock(__func__); + + seq_puts(m, "binder stats:\n"); + + print_binder_stats(m, "", &binder_stats); + + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc_stats(m, proc); + if (do_lock) + binder_unlock(__func__); + return 0; +} + +static int binder_transactions_show(struct seq_file *m, void *unused) +{ + struct binder_proc *proc; + int do_lock = !binder_debug_no_lock; + + if (do_lock) + binder_lock(__func__); + + seq_puts(m, "binder transactions:\n"); + hlist_for_each_entry(proc, &binder_procs, proc_node) + print_binder_proc(m, proc, 0); + if (do_lock) + binder_unlock(__func__); + return 0; +} + +static int binder_proc_show(struct seq_file *m, void *unused) +{ + struct binder_proc *proc = m->private; + int do_lock = !binder_debug_no_lock; + + if (do_lock) + binder_lock(__func__); + seq_puts(m, "binder proc state:\n"); + print_binder_proc(m, proc, 1); + if (do_lock) + binder_unlock(__func__); + return 0; +} + +static void print_binder_transaction_log_entry(struct seq_file *m, + struct binder_transaction_log_entry *e) +{ + seq_printf(m, + "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n", + e->debug_id, (e->call_type == 2) ? "reply" : + ((e->call_type == 1) ? "async" : "call "), e->from_proc, + e->from_thread, e->to_proc, e->to_thread, e->to_node, + e->target_handle, e->data_size, e->offsets_size); +} + +static int binder_transaction_log_show(struct seq_file *m, void *unused) +{ + struct binder_transaction_log *log = m->private; + int i; + + if (log->full) { + for (i = log->next; i < ARRAY_SIZE(log->entry); i++) + print_binder_transaction_log_entry(m, &log->entry[i]); + } + for (i = 0; i < log->next; i++) + print_binder_transaction_log_entry(m, &log->entry[i]); + return 0; +} + +static const struct file_operations binder_fops = { + .owner = THIS_MODULE, + .poll = binder_poll, + .unlocked_ioctl = binder_ioctl, + .compat_ioctl = binder_ioctl, + .mmap = binder_mmap, + .open = binder_open, + .flush = binder_flush, + .release = binder_release, +}; + +static struct miscdevice binder_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "binder", + .fops = &binder_fops +}; + +BINDER_DEBUG_ENTRY(state); +BINDER_DEBUG_ENTRY(stats); +BINDER_DEBUG_ENTRY(transactions); +BINDER_DEBUG_ENTRY(transaction_log); + +static int __init binder_init(void) +{ + int ret; + + binder_deferred_workqueue = create_singlethread_workqueue("binder"); + if (!binder_deferred_workqueue) + return -ENOMEM; + + binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); + if (binder_debugfs_dir_entry_root) + binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", + binder_debugfs_dir_entry_root); + ret = misc_register(&binder_miscdev); + if (binder_debugfs_dir_entry_root) { + debugfs_create_file("state", + S_IRUGO, + binder_debugfs_dir_entry_root, + NULL, + &binder_state_fops); + debugfs_create_file("stats", + S_IRUGO, + binder_debugfs_dir_entry_root, + NULL, + &binder_stats_fops); + debugfs_create_file("transactions", + S_IRUGO, + binder_debugfs_dir_entry_root, + NULL, + &binder_transactions_fops); + debugfs_create_file("transaction_log", + S_IRUGO, + binder_debugfs_dir_entry_root, + &binder_transaction_log, + &binder_transaction_log_fops); + debugfs_create_file("failed_transaction_log", + S_IRUGO, + binder_debugfs_dir_entry_root, + &binder_transaction_log_failed, + &binder_transaction_log_fops); + } + return ret; +} + +device_initcall(binder_init); + +#define CREATE_TRACE_POINTS +#include "binder_trace.h" + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/android/binder.h b/drivers/android/binder.h new file mode 100644 index 000000000000..5dc6a66b0665 --- /dev/null +++ b/drivers/android/binder.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2008 Google, Inc. + * + * Based on, but no longer compatible with, the original + * OpenBinder.org binder driver interface, which is: + * + * Copyright (c) 2005 Palmsource, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_BINDER_H +#define _LINUX_BINDER_H + +#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT +#define BINDER_IPC_32BIT 1 +#endif + +#include + +#endif /* _LINUX_BINDER_H */ + diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h new file mode 100644 index 000000000000..7f20f3dc8369 --- /dev/null +++ b/drivers/android/binder_trace.h @@ -0,0 +1,329 @@ +/* + * Copyright (C) 2012 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM binder + +#if !defined(_BINDER_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _BINDER_TRACE_H + +#include + +struct binder_buffer; +struct binder_node; +struct binder_proc; +struct binder_ref; +struct binder_thread; +struct binder_transaction; + +TRACE_EVENT(binder_ioctl, + TP_PROTO(unsigned int cmd, unsigned long arg), + TP_ARGS(cmd, arg), + + TP_STRUCT__entry( + __field(unsigned int, cmd) + __field(unsigned long, arg) + ), + TP_fast_assign( + __entry->cmd = cmd; + __entry->arg = arg; + ), + TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg) +); + +DECLARE_EVENT_CLASS(binder_lock_class, + TP_PROTO(const char *tag), + TP_ARGS(tag), + TP_STRUCT__entry( + __field(const char *, tag) + ), + TP_fast_assign( + __entry->tag = tag; + ), + TP_printk("tag=%s", __entry->tag) +); + +#define DEFINE_BINDER_LOCK_EVENT(name) \ +DEFINE_EVENT(binder_lock_class, name, \ + TP_PROTO(const char *func), \ + TP_ARGS(func)) + +DEFINE_BINDER_LOCK_EVENT(binder_lock); +DEFINE_BINDER_LOCK_EVENT(binder_locked); +DEFINE_BINDER_LOCK_EVENT(binder_unlock); + +DECLARE_EVENT_CLASS(binder_function_return_class, + TP_PROTO(int ret), + TP_ARGS(ret), + TP_STRUCT__entry( + __field(int, ret) + ), + TP_fast_assign( + __entry->ret = ret; + ), + TP_printk("ret=%d", __entry->ret) +); + +#define DEFINE_BINDER_FUNCTION_RETURN_EVENT(name) \ +DEFINE_EVENT(binder_function_return_class, name, \ + TP_PROTO(int ret), \ + TP_ARGS(ret)) + +DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); +DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); +DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); + +TRACE_EVENT(binder_wait_for_work, + TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), + TP_ARGS(proc_work, transaction_stack, thread_todo), + + TP_STRUCT__entry( + __field(bool, proc_work) + __field(bool, transaction_stack) + __field(bool, thread_todo) + ), + TP_fast_assign( + __entry->proc_work = proc_work; + __entry->transaction_stack = transaction_stack; + __entry->thread_todo = thread_todo; + ), + TP_printk("proc_work=%d transaction_stack=%d thread_todo=%d", + __entry->proc_work, __entry->transaction_stack, + __entry->thread_todo) +); + +TRACE_EVENT(binder_transaction, + TP_PROTO(bool reply, struct binder_transaction *t, + struct binder_node *target_node), + TP_ARGS(reply, t, target_node), + TP_STRUCT__entry( + __field(int, debug_id) + __field(int, target_node) + __field(int, to_proc) + __field(int, to_thread) + __field(int, reply) + __field(unsigned int, code) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->debug_id = t->debug_id; + __entry->target_node = target_node ? target_node->debug_id : 0; + __entry->to_proc = t->to_proc->pid; + __entry->to_thread = t->to_thread ? t->to_thread->pid : 0; + __entry->reply = reply; + __entry->code = t->code; + __entry->flags = t->flags; + ), + TP_printk("transaction=%d dest_node=%d dest_proc=%d dest_thread=%d reply=%d flags=0x%x code=0x%x", + __entry->debug_id, __entry->target_node, + __entry->to_proc, __entry->to_thread, + __entry->reply, __entry->flags, __entry->code) +); + +TRACE_EVENT(binder_transaction_received, + TP_PROTO(struct binder_transaction *t), + TP_ARGS(t), + + TP_STRUCT__entry( + __field(int, debug_id) + ), + TP_fast_assign( + __entry->debug_id = t->debug_id; + ), + TP_printk("transaction=%d", __entry->debug_id) +); + +TRACE_EVENT(binder_transaction_node_to_ref, + TP_PROTO(struct binder_transaction *t, struct binder_node *node, + struct binder_ref *ref), + TP_ARGS(t, node, ref), + + TP_STRUCT__entry( + __field(int, debug_id) + __field(int, node_debug_id) + __field(binder_uintptr_t, node_ptr) + __field(int, ref_debug_id) + __field(uint32_t, ref_desc) + ), + TP_fast_assign( + __entry->debug_id = t->debug_id; + __entry->node_debug_id = node->debug_id; + __entry->node_ptr = node->ptr; + __entry->ref_debug_id = ref->debug_id; + __entry->ref_desc = ref->desc; + ), + TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d", + __entry->debug_id, __entry->node_debug_id, + (u64)__entry->node_ptr, + __entry->ref_debug_id, __entry->ref_desc) +); + +TRACE_EVENT(binder_transaction_ref_to_node, + TP_PROTO(struct binder_transaction *t, struct binder_ref *ref), + TP_ARGS(t, ref), + + TP_STRUCT__entry( + __field(int, debug_id) + __field(int, ref_debug_id) + __field(uint32_t, ref_desc) + __field(int, node_debug_id) + __field(binder_uintptr_t, node_ptr) + ), + TP_fast_assign( + __entry->debug_id = t->debug_id; + __entry->ref_debug_id = ref->debug_id; + __entry->ref_desc = ref->desc; + __entry->node_debug_id = ref->node->debug_id; + __entry->node_ptr = ref->node->ptr; + ), + TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx", + __entry->debug_id, __entry->node_debug_id, + __entry->ref_debug_id, __entry->ref_desc, + (u64)__entry->node_ptr) +); + +TRACE_EVENT(binder_transaction_ref_to_ref, + TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref, + struct binder_ref *dest_ref), + TP_ARGS(t, src_ref, dest_ref), + + TP_STRUCT__entry( + __field(int, debug_id) + __field(int, node_debug_id) + __field(int, src_ref_debug_id) + __field(uint32_t, src_ref_desc) + __field(int, dest_ref_debug_id) + __field(uint32_t, dest_ref_desc) + ), + TP_fast_assign( + __entry->debug_id = t->debug_id; + __entry->node_debug_id = src_ref->node->debug_id; + __entry->src_ref_debug_id = src_ref->debug_id; + __entry->src_ref_desc = src_ref->desc; + __entry->dest_ref_debug_id = dest_ref->debug_id; + __entry->dest_ref_desc = dest_ref->desc; + ), + TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ref=%d dest_desc=%d", + __entry->debug_id, __entry->node_debug_id, + __entry->src_ref_debug_id, __entry->src_ref_desc, + __entry->dest_ref_debug_id, __entry->dest_ref_desc) +); + +TRACE_EVENT(binder_transaction_fd, + TP_PROTO(struct binder_transaction *t, int src_fd, int dest_fd), + TP_ARGS(t, src_fd, dest_fd), + + TP_STRUCT__entry( + __field(int, debug_id) + __field(int, src_fd) + __field(int, dest_fd) + ), + TP_fast_assign( + __entry->debug_id = t->debug_id; + __entry->src_fd = src_fd; + __entry->dest_fd = dest_fd; + ), + TP_printk("transaction=%d src_fd=%d ==> dest_fd=%d", + __entry->debug_id, __entry->src_fd, __entry->dest_fd) +); + +DECLARE_EVENT_CLASS(binder_buffer_class, + TP_PROTO(struct binder_buffer *buf), + TP_ARGS(buf), + TP_STRUCT__entry( + __field(int, debug_id) + __field(size_t, data_size) + __field(size_t, offsets_size) + ), + TP_fast_assign( + __entry->debug_id = buf->debug_id; + __entry->data_size = buf->data_size; + __entry->offsets_size = buf->offsets_size; + ), + TP_printk("transaction=%d data_size=%zd offsets_size=%zd", + __entry->debug_id, __entry->data_size, __entry->offsets_size) +); + +DEFINE_EVENT(binder_buffer_class, binder_transaction_alloc_buf, + TP_PROTO(struct binder_buffer *buffer), + TP_ARGS(buffer)); + +DEFINE_EVENT(binder_buffer_class, binder_transaction_buffer_release, + TP_PROTO(struct binder_buffer *buffer), + TP_ARGS(buffer)); + +DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, + TP_PROTO(struct binder_buffer *buffer), + TP_ARGS(buffer)); + +TRACE_EVENT(binder_update_page_range, + TP_PROTO(struct binder_proc *proc, bool allocate, + void *start, void *end), + TP_ARGS(proc, allocate, start, end), + TP_STRUCT__entry( + __field(int, proc) + __field(bool, allocate) + __field(size_t, offset) + __field(size_t, size) + ), + TP_fast_assign( + __entry->proc = proc->pid; + __entry->allocate = allocate; + __entry->offset = start - proc->buffer; + __entry->size = end - start; + ), + TP_printk("proc=%d allocate=%d offset=%zu size=%zu", + __entry->proc, __entry->allocate, + __entry->offset, __entry->size) +); + +TRACE_EVENT(binder_command, + TP_PROTO(uint32_t cmd), + TP_ARGS(cmd), + TP_STRUCT__entry( + __field(uint32_t, cmd) + ), + TP_fast_assign( + __entry->cmd = cmd; + ), + TP_printk("cmd=0x%x %s", + __entry->cmd, + _IOC_NR(__entry->cmd) < ARRAY_SIZE(binder_command_strings) ? + binder_command_strings[_IOC_NR(__entry->cmd)] : + "unknown") +); + +TRACE_EVENT(binder_return, + TP_PROTO(uint32_t cmd), + TP_ARGS(cmd), + TP_STRUCT__entry( + __field(uint32_t, cmd) + ), + TP_fast_assign( + __entry->cmd = cmd; + ), + TP_printk("cmd=0x%x %s", + __entry->cmd, + _IOC_NR(__entry->cmd) < ARRAY_SIZE(binder_return_strings) ? + binder_return_strings[_IOC_NR(__entry->cmd)] : + "unknown") +); + +#endif /* _BINDER_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE binder_trace +#include diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig index 7a0e28852965..7e012f37792b 100644 --- a/drivers/staging/android/Kconfig +++ b/drivers/staging/android/Kconfig @@ -1,37 +1,7 @@ menu "Android" -config ANDROID - bool "Android Drivers" - ---help--- - Enable support for various drivers needed on the Android platform - if ANDROID -config ANDROID_BINDER_IPC - bool "Android Binder IPC Driver" - depends on MMU - default n - ---help--- - Binder is used in Android for both communication between processes, - and remote method invocation. - - This means one Android process can call a method/routine in another - Android process, using Binder to identify, invoke and pass arguments - between said processes. - -config ANDROID_BINDER_IPC_32BIT - bool - depends on !64BIT && ANDROID_BINDER_IPC - default y - ---help--- - The Binder API has been changed to support both 32 and 64bit - applications in a mixed environment. - - Enable this to support an old 32-bit Android user-space (v4.4 and - earlier). - - Note that enabling this will break newer Android user-space. - config ASHMEM bool "Enable the Anonymous Shared Memory Subsystem" default n diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile index 517ad5ffa429..479b2b86f8c8 100644 --- a/drivers/staging/android/Makefile +++ b/drivers/staging/android/Makefile @@ -2,7 +2,6 @@ ccflags-y += -I$(src) # needed for trace events obj-y += ion/ -obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o obj-$(CONFIG_ASHMEM) += ashmem.o obj-$(CONFIG_ANDROID_LOGGER) += logger.o obj-$(CONFIG_ANDROID_TIMED_OUTPUT) += timed_output.o diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c deleted file mode 100644 index c69c40d69d5c..000000000000 --- a/drivers/staging/android/binder.c +++ /dev/null @@ -1,3673 +0,0 @@ -/* binder.c - * - * Android IPC Subsystem - * - * Copyright (C) 2007-2008 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "binder.h" -#include "binder_trace.h" - -static DEFINE_MUTEX(binder_main_lock); -static DEFINE_MUTEX(binder_deferred_lock); -static DEFINE_MUTEX(binder_mmap_lock); - -static HLIST_HEAD(binder_procs); -static HLIST_HEAD(binder_deferred_list); -static HLIST_HEAD(binder_dead_nodes); - -static struct dentry *binder_debugfs_dir_entry_root; -static struct dentry *binder_debugfs_dir_entry_proc; -static struct binder_node *binder_context_mgr_node; -static kuid_t binder_context_mgr_uid = INVALID_UID; -static int binder_last_id; -static struct workqueue_struct *binder_deferred_workqueue; - -#define BINDER_DEBUG_ENTRY(name) \ -static int binder_##name##_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, binder_##name##_show, inode->i_private); \ -} \ -\ -static const struct file_operations binder_##name##_fops = { \ - .owner = THIS_MODULE, \ - .open = binder_##name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -} - -static int binder_proc_show(struct seq_file *m, void *unused); -BINDER_DEBUG_ENTRY(proc); - -/* This is only defined in include/asm-arm/sizes.h */ -#ifndef SZ_1K -#define SZ_1K 0x400 -#endif - -#ifndef SZ_4M -#define SZ_4M 0x400000 -#endif - -#define FORBIDDEN_MMAP_FLAGS (VM_WRITE) - -#define BINDER_SMALL_BUF_SIZE (PAGE_SIZE * 64) - -enum { - BINDER_DEBUG_USER_ERROR = 1U << 0, - BINDER_DEBUG_FAILED_TRANSACTION = 1U << 1, - BINDER_DEBUG_DEAD_TRANSACTION = 1U << 2, - BINDER_DEBUG_OPEN_CLOSE = 1U << 3, - BINDER_DEBUG_DEAD_BINDER = 1U << 4, - BINDER_DEBUG_DEATH_NOTIFICATION = 1U << 5, - BINDER_DEBUG_READ_WRITE = 1U << 6, - BINDER_DEBUG_USER_REFS = 1U << 7, - BINDER_DEBUG_THREADS = 1U << 8, - BINDER_DEBUG_TRANSACTION = 1U << 9, - BINDER_DEBUG_TRANSACTION_COMPLETE = 1U << 10, - BINDER_DEBUG_FREE_BUFFER = 1U << 11, - BINDER_DEBUG_INTERNAL_REFS = 1U << 12, - BINDER_DEBUG_BUFFER_ALLOC = 1U << 13, - BINDER_DEBUG_PRIORITY_CAP = 1U << 14, - BINDER_DEBUG_BUFFER_ALLOC_ASYNC = 1U << 15, -}; -static uint32_t binder_debug_mask = BINDER_DEBUG_USER_ERROR | - BINDER_DEBUG_FAILED_TRANSACTION | BINDER_DEBUG_DEAD_TRANSACTION; -module_param_named(debug_mask, binder_debug_mask, uint, S_IWUSR | S_IRUGO); - -static bool binder_debug_no_lock; -module_param_named(proc_no_lock, binder_debug_no_lock, bool, S_IWUSR | S_IRUGO); - -static DECLARE_WAIT_QUEUE_HEAD(binder_user_error_wait); -static int binder_stop_on_user_error; - -static int binder_set_stop_on_user_error(const char *val, - struct kernel_param *kp) -{ - int ret; - - ret = param_set_int(val, kp); - if (binder_stop_on_user_error < 2) - wake_up(&binder_user_error_wait); - return ret; -} -module_param_call(stop_on_user_error, binder_set_stop_on_user_error, - param_get_int, &binder_stop_on_user_error, S_IWUSR | S_IRUGO); - -#define binder_debug(mask, x...) \ - do { \ - if (binder_debug_mask & mask) \ - pr_info(x); \ - } while (0) - -#define binder_user_error(x...) \ - do { \ - if (binder_debug_mask & BINDER_DEBUG_USER_ERROR) \ - pr_info(x); \ - if (binder_stop_on_user_error) \ - binder_stop_on_user_error = 2; \ - } while (0) - -enum binder_stat_types { - BINDER_STAT_PROC, - BINDER_STAT_THREAD, - BINDER_STAT_NODE, - BINDER_STAT_REF, - BINDER_STAT_DEATH, - BINDER_STAT_TRANSACTION, - BINDER_STAT_TRANSACTION_COMPLETE, - BINDER_STAT_COUNT -}; - -struct binder_stats { - int br[_IOC_NR(BR_FAILED_REPLY) + 1]; - int bc[_IOC_NR(BC_DEAD_BINDER_DONE) + 1]; - int obj_created[BINDER_STAT_COUNT]; - int obj_deleted[BINDER_STAT_COUNT]; -}; - -static struct binder_stats binder_stats; - -static inline void binder_stats_deleted(enum binder_stat_types type) -{ - binder_stats.obj_deleted[type]++; -} - -static inline void binder_stats_created(enum binder_stat_types type) -{ - binder_stats.obj_created[type]++; -} - -struct binder_transaction_log_entry { - int debug_id; - int call_type; - int from_proc; - int from_thread; - int target_handle; - int to_proc; - int to_thread; - int to_node; - int data_size; - int offsets_size; -}; -struct binder_transaction_log { - int next; - int full; - struct binder_transaction_log_entry entry[32]; -}; -static struct binder_transaction_log binder_transaction_log; -static struct binder_transaction_log binder_transaction_log_failed; - -static struct binder_transaction_log_entry *binder_transaction_log_add( - struct binder_transaction_log *log) -{ - struct binder_transaction_log_entry *e; - - e = &log->entry[log->next]; - memset(e, 0, sizeof(*e)); - log->next++; - if (log->next == ARRAY_SIZE(log->entry)) { - log->next = 0; - log->full = 1; - } - return e; -} - -struct binder_work { - struct list_head entry; - enum { - BINDER_WORK_TRANSACTION = 1, - BINDER_WORK_TRANSACTION_COMPLETE, - BINDER_WORK_NODE, - BINDER_WORK_DEAD_BINDER, - BINDER_WORK_DEAD_BINDER_AND_CLEAR, - BINDER_WORK_CLEAR_DEATH_NOTIFICATION, - } type; -}; - -struct binder_node { - int debug_id; - struct binder_work work; - union { - struct rb_node rb_node; - struct hlist_node dead_node; - }; - struct binder_proc *proc; - struct hlist_head refs; - int internal_strong_refs; - int local_weak_refs; - int local_strong_refs; - binder_uintptr_t ptr; - binder_uintptr_t cookie; - unsigned has_strong_ref:1; - unsigned pending_strong_ref:1; - unsigned has_weak_ref:1; - unsigned pending_weak_ref:1; - unsigned has_async_transaction:1; - unsigned accept_fds:1; - unsigned min_priority:8; - struct list_head async_todo; -}; - -struct binder_ref_death { - struct binder_work work; - binder_uintptr_t cookie; -}; - -struct binder_ref { - /* Lookups needed: */ - /* node + proc => ref (transaction) */ - /* desc + proc => ref (transaction, inc/dec ref) */ - /* node => refs + procs (proc exit) */ - int debug_id; - struct rb_node rb_node_desc; - struct rb_node rb_node_node; - struct hlist_node node_entry; - struct binder_proc *proc; - struct binder_node *node; - uint32_t desc; - int strong; - int weak; - struct binder_ref_death *death; -}; - -struct binder_buffer { - struct list_head entry; /* free and allocated entries by address */ - struct rb_node rb_node; /* free entry by size or allocated entry */ - /* by address */ - unsigned free:1; - unsigned allow_user_free:1; - unsigned async_transaction:1; - unsigned debug_id:29; - - struct binder_transaction *transaction; - - struct binder_node *target_node; - size_t data_size; - size_t offsets_size; - uint8_t data[0]; -}; - -enum binder_deferred_state { - BINDER_DEFERRED_PUT_FILES = 0x01, - BINDER_DEFERRED_FLUSH = 0x02, - BINDER_DEFERRED_RELEASE = 0x04, -}; - -struct binder_proc { - struct hlist_node proc_node; - struct rb_root threads; - struct rb_root nodes; - struct rb_root refs_by_desc; - struct rb_root refs_by_node; - int pid; - struct vm_area_struct *vma; - struct mm_struct *vma_vm_mm; - struct task_struct *tsk; - struct files_struct *files; - struct hlist_node deferred_work_node; - int deferred_work; - void *buffer; - ptrdiff_t user_buffer_offset; - - struct list_head buffers; - struct rb_root free_buffers; - struct rb_root allocated_buffers; - size_t free_async_space; - - struct page **pages; - size_t buffer_size; - uint32_t buffer_free; - struct list_head todo; - wait_queue_head_t wait; - struct binder_stats stats; - struct list_head delivered_death; - int max_threads; - int requested_threads; - int requested_threads_started; - int ready_threads; - long default_priority; - struct dentry *debugfs_entry; -}; - -enum { - BINDER_LOOPER_STATE_REGISTERED = 0x01, - BINDER_LOOPER_STATE_ENTERED = 0x02, - BINDER_LOOPER_STATE_EXITED = 0x04, - BINDER_LOOPER_STATE_INVALID = 0x08, - BINDER_LOOPER_STATE_WAITING = 0x10, - BINDER_LOOPER_STATE_NEED_RETURN = 0x20 -}; - -struct binder_thread { - struct binder_proc *proc; - struct rb_node rb_node; - int pid; - int looper; - struct binder_transaction *transaction_stack; - struct list_head todo; - uint32_t return_error; /* Write failed, return error code in read buf */ - uint32_t return_error2; /* Write failed, return error code in read */ - /* buffer. Used when sending a reply to a dead process that */ - /* we are also waiting on */ - wait_queue_head_t wait; - struct binder_stats stats; -}; - -struct binder_transaction { - int debug_id; - struct binder_work work; - struct binder_thread *from; - struct binder_transaction *from_parent; - struct binder_proc *to_proc; - struct binder_thread *to_thread; - struct binder_transaction *to_parent; - unsigned need_reply:1; - /* unsigned is_dead:1; */ /* not used at the moment */ - - struct binder_buffer *buffer; - unsigned int code; - unsigned int flags; - long priority; - long saved_priority; - kuid_t sender_euid; -}; - -static void -binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer); - -static int task_get_unused_fd_flags(struct binder_proc *proc, int flags) -{ - struct files_struct *files = proc->files; - unsigned long rlim_cur; - unsigned long irqs; - - if (files == NULL) - return -ESRCH; - - if (!lock_task_sighand(proc->tsk, &irqs)) - return -EMFILE; - - rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE); - unlock_task_sighand(proc->tsk, &irqs); - - return __alloc_fd(files, 0, rlim_cur, flags); -} - -/* - * copied from fd_install - */ -static void task_fd_install( - struct binder_proc *proc, unsigned int fd, struct file *file) -{ - if (proc->files) - __fd_install(proc->files, fd, file); -} - -/* - * copied from sys_close - */ -static long task_close_fd(struct binder_proc *proc, unsigned int fd) -{ - int retval; - - if (proc->files == NULL) - return -ESRCH; - - retval = __close_fd(proc->files, fd); - /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) - retval = -EINTR; - - return retval; -} - -static inline void binder_lock(const char *tag) -{ - trace_binder_lock(tag); - mutex_lock(&binder_main_lock); - trace_binder_locked(tag); -} - -static inline void binder_unlock(const char *tag) -{ - trace_binder_unlock(tag); - mutex_unlock(&binder_main_lock); -} - -static void binder_set_nice(long nice) -{ - long min_nice; - - if (can_nice(current, nice)) { - set_user_nice(current, nice); - return; - } - min_nice = rlimit_to_nice(current->signal->rlim[RLIMIT_NICE].rlim_cur); - binder_debug(BINDER_DEBUG_PRIORITY_CAP, - "%d: nice value %ld not allowed use %ld instead\n", - current->pid, nice, min_nice); - set_user_nice(current, min_nice); - if (min_nice <= MAX_NICE) - return; - binder_user_error("%d RLIMIT_NICE not set\n", current->pid); -} - -static size_t binder_buffer_size(struct binder_proc *proc, - struct binder_buffer *buffer) -{ - if (list_is_last(&buffer->entry, &proc->buffers)) - return proc->buffer + proc->buffer_size - (void *)buffer->data; - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; -} - -static void binder_insert_free_buffer(struct binder_proc *proc, - struct binder_buffer *new_buffer) -{ - struct rb_node **p = &proc->free_buffers.rb_node; - struct rb_node *parent = NULL; - struct binder_buffer *buffer; - size_t buffer_size; - size_t new_buffer_size; - - BUG_ON(!new_buffer->free); - - new_buffer_size = binder_buffer_size(proc, new_buffer); - - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: add free buffer, size %zd, at %p\n", - proc->pid, new_buffer_size, new_buffer); - - while (*p) { - parent = *p; - buffer = rb_entry(parent, struct binder_buffer, rb_node); - BUG_ON(!buffer->free); - - buffer_size = binder_buffer_size(proc, buffer); - - if (new_buffer_size < buffer_size) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->free_buffers); -} - -static void binder_insert_allocated_buffer(struct binder_proc *proc, - struct binder_buffer *new_buffer) -{ - struct rb_node **p = &proc->allocated_buffers.rb_node; - struct rb_node *parent = NULL; - struct binder_buffer *buffer; - - BUG_ON(new_buffer->free); - - while (*p) { - parent = *p; - buffer = rb_entry(parent, struct binder_buffer, rb_node); - BUG_ON(buffer->free); - - if (new_buffer < buffer) - p = &parent->rb_left; - else if (new_buffer > buffer) - p = &parent->rb_right; - else - BUG(); - } - rb_link_node(&new_buffer->rb_node, parent, p); - rb_insert_color(&new_buffer->rb_node, &proc->allocated_buffers); -} - -static struct binder_buffer *binder_buffer_lookup(struct binder_proc *proc, - uintptr_t user_ptr) -{ - struct rb_node *n = proc->allocated_buffers.rb_node; - struct binder_buffer *buffer; - struct binder_buffer *kern_ptr; - - kern_ptr = (struct binder_buffer *)(user_ptr - proc->user_buffer_offset - - offsetof(struct binder_buffer, data)); - - while (n) { - buffer = rb_entry(n, struct binder_buffer, rb_node); - BUG_ON(buffer->free); - - if (kern_ptr < buffer) - n = n->rb_left; - else if (kern_ptr > buffer) - n = n->rb_right; - else - return buffer; - } - return NULL; -} - -static int binder_update_page_range(struct binder_proc *proc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) -{ - void *page_addr; - unsigned long user_page_addr; - struct vm_struct tmp_area; - struct page **page; - struct mm_struct *mm; - - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: %s pages %p-%p\n", proc->pid, - allocate ? "allocate" : "free", start, end); - - if (end <= start) - return 0; - - trace_binder_update_page_range(proc, allocate, start, end); - - if (vma) - mm = NULL; - else - mm = get_task_mm(proc->tsk); - - if (mm) { - down_write(&mm->mmap_sem); - vma = proc->vma; - if (vma && mm != proc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - proc->pid); - vma = NULL; - } - } - - if (allocate == 0) - goto free_range; - - if (vma == NULL) { - pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", - proc->pid); - goto err_no_vma; - } - - for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { - int ret; - - page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; - - BUG_ON(*page); - *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); - if (*page == NULL) { - pr_err("%d: binder_alloc_buf failed for page at %p\n", - proc->pid, page_addr); - goto err_alloc_page_failed; - } - tmp_area.addr = page_addr; - tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); - if (ret) { - pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", - proc->pid, page_addr); - goto err_map_kernel_failed; - } - user_page_addr = - (uintptr_t)page_addr + proc->user_buffer_offset; - ret = vm_insert_page(vma, user_page_addr, page[0]); - if (ret) { - pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", - proc->pid, user_page_addr); - goto err_vm_insert_page_failed; - } - /* vm_insert_page does not seem to increment the refcount */ - } - if (mm) { - up_write(&mm->mmap_sem); - mmput(mm); - } - return 0; - -free_range: - for (page_addr = end - PAGE_SIZE; page_addr >= start; - page_addr -= PAGE_SIZE) { - page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; - if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - proc->user_buffer_offset, PAGE_SIZE, NULL); -err_vm_insert_page_failed: - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); -err_map_kernel_failed: - __free_page(*page); - *page = NULL; -err_alloc_page_failed: - ; - } -err_no_vma: - if (mm) { - up_write(&mm->mmap_sem); - mmput(mm); - } - return -ENOMEM; -} - -static struct binder_buffer *binder_alloc_buf(struct binder_proc *proc, - size_t data_size, - size_t offsets_size, int is_async) -{ - struct rb_node *n = proc->free_buffers.rb_node; - struct binder_buffer *buffer; - size_t buffer_size; - struct rb_node *best_fit = NULL; - void *has_page_addr; - void *end_page_addr; - size_t size; - - if (proc->vma == NULL) { - pr_err("%d: binder_alloc_buf, no vma\n", - proc->pid); - return NULL; - } - - size = ALIGN(data_size, sizeof(void *)) + - ALIGN(offsets_size, sizeof(void *)); - - if (size < data_size || size < offsets_size) { - binder_user_error("%d: got transaction with invalid size %zd-%zd\n", - proc->pid, data_size, offsets_size); - return NULL; - } - - if (is_async && - proc->free_async_space < size + sizeof(struct binder_buffer)) { - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd failed, no async space left\n", - proc->pid, size); - return NULL; - } - - while (n) { - buffer = rb_entry(n, struct binder_buffer, rb_node); - BUG_ON(!buffer->free); - buffer_size = binder_buffer_size(proc, buffer); - - if (size < buffer_size) { - best_fit = n; - n = n->rb_left; - } else if (size > buffer_size) - n = n->rb_right; - else { - best_fit = n; - break; - } - } - if (best_fit == NULL) { - pr_err("%d: binder_alloc_buf size %zd failed, no address space\n", - proc->pid, size); - return NULL; - } - if (n == NULL) { - buffer = rb_entry(best_fit, struct binder_buffer, rb_node); - buffer_size = binder_buffer_size(proc, buffer); - } - - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got buffer %p size %zd\n", - proc->pid, size, buffer, buffer_size); - - has_page_addr = - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); - if (n == NULL) { - if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) - buffer_size = size; /* no room for other buffers */ - else - buffer_size = size + sizeof(struct binder_buffer); - } - end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); - if (end_page_addr > has_page_addr) - end_page_addr = has_page_addr; - if (binder_update_page_range(proc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL)) - return NULL; - - rb_erase(best_fit, &proc->free_buffers); - buffer->free = 0; - binder_insert_allocated_buffer(proc, buffer); - if (buffer_size != size) { - struct binder_buffer *new_buffer = (void *)buffer->data + size; - - list_add(&new_buffer->entry, &buffer->entry); - new_buffer->free = 1; - binder_insert_free_buffer(proc, new_buffer); - } - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_alloc_buf size %zd got %p\n", - proc->pid, size, buffer); - buffer->data_size = data_size; - buffer->offsets_size = offsets_size; - buffer->async_transaction = is_async; - if (is_async) { - proc->free_async_space -= size + sizeof(struct binder_buffer); - binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, - "%d: binder_alloc_buf size %zd async free %zd\n", - proc->pid, size, proc->free_async_space); - } - - return buffer; -} - -static void *buffer_start_page(struct binder_buffer *buffer) -{ - return (void *)((uintptr_t)buffer & PAGE_MASK); -} - -static void *buffer_end_page(struct binder_buffer *buffer) -{ - return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); -} - -static void binder_delete_free_buffer(struct binder_proc *proc, - struct binder_buffer *buffer) -{ - struct binder_buffer *prev, *next = NULL; - int free_page_end = 1; - int free_page_start = 1; - - BUG_ON(proc->buffers.next == &buffer->entry); - prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); - BUG_ON(!prev->free); - if (buffer_end_page(prev) == buffer_start_page(buffer)) { - free_page_start = 0; - if (buffer_end_page(prev) == buffer_end_page(buffer)) - free_page_end = 0; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p share page with %p\n", - proc->pid, buffer, prev); - } - - if (!list_is_last(&buffer->entry, &proc->buffers)) { - next = list_entry(buffer->entry.next, - struct binder_buffer, entry); - if (buffer_start_page(next) == buffer_end_page(buffer)) { - free_page_end = 0; - if (buffer_start_page(next) == - buffer_start_page(buffer)) - free_page_start = 0; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p share page with %p\n", - proc->pid, buffer, prev); - } - } - list_del(&buffer->entry); - if (free_page_start || free_page_end) { - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %p do not share page%s%s with %p or %p\n", - proc->pid, buffer, free_page_start ? "" : " end", - free_page_end ? "" : " start", prev, next); - binder_update_page_range(proc, 0, free_page_start ? - buffer_start_page(buffer) : buffer_end_page(buffer), - (free_page_end ? buffer_end_page(buffer) : - buffer_start_page(buffer)) + PAGE_SIZE, NULL); - } -} - -static void binder_free_buf(struct binder_proc *proc, - struct binder_buffer *buffer) -{ - size_t size, buffer_size; - - buffer_size = binder_buffer_size(proc, buffer); - - size = ALIGN(buffer->data_size, sizeof(void *)) + - ALIGN(buffer->offsets_size, sizeof(void *)); - - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: binder_free_buf %p size %zd buffer_size %zd\n", - proc->pid, buffer, size, buffer_size); - - BUG_ON(buffer->free); - BUG_ON(size > buffer_size); - BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < proc->buffer); - BUG_ON((void *)buffer > proc->buffer + proc->buffer_size); - - if (buffer->async_transaction) { - proc->free_async_space += size + sizeof(struct binder_buffer); - - binder_debug(BINDER_DEBUG_BUFFER_ALLOC_ASYNC, - "%d: binder_free_buf size %zd async free %zd\n", - proc->pid, size, proc->free_async_space); - } - - binder_update_page_range(proc, 0, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); - rb_erase(&buffer->rb_node, &proc->allocated_buffers); - buffer->free = 1; - if (!list_is_last(&buffer->entry, &proc->buffers)) { - struct binder_buffer *next = list_entry(buffer->entry.next, - struct binder_buffer, entry); - - if (next->free) { - rb_erase(&next->rb_node, &proc->free_buffers); - binder_delete_free_buffer(proc, next); - } - } - if (proc->buffers.next != &buffer->entry) { - struct binder_buffer *prev = list_entry(buffer->entry.prev, - struct binder_buffer, entry); - - if (prev->free) { - binder_delete_free_buffer(proc, buffer); - rb_erase(&prev->rb_node, &proc->free_buffers); - buffer = prev; - } - } - binder_insert_free_buffer(proc, buffer); -} - -static struct binder_node *binder_get_node(struct binder_proc *proc, - binder_uintptr_t ptr) -{ - struct rb_node *n = proc->nodes.rb_node; - struct binder_node *node; - - while (n) { - node = rb_entry(n, struct binder_node, rb_node); - - if (ptr < node->ptr) - n = n->rb_left; - else if (ptr > node->ptr) - n = n->rb_right; - else - return node; - } - return NULL; -} - -static struct binder_node *binder_new_node(struct binder_proc *proc, - binder_uintptr_t ptr, - binder_uintptr_t cookie) -{ - struct rb_node **p = &proc->nodes.rb_node; - struct rb_node *parent = NULL; - struct binder_node *node; - - while (*p) { - parent = *p; - node = rb_entry(parent, struct binder_node, rb_node); - - if (ptr < node->ptr) - p = &(*p)->rb_left; - else if (ptr > node->ptr) - p = &(*p)->rb_right; - else - return NULL; - } - - node = kzalloc(sizeof(*node), GFP_KERNEL); - if (node == NULL) - return NULL; - binder_stats_created(BINDER_STAT_NODE); - rb_link_node(&node->rb_node, parent, p); - rb_insert_color(&node->rb_node, &proc->nodes); - node->debug_id = ++binder_last_id; - node->proc = proc; - node->ptr = ptr; - node->cookie = cookie; - node->work.type = BINDER_WORK_NODE; - INIT_LIST_HEAD(&node->work.entry); - INIT_LIST_HEAD(&node->async_todo); - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx created\n", - proc->pid, current->pid, node->debug_id, - (u64)node->ptr, (u64)node->cookie); - return node; -} - -static int binder_inc_node(struct binder_node *node, int strong, int internal, - struct list_head *target_list) -{ - if (strong) { - if (internal) { - if (target_list == NULL && - node->internal_strong_refs == 0 && - !(node == binder_context_mgr_node && - node->has_strong_ref)) { - pr_err("invalid inc strong node for %d\n", - node->debug_id); - return -EINVAL; - } - node->internal_strong_refs++; - } else - node->local_strong_refs++; - if (!node->has_strong_ref && target_list) { - list_del_init(&node->work.entry); - list_add_tail(&node->work.entry, target_list); - } - } else { - if (!internal) - node->local_weak_refs++; - if (!node->has_weak_ref && list_empty(&node->work.entry)) { - if (target_list == NULL) { - pr_err("invalid inc weak node for %d\n", - node->debug_id); - return -EINVAL; - } - list_add_tail(&node->work.entry, target_list); - } - } - return 0; -} - -static int binder_dec_node(struct binder_node *node, int strong, int internal) -{ - if (strong) { - if (internal) - node->internal_strong_refs--; - else - node->local_strong_refs--; - if (node->local_strong_refs || node->internal_strong_refs) - return 0; - } else { - if (!internal) - node->local_weak_refs--; - if (node->local_weak_refs || !hlist_empty(&node->refs)) - return 0; - } - if (node->proc && (node->has_strong_ref || node->has_weak_ref)) { - if (list_empty(&node->work.entry)) { - list_add_tail(&node->work.entry, &node->proc->todo); - wake_up_interruptible(&node->proc->wait); - } - } else { - if (hlist_empty(&node->refs) && !node->local_strong_refs && - !node->local_weak_refs) { - list_del_init(&node->work.entry); - if (node->proc) { - rb_erase(&node->rb_node, &node->proc->nodes); - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "refless node %d deleted\n", - node->debug_id); - } else { - hlist_del(&node->dead_node); - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "dead node %d deleted\n", - node->debug_id); - } - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - } - } - - return 0; -} - - -static struct binder_ref *binder_get_ref(struct binder_proc *proc, - uint32_t desc) -{ - struct rb_node *n = proc->refs_by_desc.rb_node; - struct binder_ref *ref; - - while (n) { - ref = rb_entry(n, struct binder_ref, rb_node_desc); - - if (desc < ref->desc) - n = n->rb_left; - else if (desc > ref->desc) - n = n->rb_right; - else - return ref; - } - return NULL; -} - -static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc, - struct binder_node *node) -{ - struct rb_node *n; - struct rb_node **p = &proc->refs_by_node.rb_node; - struct rb_node *parent = NULL; - struct binder_ref *ref, *new_ref; - - while (*p) { - parent = *p; - ref = rb_entry(parent, struct binder_ref, rb_node_node); - - if (node < ref->node) - p = &(*p)->rb_left; - else if (node > ref->node) - p = &(*p)->rb_right; - else - return ref; - } - new_ref = kzalloc(sizeof(*ref), GFP_KERNEL); - if (new_ref == NULL) - return NULL; - binder_stats_created(BINDER_STAT_REF); - new_ref->debug_id = ++binder_last_id; - new_ref->proc = proc; - new_ref->node = node; - rb_link_node(&new_ref->rb_node_node, parent, p); - rb_insert_color(&new_ref->rb_node_node, &proc->refs_by_node); - - new_ref->desc = (node == binder_context_mgr_node) ? 0 : 1; - for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { - ref = rb_entry(n, struct binder_ref, rb_node_desc); - if (ref->desc > new_ref->desc) - break; - new_ref->desc = ref->desc + 1; - } - - p = &proc->refs_by_desc.rb_node; - while (*p) { - parent = *p; - ref = rb_entry(parent, struct binder_ref, rb_node_desc); - - if (new_ref->desc < ref->desc) - p = &(*p)->rb_left; - else if (new_ref->desc > ref->desc) - p = &(*p)->rb_right; - else - BUG(); - } - rb_link_node(&new_ref->rb_node_desc, parent, p); - rb_insert_color(&new_ref->rb_node_desc, &proc->refs_by_desc); - if (node) { - hlist_add_head(&new_ref->node_entry, &node->refs); - - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d new ref %d desc %d for node %d\n", - proc->pid, new_ref->debug_id, new_ref->desc, - node->debug_id); - } else { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d new ref %d desc %d for dead node\n", - proc->pid, new_ref->debug_id, new_ref->desc); - } - return new_ref; -} - -static void binder_delete_ref(struct binder_ref *ref) -{ - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d delete ref %d desc %d for node %d\n", - ref->proc->pid, ref->debug_id, ref->desc, - ref->node->debug_id); - - rb_erase(&ref->rb_node_desc, &ref->proc->refs_by_desc); - rb_erase(&ref->rb_node_node, &ref->proc->refs_by_node); - if (ref->strong) - binder_dec_node(ref->node, 1, 1); - hlist_del(&ref->node_entry); - binder_dec_node(ref->node, 0, 1); - if (ref->death) { - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "%d delete ref %d desc %d has death notification\n", - ref->proc->pid, ref->debug_id, ref->desc); - list_del(&ref->death->work.entry); - kfree(ref->death); - binder_stats_deleted(BINDER_STAT_DEATH); - } - kfree(ref); - binder_stats_deleted(BINDER_STAT_REF); -} - -static int binder_inc_ref(struct binder_ref *ref, int strong, - struct list_head *target_list) -{ - int ret; - - if (strong) { - if (ref->strong == 0) { - ret = binder_inc_node(ref->node, 1, 1, target_list); - if (ret) - return ret; - } - ref->strong++; - } else { - if (ref->weak == 0) { - ret = binder_inc_node(ref->node, 0, 1, target_list); - if (ret) - return ret; - } - ref->weak++; - } - return 0; -} - - -static int binder_dec_ref(struct binder_ref *ref, int strong) -{ - if (strong) { - if (ref->strong == 0) { - binder_user_error("%d invalid dec strong, ref %d desc %d s %d w %d\n", - ref->proc->pid, ref->debug_id, - ref->desc, ref->strong, ref->weak); - return -EINVAL; - } - ref->strong--; - if (ref->strong == 0) { - int ret; - - ret = binder_dec_node(ref->node, strong, 1); - if (ret) - return ret; - } - } else { - if (ref->weak == 0) { - binder_user_error("%d invalid dec weak, ref %d desc %d s %d w %d\n", - ref->proc->pid, ref->debug_id, - ref->desc, ref->strong, ref->weak); - return -EINVAL; - } - ref->weak--; - } - if (ref->strong == 0 && ref->weak == 0) - binder_delete_ref(ref); - return 0; -} - -static void binder_pop_transaction(struct binder_thread *target_thread, - struct binder_transaction *t) -{ - if (target_thread) { - BUG_ON(target_thread->transaction_stack != t); - BUG_ON(target_thread->transaction_stack->from != target_thread); - target_thread->transaction_stack = - target_thread->transaction_stack->from_parent; - t->from = NULL; - } - t->need_reply = 0; - if (t->buffer) - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); -} - -static void binder_send_failed_reply(struct binder_transaction *t, - uint32_t error_code) -{ - struct binder_thread *target_thread; - struct binder_transaction *next; - - BUG_ON(t->flags & TF_ONE_WAY); - while (1) { - target_thread = t->from; - if (target_thread) { - if (target_thread->return_error != BR_OK && - target_thread->return_error2 == BR_OK) { - target_thread->return_error2 = - target_thread->return_error; - target_thread->return_error = BR_OK; - } - if (target_thread->return_error == BR_OK) { - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d to %d:%d\n", - t->debug_id, - target_thread->proc->pid, - target_thread->pid); - - binder_pop_transaction(target_thread, t); - target_thread->return_error = error_code; - wake_up_interruptible(&target_thread->wait); - } else { - pr_err("reply failed, target thread, %d:%d, has error code %d already\n", - target_thread->proc->pid, - target_thread->pid, - target_thread->return_error); - } - return; - } - next = t->from_parent; - - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "send failed reply for transaction %d, target dead\n", - t->debug_id); - - binder_pop_transaction(target_thread, t); - if (next == NULL) { - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread at root\n"); - return; - } - t = next; - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "reply failed, no target thread -- retry %d\n", - t->debug_id); - } -} - -static void binder_transaction_buffer_release(struct binder_proc *proc, - struct binder_buffer *buffer, - binder_size_t *failed_at) -{ - binder_size_t *offp, *off_end; - int debug_id = buffer->debug_id; - - binder_debug(BINDER_DEBUG_TRANSACTION, - "%d buffer release %d, size %zd-%zd, failed at %p\n", - proc->pid, buffer->debug_id, - buffer->data_size, buffer->offsets_size, failed_at); - - if (buffer->target_node) - binder_dec_node(buffer->target_node, 1, 0); - - offp = (binder_size_t *)(buffer->data + - ALIGN(buffer->data_size, sizeof(void *))); - if (failed_at) - off_end = failed_at; - else - off_end = (void *)offp + buffer->offsets_size; - for (; offp < off_end; offp++) { - struct flat_binder_object *fp; - - if (*offp > buffer->data_size - sizeof(*fp) || - buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - pr_err("transaction release %d bad offset %lld, size %zd\n", - debug_id, (u64)*offp, buffer->data_size); - continue; - } - fp = (struct flat_binder_object *)(buffer->data + *offp); - switch (fp->type) { - case BINDER_TYPE_BINDER: - case BINDER_TYPE_WEAK_BINDER: { - struct binder_node *node = binder_get_node(proc, fp->binder); - - if (node == NULL) { - pr_err("transaction release %d bad node %016llx\n", - debug_id, (u64)fp->binder); - break; - } - binder_debug(BINDER_DEBUG_TRANSACTION, - " node %d u%016llx\n", - node->debug_id, (u64)node->ptr); - binder_dec_node(node, fp->type == BINDER_TYPE_BINDER, 0); - } break; - case BINDER_TYPE_HANDLE: - case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); - - if (ref == NULL) { - pr_err("transaction release %d bad handle %d\n", - debug_id, fp->handle); - break; - } - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, ref->node->debug_id); - binder_dec_ref(ref, fp->type == BINDER_TYPE_HANDLE); - } break; - - case BINDER_TYPE_FD: - binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d\n", fp->handle); - if (failed_at) - task_close_fd(proc, fp->handle); - break; - - default: - pr_err("transaction release %d bad object type %x\n", - debug_id, fp->type); - break; - } - } -} - -static void binder_transaction(struct binder_proc *proc, - struct binder_thread *thread, - struct binder_transaction_data *tr, int reply) -{ - struct binder_transaction *t; - struct binder_work *tcomplete; - binder_size_t *offp, *off_end; - struct binder_proc *target_proc; - struct binder_thread *target_thread = NULL; - struct binder_node *target_node = NULL; - struct list_head *target_list; - wait_queue_head_t *target_wait; - struct binder_transaction *in_reply_to = NULL; - struct binder_transaction_log_entry *e; - uint32_t return_error; - - e = binder_transaction_log_add(&binder_transaction_log); - e->call_type = reply ? 2 : !!(tr->flags & TF_ONE_WAY); - e->from_proc = proc->pid; - e->from_thread = thread->pid; - e->target_handle = tr->target.handle; - e->data_size = tr->data_size; - e->offsets_size = tr->offsets_size; - - if (reply) { - in_reply_to = thread->transaction_stack; - if (in_reply_to == NULL) { - binder_user_error("%d:%d got reply transaction with no transaction stack\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - goto err_empty_call_stack; - } - binder_set_nice(in_reply_to->saved_priority); - if (in_reply_to->to_thread != thread) { - binder_user_error("%d:%d got reply transaction with bad transaction stack, transaction %d has target %d:%d\n", - proc->pid, thread->pid, in_reply_to->debug_id, - in_reply_to->to_proc ? - in_reply_to->to_proc->pid : 0, - in_reply_to->to_thread ? - in_reply_to->to_thread->pid : 0); - return_error = BR_FAILED_REPLY; - in_reply_to = NULL; - goto err_bad_call_stack; - } - thread->transaction_stack = in_reply_to->to_parent; - target_thread = in_reply_to->from; - if (target_thread == NULL) { - return_error = BR_DEAD_REPLY; - goto err_dead_binder; - } - if (target_thread->transaction_stack != in_reply_to) { - binder_user_error("%d:%d got reply transaction with bad target transaction stack %d, expected %d\n", - proc->pid, thread->pid, - target_thread->transaction_stack ? - target_thread->transaction_stack->debug_id : 0, - in_reply_to->debug_id); - return_error = BR_FAILED_REPLY; - in_reply_to = NULL; - target_thread = NULL; - goto err_dead_binder; - } - target_proc = target_thread->proc; - } else { - if (tr->target.handle) { - struct binder_ref *ref; - - ref = binder_get_ref(proc, tr->target.handle); - if (ref == NULL) { - binder_user_error("%d:%d got transaction to invalid handle\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - goto err_invalid_target_handle; - } - target_node = ref->node; - } else { - target_node = binder_context_mgr_node; - if (target_node == NULL) { - return_error = BR_DEAD_REPLY; - goto err_no_context_mgr_node; - } - } - e->to_node = target_node->debug_id; - target_proc = target_node->proc; - if (target_proc == NULL) { - return_error = BR_DEAD_REPLY; - goto err_dead_binder; - } - if (!(tr->flags & TF_ONE_WAY) && thread->transaction_stack) { - struct binder_transaction *tmp; - - tmp = thread->transaction_stack; - if (tmp->to_thread != thread) { - binder_user_error("%d:%d got new transaction with bad transaction stack, transaction %d has target %d:%d\n", - proc->pid, thread->pid, tmp->debug_id, - tmp->to_proc ? tmp->to_proc->pid : 0, - tmp->to_thread ? - tmp->to_thread->pid : 0); - return_error = BR_FAILED_REPLY; - goto err_bad_call_stack; - } - while (tmp) { - if (tmp->from && tmp->from->proc == target_proc) - target_thread = tmp->from; - tmp = tmp->from_parent; - } - } - } - if (target_thread) { - e->to_thread = target_thread->pid; - target_list = &target_thread->todo; - target_wait = &target_thread->wait; - } else { - target_list = &target_proc->todo; - target_wait = &target_proc->wait; - } - e->to_proc = target_proc->pid; - - /* TODO: reuse incoming transaction for reply */ - t = kzalloc(sizeof(*t), GFP_KERNEL); - if (t == NULL) { - return_error = BR_FAILED_REPLY; - goto err_alloc_t_failed; - } - binder_stats_created(BINDER_STAT_TRANSACTION); - - tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL); - if (tcomplete == NULL) { - return_error = BR_FAILED_REPLY; - goto err_alloc_tcomplete_failed; - } - binder_stats_created(BINDER_STAT_TRANSACTION_COMPLETE); - - t->debug_id = ++binder_last_id; - e->debug_id = t->debug_id; - - if (reply) - binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_REPLY %d -> %d:%d, data %016llx-%016llx size %lld-%lld\n", - proc->pid, thread->pid, t->debug_id, - target_proc->pid, target_thread->pid, - (u64)tr->data.ptr.buffer, - (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); - else - binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d BC_TRANSACTION %d -> %d - node %d, data %016llx-%016llx size %lld-%lld\n", - proc->pid, thread->pid, t->debug_id, - target_proc->pid, target_node->debug_id, - (u64)tr->data.ptr.buffer, - (u64)tr->data.ptr.offsets, - (u64)tr->data_size, (u64)tr->offsets_size); - - if (!reply && !(tr->flags & TF_ONE_WAY)) - t->from = thread; - else - t->from = NULL; - t->sender_euid = task_euid(proc->tsk); - t->to_proc = target_proc; - t->to_thread = target_thread; - t->code = tr->code; - t->flags = tr->flags; - t->priority = task_nice(current); - - trace_binder_transaction(reply, t, target_node); - - t->buffer = binder_alloc_buf(target_proc, tr->data_size, - tr->offsets_size, !reply && (t->flags & TF_ONE_WAY)); - if (t->buffer == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_alloc_buf_failed; - } - t->buffer->allow_user_free = 0; - t->buffer->debug_id = t->debug_id; - t->buffer->transaction = t; - t->buffer->target_node = target_node; - trace_binder_transaction_alloc_buf(t->buffer); - if (target_node) - binder_inc_node(target_node, 1, 0, NULL); - - offp = (binder_size_t *)(t->buffer->data + - ALIGN(tr->data_size, sizeof(void *))); - - if (copy_from_user(t->buffer->data, (const void __user *)(uintptr_t) - tr->data.ptr.buffer, tr->data_size)) { - binder_user_error("%d:%d got transaction with invalid data ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - goto err_copy_data_failed; - } - if (copy_from_user(offp, (const void __user *)(uintptr_t) - tr->data.ptr.offsets, tr->offsets_size)) { - binder_user_error("%d:%d got transaction with invalid offsets ptr\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - goto err_copy_data_failed; - } - if (!IS_ALIGNED(tr->offsets_size, sizeof(binder_size_t))) { - binder_user_error("%d:%d got transaction with invalid offsets size, %lld\n", - proc->pid, thread->pid, (u64)tr->offsets_size); - return_error = BR_FAILED_REPLY; - goto err_bad_offset; - } - off_end = (void *)offp + tr->offsets_size; - for (; offp < off_end; offp++) { - struct flat_binder_object *fp; - - if (*offp > t->buffer->data_size - sizeof(*fp) || - t->buffer->data_size < sizeof(*fp) || - !IS_ALIGNED(*offp, sizeof(u32))) { - binder_user_error("%d:%d got transaction with invalid offset, %lld\n", - proc->pid, thread->pid, (u64)*offp); - return_error = BR_FAILED_REPLY; - goto err_bad_offset; - } - fp = (struct flat_binder_object *)(t->buffer->data + *offp); - switch (fp->type) { - case BINDER_TYPE_BINDER: - case BINDER_TYPE_WEAK_BINDER: { - struct binder_ref *ref; - struct binder_node *node = binder_get_node(proc, fp->binder); - - if (node == NULL) { - node = binder_new_node(proc, fp->binder, fp->cookie); - if (node == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_new_node_failed; - } - node->min_priority = fp->flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->accept_fds = !!(fp->flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); - } - if (fp->cookie != node->cookie) { - binder_user_error("%d:%d sending u%016llx node %d, cookie mismatch %016llx != %016llx\n", - proc->pid, thread->pid, - (u64)fp->binder, node->debug_id, - (u64)fp->cookie, (u64)node->cookie); - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - ref = binder_get_ref_for_node(target_proc, node); - if (ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - if (fp->type == BINDER_TYPE_BINDER) - fp->type = BINDER_TYPE_HANDLE; - else - fp->type = BINDER_TYPE_WEAK_HANDLE; - fp->handle = ref->desc; - binder_inc_ref(ref, fp->type == BINDER_TYPE_HANDLE, - &thread->todo); - - trace_binder_transaction_node_to_ref(t, node, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " node %d u%016llx -> ref %d desc %d\n", - node->debug_id, (u64)node->ptr, - ref->debug_id, ref->desc); - } break; - case BINDER_TYPE_HANDLE: - case BINDER_TYPE_WEAK_HANDLE: { - struct binder_ref *ref = binder_get_ref(proc, fp->handle); - - if (ref == NULL) { - binder_user_error("%d:%d got transaction with invalid handle, %d\n", - proc->pid, - thread->pid, fp->handle); - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_failed; - } - if (ref->node->proc == target_proc) { - if (fp->type == BINDER_TYPE_HANDLE) - fp->type = BINDER_TYPE_BINDER; - else - fp->type = BINDER_TYPE_WEAK_BINDER; - fp->binder = ref->node->ptr; - fp->cookie = ref->node->cookie; - binder_inc_node(ref->node, fp->type == BINDER_TYPE_BINDER, 0, NULL); - trace_binder_transaction_ref_to_node(t, ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> node %d u%016llx\n", - ref->debug_id, ref->desc, ref->node->debug_id, - (u64)ref->node->ptr); - } else { - struct binder_ref *new_ref; - - new_ref = binder_get_ref_for_node(target_proc, ref->node); - if (new_ref == NULL) { - return_error = BR_FAILED_REPLY; - goto err_binder_get_ref_for_node_failed; - } - fp->handle = new_ref->desc; - binder_inc_ref(new_ref, fp->type == BINDER_TYPE_HANDLE, NULL); - trace_binder_transaction_ref_to_ref(t, ref, - new_ref); - binder_debug(BINDER_DEBUG_TRANSACTION, - " ref %d desc %d -> ref %d desc %d (node %d)\n", - ref->debug_id, ref->desc, new_ref->debug_id, - new_ref->desc, ref->node->debug_id); - } - } break; - - case BINDER_TYPE_FD: { - int target_fd; - struct file *file; - - if (reply) { - if (!(in_reply_to->flags & TF_ACCEPT_FDS)) { - binder_user_error("%d:%d got reply with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - } else if (!target_node->accept_fds) { - binder_user_error("%d:%d got transaction with fd, %d, but target does not allow fds\n", - proc->pid, thread->pid, fp->handle); - return_error = BR_FAILED_REPLY; - goto err_fd_not_allowed; - } - - file = fget(fp->handle); - if (file == NULL) { - binder_user_error("%d:%d got transaction with invalid fd, %d\n", - proc->pid, thread->pid, fp->handle); - return_error = BR_FAILED_REPLY; - goto err_fget_failed; - } - target_fd = task_get_unused_fd_flags(target_proc, O_CLOEXEC); - if (target_fd < 0) { - fput(file); - return_error = BR_FAILED_REPLY; - goto err_get_unused_fd_failed; - } - task_fd_install(target_proc, target_fd, file); - trace_binder_transaction_fd(t, fp->handle, target_fd); - binder_debug(BINDER_DEBUG_TRANSACTION, - " fd %d -> %d\n", fp->handle, target_fd); - /* TODO: fput? */ - fp->handle = target_fd; - } break; - - default: - binder_user_error("%d:%d got transaction with invalid object type, %x\n", - proc->pid, thread->pid, fp->type); - return_error = BR_FAILED_REPLY; - goto err_bad_object_type; - } - } - if (reply) { - BUG_ON(t->buffer->async_transaction != 0); - binder_pop_transaction(target_thread, in_reply_to); - } else if (!(t->flags & TF_ONE_WAY)) { - BUG_ON(t->buffer->async_transaction != 0); - t->need_reply = 1; - t->from_parent = thread->transaction_stack; - thread->transaction_stack = t; - } else { - BUG_ON(target_node == NULL); - BUG_ON(t->buffer->async_transaction != 1); - if (target_node->has_async_transaction) { - target_list = &target_node->async_todo; - target_wait = NULL; - } else - target_node->has_async_transaction = 1; - } - t->work.type = BINDER_WORK_TRANSACTION; - list_add_tail(&t->work.entry, target_list); - tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; - list_add_tail(&tcomplete->entry, &thread->todo); - if (target_wait) - wake_up_interruptible(target_wait); - return; - -err_get_unused_fd_failed: -err_fget_failed: -err_fd_not_allowed: -err_binder_get_ref_for_node_failed: -err_binder_get_ref_failed: -err_binder_new_node_failed: -err_bad_object_type: -err_bad_offset: -err_copy_data_failed: - trace_binder_transaction_failed_buffer_release(t->buffer); - binder_transaction_buffer_release(target_proc, t->buffer, offp); - t->buffer->transaction = NULL; - binder_free_buf(target_proc, t->buffer); -err_binder_alloc_buf_failed: - kfree(tcomplete); - binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); -err_alloc_tcomplete_failed: - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); -err_alloc_t_failed: -err_bad_call_stack: -err_empty_call_stack: -err_dead_binder: -err_invalid_target_handle: -err_no_context_mgr_node: - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d transaction failed %d, size %lld-%lld\n", - proc->pid, thread->pid, return_error, - (u64)tr->data_size, (u64)tr->offsets_size); - - { - struct binder_transaction_log_entry *fe; - - fe = binder_transaction_log_add(&binder_transaction_log_failed); - *fe = *e; - } - - BUG_ON(thread->return_error != BR_OK); - if (in_reply_to) { - thread->return_error = BR_TRANSACTION_COMPLETE; - binder_send_failed_reply(in_reply_to, return_error); - } else - thread->return_error = return_error; -} - -static int binder_thread_write(struct binder_proc *proc, - struct binder_thread *thread, - binder_uintptr_t binder_buffer, size_t size, - binder_size_t *consumed) -{ - uint32_t cmd; - void __user *buffer = (void __user *)(uintptr_t)binder_buffer; - void __user *ptr = buffer + *consumed; - void __user *end = buffer + size; - - while (ptr < end && thread->return_error == BR_OK) { - if (get_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - trace_binder_command(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.bc)) { - binder_stats.bc[_IOC_NR(cmd)]++; - proc->stats.bc[_IOC_NR(cmd)]++; - thread->stats.bc[_IOC_NR(cmd)]++; - } - switch (cmd) { - case BC_INCREFS: - case BC_ACQUIRE: - case BC_RELEASE: - case BC_DECREFS: { - uint32_t target; - struct binder_ref *ref; - const char *debug_string; - - if (get_user(target, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (target == 0 && binder_context_mgr_node && - (cmd == BC_INCREFS || cmd == BC_ACQUIRE)) { - ref = binder_get_ref_for_node(proc, - binder_context_mgr_node); - if (ref->desc != target) { - binder_user_error("%d:%d tried to acquire reference to desc 0, got %d instead\n", - proc->pid, thread->pid, - ref->desc); - } - } else - ref = binder_get_ref(proc, target); - if (ref == NULL) { - binder_user_error("%d:%d refcount change on invalid ref %d\n", - proc->pid, thread->pid, target); - break; - } - switch (cmd) { - case BC_INCREFS: - debug_string = "IncRefs"; - binder_inc_ref(ref, 0, NULL); - break; - case BC_ACQUIRE: - debug_string = "Acquire"; - binder_inc_ref(ref, 1, NULL); - break; - case BC_RELEASE: - debug_string = "Release"; - binder_dec_ref(ref, 1); - break; - case BC_DECREFS: - default: - debug_string = "DecRefs"; - binder_dec_ref(ref, 0); - break; - } - binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s ref %d desc %d s %d w %d for node %d\n", - proc->pid, thread->pid, debug_string, ref->debug_id, - ref->desc, ref->strong, ref->weak, ref->node->debug_id); - break; - } - case BC_INCREFS_DONE: - case BC_ACQUIRE_DONE: { - binder_uintptr_t node_ptr; - binder_uintptr_t cookie; - struct binder_node *node; - - if (get_user(node_ptr, (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - if (get_user(cookie, (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - node = binder_get_node(proc, node_ptr); - if (node == NULL) { - binder_user_error("%d:%d %s u%016llx no match\n", - proc->pid, thread->pid, - cmd == BC_INCREFS_DONE ? - "BC_INCREFS_DONE" : - "BC_ACQUIRE_DONE", - (u64)node_ptr); - break; - } - if (cookie != node->cookie) { - binder_user_error("%d:%d %s u%016llx node %d cookie mismatch %016llx != %016llx\n", - proc->pid, thread->pid, - cmd == BC_INCREFS_DONE ? - "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", - (u64)node_ptr, node->debug_id, - (u64)cookie, (u64)node->cookie); - break; - } - if (cmd == BC_ACQUIRE_DONE) { - if (node->pending_strong_ref == 0) { - binder_user_error("%d:%d BC_ACQUIRE_DONE node %d has no pending acquire request\n", - proc->pid, thread->pid, - node->debug_id); - break; - } - node->pending_strong_ref = 0; - } else { - if (node->pending_weak_ref == 0) { - binder_user_error("%d:%d BC_INCREFS_DONE node %d has no pending increfs request\n", - proc->pid, thread->pid, - node->debug_id); - break; - } - node->pending_weak_ref = 0; - } - binder_dec_node(node, cmd == BC_ACQUIRE_DONE, 0); - binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s node %d ls %d lw %d\n", - proc->pid, thread->pid, - cmd == BC_INCREFS_DONE ? "BC_INCREFS_DONE" : "BC_ACQUIRE_DONE", - node->debug_id, node->local_strong_refs, node->local_weak_refs); - break; - } - case BC_ATTEMPT_ACQUIRE: - pr_err("BC_ATTEMPT_ACQUIRE not supported\n"); - return -EINVAL; - case BC_ACQUIRE_RESULT: - pr_err("BC_ACQUIRE_RESULT not supported\n"); - return -EINVAL; - - case BC_FREE_BUFFER: { - binder_uintptr_t data_ptr; - struct binder_buffer *buffer; - - if (get_user(data_ptr, (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - - buffer = binder_buffer_lookup(proc, data_ptr); - if (buffer == NULL) { - binder_user_error("%d:%d BC_FREE_BUFFER u%016llx no match\n", - proc->pid, thread->pid, (u64)data_ptr); - break; - } - if (!buffer->allow_user_free) { - binder_user_error("%d:%d BC_FREE_BUFFER u%016llx matched unreturned buffer\n", - proc->pid, thread->pid, (u64)data_ptr); - break; - } - binder_debug(BINDER_DEBUG_FREE_BUFFER, - "%d:%d BC_FREE_BUFFER u%016llx found buffer %d for %s transaction\n", - proc->pid, thread->pid, (u64)data_ptr, - buffer->debug_id, - buffer->transaction ? "active" : "finished"); - - if (buffer->transaction) { - buffer->transaction->buffer = NULL; - buffer->transaction = NULL; - } - if (buffer->async_transaction && buffer->target_node) { - BUG_ON(!buffer->target_node->has_async_transaction); - if (list_empty(&buffer->target_node->async_todo)) - buffer->target_node->has_async_transaction = 0; - else - list_move_tail(buffer->target_node->async_todo.next, &thread->todo); - } - trace_binder_transaction_buffer_release(buffer); - binder_transaction_buffer_release(proc, buffer, NULL); - binder_free_buf(proc, buffer); - break; - } - - case BC_TRANSACTION: - case BC_REPLY: { - struct binder_transaction_data tr; - - if (copy_from_user(&tr, ptr, sizeof(tr))) - return -EFAULT; - ptr += sizeof(tr); - binder_transaction(proc, thread, &tr, cmd == BC_REPLY); - break; - } - - case BC_REGISTER_LOOPER: - binder_debug(BINDER_DEBUG_THREADS, - "%d:%d BC_REGISTER_LOOPER\n", - proc->pid, thread->pid); - if (thread->looper & BINDER_LOOPER_STATE_ENTERED) { - thread->looper |= BINDER_LOOPER_STATE_INVALID; - binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called after BC_ENTER_LOOPER\n", - proc->pid, thread->pid); - } else if (proc->requested_threads == 0) { - thread->looper |= BINDER_LOOPER_STATE_INVALID; - binder_user_error("%d:%d ERROR: BC_REGISTER_LOOPER called without request\n", - proc->pid, thread->pid); - } else { - proc->requested_threads--; - proc->requested_threads_started++; - } - thread->looper |= BINDER_LOOPER_STATE_REGISTERED; - break; - case BC_ENTER_LOOPER: - binder_debug(BINDER_DEBUG_THREADS, - "%d:%d BC_ENTER_LOOPER\n", - proc->pid, thread->pid); - if (thread->looper & BINDER_LOOPER_STATE_REGISTERED) { - thread->looper |= BINDER_LOOPER_STATE_INVALID; - binder_user_error("%d:%d ERROR: BC_ENTER_LOOPER called after BC_REGISTER_LOOPER\n", - proc->pid, thread->pid); - } - thread->looper |= BINDER_LOOPER_STATE_ENTERED; - break; - case BC_EXIT_LOOPER: - binder_debug(BINDER_DEBUG_THREADS, - "%d:%d BC_EXIT_LOOPER\n", - proc->pid, thread->pid); - thread->looper |= BINDER_LOOPER_STATE_EXITED; - break; - - case BC_REQUEST_DEATH_NOTIFICATION: - case BC_CLEAR_DEATH_NOTIFICATION: { - uint32_t target; - binder_uintptr_t cookie; - struct binder_ref *ref; - struct binder_ref_death *death; - - if (get_user(target, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (get_user(cookie, (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - ref = binder_get_ref(proc, target); - if (ref == NULL) { - binder_user_error("%d:%d %s invalid ref %d\n", - proc->pid, thread->pid, - cmd == BC_REQUEST_DEATH_NOTIFICATION ? - "BC_REQUEST_DEATH_NOTIFICATION" : - "BC_CLEAR_DEATH_NOTIFICATION", - target); - break; - } - - binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, - "%d:%d %s %016llx ref %d desc %d s %d w %d for node %d\n", - proc->pid, thread->pid, - cmd == BC_REQUEST_DEATH_NOTIFICATION ? - "BC_REQUEST_DEATH_NOTIFICATION" : - "BC_CLEAR_DEATH_NOTIFICATION", - (u64)cookie, ref->debug_id, ref->desc, - ref->strong, ref->weak, ref->node->debug_id); - - if (cmd == BC_REQUEST_DEATH_NOTIFICATION) { - if (ref->death) { - binder_user_error("%d:%d BC_REQUEST_DEATH_NOTIFICATION death notification already set\n", - proc->pid, thread->pid); - break; - } - death = kzalloc(sizeof(*death), GFP_KERNEL); - if (death == NULL) { - thread->return_error = BR_ERROR; - binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, - "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", - proc->pid, thread->pid); - break; - } - binder_stats_created(BINDER_STAT_DEATH); - INIT_LIST_HEAD(&death->work.entry); - death->cookie = cookie; - ref->death = death; - if (ref->node->proc == NULL) { - ref->death->work.type = BINDER_WORK_DEAD_BINDER; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&ref->death->work.entry, &thread->todo); - } else { - list_add_tail(&ref->death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); - } - } - } else { - if (ref->death == NULL) { - binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification not active\n", - proc->pid, thread->pid); - break; - } - death = ref->death; - if (death->cookie != cookie) { - binder_user_error("%d:%d BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch %016llx != %016llx\n", - proc->pid, thread->pid, - (u64)death->cookie, - (u64)cookie); - break; - } - ref->death = NULL; - if (list_empty(&death->work.entry)) { - death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&death->work.entry, &thread->todo); - } else { - list_add_tail(&death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); - } - } else { - BUG_ON(death->work.type != BINDER_WORK_DEAD_BINDER); - death->work.type = BINDER_WORK_DEAD_BINDER_AND_CLEAR; - } - } - } break; - case BC_DEAD_BINDER_DONE: { - struct binder_work *w; - binder_uintptr_t cookie; - struct binder_ref_death *death = NULL; - - if (get_user(cookie, (binder_uintptr_t __user *)ptr)) - return -EFAULT; - - ptr += sizeof(void *); - list_for_each_entry(w, &proc->delivered_death, entry) { - struct binder_ref_death *tmp_death = container_of(w, struct binder_ref_death, work); - - if (tmp_death->cookie == cookie) { - death = tmp_death; - break; - } - } - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "%d:%d BC_DEAD_BINDER_DONE %016llx found %p\n", - proc->pid, thread->pid, (u64)cookie, - death); - if (death == NULL) { - binder_user_error("%d:%d BC_DEAD_BINDER_DONE %016llx not found\n", - proc->pid, thread->pid, (u64)cookie); - break; - } - - list_del_init(&death->work.entry); - if (death->work.type == BINDER_WORK_DEAD_BINDER_AND_CLEAR) { - death->work.type = BINDER_WORK_CLEAR_DEATH_NOTIFICATION; - if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) { - list_add_tail(&death->work.entry, &thread->todo); - } else { - list_add_tail(&death->work.entry, &proc->todo); - wake_up_interruptible(&proc->wait); - } - } - } break; - - default: - pr_err("%d:%d unknown command %d\n", - proc->pid, thread->pid, cmd); - return -EINVAL; - } - *consumed = ptr - buffer; - } - return 0; -} - -static void binder_stat_br(struct binder_proc *proc, - struct binder_thread *thread, uint32_t cmd) -{ - trace_binder_return(cmd); - if (_IOC_NR(cmd) < ARRAY_SIZE(binder_stats.br)) { - binder_stats.br[_IOC_NR(cmd)]++; - proc->stats.br[_IOC_NR(cmd)]++; - thread->stats.br[_IOC_NR(cmd)]++; - } -} - -static int binder_has_proc_work(struct binder_proc *proc, - struct binder_thread *thread) -{ - return !list_empty(&proc->todo) || - (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); -} - -static int binder_has_thread_work(struct binder_thread *thread) -{ - return !list_empty(&thread->todo) || thread->return_error != BR_OK || - (thread->looper & BINDER_LOOPER_STATE_NEED_RETURN); -} - -static int binder_thread_read(struct binder_proc *proc, - struct binder_thread *thread, - binder_uintptr_t binder_buffer, size_t size, - binder_size_t *consumed, int non_block) -{ - void __user *buffer = (void __user *)(uintptr_t)binder_buffer; - void __user *ptr = buffer + *consumed; - void __user *end = buffer + size; - - int ret = 0; - int wait_for_proc_work; - - if (*consumed == 0) { - if (put_user(BR_NOOP, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - } - -retry: - wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo); - - if (thread->return_error != BR_OK && ptr < end) { - if (thread->return_error2 != BR_OK) { - if (put_user(thread->return_error2, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, thread->return_error2); - if (ptr == end) - goto done; - thread->return_error2 = BR_OK; - } - if (put_user(thread->return_error, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - binder_stat_br(proc, thread, thread->return_error); - thread->return_error = BR_OK; - goto done; - } - - - thread->looper |= BINDER_LOOPER_STATE_WAITING; - if (wait_for_proc_work) - proc->ready_threads++; - - binder_unlock(__func__); - - trace_binder_wait_for_work(wait_for_proc_work, - !!thread->transaction_stack, - !list_empty(&thread->todo)); - if (wait_for_proc_work) { - if (!(thread->looper & (BINDER_LOOPER_STATE_REGISTERED | - BINDER_LOOPER_STATE_ENTERED))) { - binder_user_error("%d:%d ERROR: Thread waiting for process work before calling BC_REGISTER_LOOPER or BC_ENTER_LOOPER (state %x)\n", - proc->pid, thread->pid, thread->looper); - wait_event_interruptible(binder_user_error_wait, - binder_stop_on_user_error < 2); - } - binder_set_nice(proc->default_priority); - if (non_block) { - if (!binder_has_proc_work(proc, thread)) - ret = -EAGAIN; - } else - ret = wait_event_freezable_exclusive(proc->wait, binder_has_proc_work(proc, thread)); - } else { - if (non_block) { - if (!binder_has_thread_work(thread)) - ret = -EAGAIN; - } else - ret = wait_event_freezable(thread->wait, binder_has_thread_work(thread)); - } - - binder_lock(__func__); - - if (wait_for_proc_work) - proc->ready_threads--; - thread->looper &= ~BINDER_LOOPER_STATE_WAITING; - - if (ret) - return ret; - - while (1) { - uint32_t cmd; - struct binder_transaction_data tr; - struct binder_work *w; - struct binder_transaction *t = NULL; - - if (!list_empty(&thread->todo)) { - w = list_first_entry(&thread->todo, struct binder_work, - entry); - } else if (!list_empty(&proc->todo) && wait_for_proc_work) { - w = list_first_entry(&proc->todo, struct binder_work, - entry); - } else { - /* no data added */ - if (ptr - buffer == 4 && - !(thread->looper & BINDER_LOOPER_STATE_NEED_RETURN)) - goto retry; - break; - } - - if (end - ptr < sizeof(tr) + 4) - break; - - switch (w->type) { - case BINDER_WORK_TRANSACTION: { - t = container_of(w, struct binder_transaction, work); - } break; - case BINDER_WORK_TRANSACTION_COMPLETE: { - cmd = BR_TRANSACTION_COMPLETE; - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - - binder_stat_br(proc, thread, cmd); - binder_debug(BINDER_DEBUG_TRANSACTION_COMPLETE, - "%d:%d BR_TRANSACTION_COMPLETE\n", - proc->pid, thread->pid); - - list_del(&w->entry); - kfree(w); - binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); - } break; - case BINDER_WORK_NODE: { - struct binder_node *node = container_of(w, struct binder_node, work); - uint32_t cmd = BR_NOOP; - const char *cmd_name; - int strong = node->internal_strong_refs || node->local_strong_refs; - int weak = !hlist_empty(&node->refs) || node->local_weak_refs || strong; - - if (weak && !node->has_weak_ref) { - cmd = BR_INCREFS; - cmd_name = "BR_INCREFS"; - node->has_weak_ref = 1; - node->pending_weak_ref = 1; - node->local_weak_refs++; - } else if (strong && !node->has_strong_ref) { - cmd = BR_ACQUIRE; - cmd_name = "BR_ACQUIRE"; - node->has_strong_ref = 1; - node->pending_strong_ref = 1; - node->local_strong_refs++; - } else if (!strong && node->has_strong_ref) { - cmd = BR_RELEASE; - cmd_name = "BR_RELEASE"; - node->has_strong_ref = 0; - } else if (!weak && node->has_weak_ref) { - cmd = BR_DECREFS; - cmd_name = "BR_DECREFS"; - node->has_weak_ref = 0; - } - if (cmd != BR_NOOP) { - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (put_user(node->ptr, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - if (put_user(node->cookie, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - - binder_stat_br(proc, thread, cmd); - binder_debug(BINDER_DEBUG_USER_REFS, - "%d:%d %s %d u%016llx c%016llx\n", - proc->pid, thread->pid, cmd_name, - node->debug_id, - (u64)node->ptr, (u64)node->cookie); - } else { - list_del_init(&w->entry); - if (!weak && !strong) { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx deleted\n", - proc->pid, thread->pid, - node->debug_id, - (u64)node->ptr, - (u64)node->cookie); - rb_erase(&node->rb_node, &proc->nodes); - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - } else { - binder_debug(BINDER_DEBUG_INTERNAL_REFS, - "%d:%d node %d u%016llx c%016llx state unchanged\n", - proc->pid, thread->pid, - node->debug_id, - (u64)node->ptr, - (u64)node->cookie); - } - } - } break; - case BINDER_WORK_DEAD_BINDER: - case BINDER_WORK_DEAD_BINDER_AND_CLEAR: - case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { - struct binder_ref_death *death; - uint32_t cmd; - - death = container_of(w, struct binder_ref_death, work); - if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) - cmd = BR_CLEAR_DEATH_NOTIFICATION_DONE; - else - cmd = BR_DEAD_BINDER; - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (put_user(death->cookie, - (binder_uintptr_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(binder_uintptr_t); - binder_stat_br(proc, thread, cmd); - binder_debug(BINDER_DEBUG_DEATH_NOTIFICATION, - "%d:%d %s %016llx\n", - proc->pid, thread->pid, - cmd == BR_DEAD_BINDER ? - "BR_DEAD_BINDER" : - "BR_CLEAR_DEATH_NOTIFICATION_DONE", - (u64)death->cookie); - - if (w->type == BINDER_WORK_CLEAR_DEATH_NOTIFICATION) { - list_del(&w->entry); - kfree(death); - binder_stats_deleted(BINDER_STAT_DEATH); - } else - list_move(&w->entry, &proc->delivered_death); - if (cmd == BR_DEAD_BINDER) - goto done; /* DEAD_BINDER notifications can cause transactions */ - } break; - } - - if (!t) - continue; - - BUG_ON(t->buffer == NULL); - if (t->buffer->target_node) { - struct binder_node *target_node = t->buffer->target_node; - - tr.target.ptr = target_node->ptr; - tr.cookie = target_node->cookie; - t->saved_priority = task_nice(current); - if (t->priority < target_node->min_priority && - !(t->flags & TF_ONE_WAY)) - binder_set_nice(t->priority); - else if (!(t->flags & TF_ONE_WAY) || - t->saved_priority > target_node->min_priority) - binder_set_nice(target_node->min_priority); - cmd = BR_TRANSACTION; - } else { - tr.target.ptr = 0; - tr.cookie = 0; - cmd = BR_REPLY; - } - tr.code = t->code; - tr.flags = t->flags; - tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid); - - if (t->from) { - struct task_struct *sender = t->from->proc->tsk; - - tr.sender_pid = task_tgid_nr_ns(sender, - task_active_pid_ns(current)); - } else { - tr.sender_pid = 0; - } - - tr.data_size = t->buffer->data_size; - tr.offsets_size = t->buffer->offsets_size; - tr.data.ptr.buffer = (binder_uintptr_t)( - (uintptr_t)t->buffer->data + - proc->user_buffer_offset); - tr.data.ptr.offsets = tr.data.ptr.buffer + - ALIGN(t->buffer->data_size, - sizeof(void *)); - - if (put_user(cmd, (uint32_t __user *)ptr)) - return -EFAULT; - ptr += sizeof(uint32_t); - if (copy_to_user(ptr, &tr, sizeof(tr))) - return -EFAULT; - ptr += sizeof(tr); - - trace_binder_transaction_received(t); - binder_stat_br(proc, thread, cmd); - binder_debug(BINDER_DEBUG_TRANSACTION, - "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %016llx-%016llx\n", - proc->pid, thread->pid, - (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : - "BR_REPLY", - t->debug_id, t->from ? t->from->proc->pid : 0, - t->from ? t->from->pid : 0, cmd, - t->buffer->data_size, t->buffer->offsets_size, - (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets); - - list_del(&t->work.entry); - t->buffer->allow_user_free = 1; - if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { - t->to_parent = thread->transaction_stack; - t->to_thread = thread; - thread->transaction_stack = t; - } else { - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); - } - break; - } - -done: - - *consumed = ptr - buffer; - if (proc->requested_threads + proc->ready_threads == 0 && - proc->requested_threads_started < proc->max_threads && - (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | - BINDER_LOOPER_STATE_ENTERED)) /* the user-space code fails to */ - /*spawn a new thread if we leave this out */) { - proc->requested_threads++; - binder_debug(BINDER_DEBUG_THREADS, - "%d:%d BR_SPAWN_LOOPER\n", - proc->pid, thread->pid); - if (put_user(BR_SPAWN_LOOPER, (uint32_t __user *)buffer)) - return -EFAULT; - binder_stat_br(proc, thread, BR_SPAWN_LOOPER); - } - return 0; -} - -static void binder_release_work(struct list_head *list) -{ - struct binder_work *w; - - while (!list_empty(list)) { - w = list_first_entry(list, struct binder_work, entry); - list_del_init(&w->entry); - switch (w->type) { - case BINDER_WORK_TRANSACTION: { - struct binder_transaction *t; - - t = container_of(w, struct binder_transaction, work); - if (t->buffer->target_node && - !(t->flags & TF_ONE_WAY)) { - binder_send_failed_reply(t, BR_DEAD_REPLY); - } else { - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered transaction %d\n", - t->debug_id); - t->buffer->transaction = NULL; - kfree(t); - binder_stats_deleted(BINDER_STAT_TRANSACTION); - } - } break; - case BINDER_WORK_TRANSACTION_COMPLETE: { - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered TRANSACTION_COMPLETE\n"); - kfree(w); - binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); - } break; - case BINDER_WORK_DEAD_BINDER_AND_CLEAR: - case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: { - struct binder_ref_death *death; - - death = container_of(w, struct binder_ref_death, work); - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered death notification, %016llx\n", - (u64)death->cookie); - kfree(death); - binder_stats_deleted(BINDER_STAT_DEATH); - } break; - default: - pr_err("unexpected work type, %d, not freed\n", - w->type); - break; - } - } - -} - -static struct binder_thread *binder_get_thread(struct binder_proc *proc) -{ - struct binder_thread *thread = NULL; - struct rb_node *parent = NULL; - struct rb_node **p = &proc->threads.rb_node; - - while (*p) { - parent = *p; - thread = rb_entry(parent, struct binder_thread, rb_node); - - if (current->pid < thread->pid) - p = &(*p)->rb_left; - else if (current->pid > thread->pid) - p = &(*p)->rb_right; - else - break; - } - if (*p == NULL) { - thread = kzalloc(sizeof(*thread), GFP_KERNEL); - if (thread == NULL) - return NULL; - binder_stats_created(BINDER_STAT_THREAD); - thread->proc = proc; - thread->pid = current->pid; - init_waitqueue_head(&thread->wait); - INIT_LIST_HEAD(&thread->todo); - rb_link_node(&thread->rb_node, parent, p); - rb_insert_color(&thread->rb_node, &proc->threads); - thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; - thread->return_error = BR_OK; - thread->return_error2 = BR_OK; - } - return thread; -} - -static int binder_free_thread(struct binder_proc *proc, - struct binder_thread *thread) -{ - struct binder_transaction *t; - struct binder_transaction *send_reply = NULL; - int active_transactions = 0; - - rb_erase(&thread->rb_node, &proc->threads); - t = thread->transaction_stack; - if (t && t->to_thread == thread) - send_reply = t; - while (t) { - active_transactions++; - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "release %d:%d transaction %d %s, still active\n", - proc->pid, thread->pid, - t->debug_id, - (t->to_thread == thread) ? "in" : "out"); - - if (t->to_thread == thread) { - t->to_proc = NULL; - t->to_thread = NULL; - if (t->buffer) { - t->buffer->transaction = NULL; - t->buffer = NULL; - } - t = t->to_parent; - } else if (t->from == thread) { - t->from = NULL; - t = t->from_parent; - } else - BUG(); - } - if (send_reply) - binder_send_failed_reply(send_reply, BR_DEAD_REPLY); - binder_release_work(&thread->todo); - kfree(thread); - binder_stats_deleted(BINDER_STAT_THREAD); - return active_transactions; -} - -static unsigned int binder_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct binder_proc *proc = filp->private_data; - struct binder_thread *thread = NULL; - int wait_for_proc_work; - - binder_lock(__func__); - - thread = binder_get_thread(proc); - - wait_for_proc_work = thread->transaction_stack == NULL && - list_empty(&thread->todo) && thread->return_error == BR_OK; - - binder_unlock(__func__); - - if (wait_for_proc_work) { - if (binder_has_proc_work(proc, thread)) - return POLLIN; - poll_wait(filp, &proc->wait, wait); - if (binder_has_proc_work(proc, thread)) - return POLLIN; - } else { - if (binder_has_thread_work(thread)) - return POLLIN; - poll_wait(filp, &thread->wait, wait); - if (binder_has_thread_work(thread)) - return POLLIN; - } - return 0; -} - -static int binder_ioctl_write_read(struct file *filp, - unsigned int cmd, unsigned long arg, - struct binder_thread *thread) -{ - int ret = 0; - struct binder_proc *proc = filp->private_data; - unsigned int size = _IOC_SIZE(cmd); - void __user *ubuf = (void __user *)arg; - struct binder_write_read bwr; - - if (size != sizeof(struct binder_write_read)) { - ret = -EINVAL; - goto out; - } - if (copy_from_user(&bwr, ubuf, sizeof(bwr))) { - ret = -EFAULT; - goto out; - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d write %lld at %016llx, read %lld at %016llx\n", - proc->pid, thread->pid, - (u64)bwr.write_size, (u64)bwr.write_buffer, - (u64)bwr.read_size, (u64)bwr.read_buffer); - - if (bwr.write_size > 0) { - ret = binder_thread_write(proc, thread, - bwr.write_buffer, - bwr.write_size, - &bwr.write_consumed); - trace_binder_write_done(ret); - if (ret < 0) { - bwr.read_consumed = 0; - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto out; - } - } - if (bwr.read_size > 0) { - ret = binder_thread_read(proc, thread, bwr.read_buffer, - bwr.read_size, - &bwr.read_consumed, - filp->f_flags & O_NONBLOCK); - trace_binder_read_done(ret); - if (!list_empty(&proc->todo)) - wake_up_interruptible(&proc->wait); - if (ret < 0) { - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) - ret = -EFAULT; - goto out; - } - } - binder_debug(BINDER_DEBUG_READ_WRITE, - "%d:%d wrote %lld of %lld, read return %lld of %lld\n", - proc->pid, thread->pid, - (u64)bwr.write_consumed, (u64)bwr.write_size, - (u64)bwr.read_consumed, (u64)bwr.read_size); - if (copy_to_user(ubuf, &bwr, sizeof(bwr))) { - ret = -EFAULT; - goto out; - } -out: - return ret; -} - -static int binder_ioctl_set_ctx_mgr(struct file *filp) -{ - int ret = 0; - struct binder_proc *proc = filp->private_data; - kuid_t curr_euid = current_euid(); - - if (binder_context_mgr_node != NULL) { - pr_err("BINDER_SET_CONTEXT_MGR already set\n"); - ret = -EBUSY; - goto out; - } - if (uid_valid(binder_context_mgr_uid)) { - if (!uid_eq(binder_context_mgr_uid, curr_euid)) { - pr_err("BINDER_SET_CONTEXT_MGR bad uid %d != %d\n", - from_kuid(&init_user_ns, curr_euid), - from_kuid(&init_user_ns, - binder_context_mgr_uid)); - ret = -EPERM; - goto out; - } - } else { - binder_context_mgr_uid = curr_euid; - } - binder_context_mgr_node = binder_new_node(proc, 0, 0); - if (binder_context_mgr_node == NULL) { - ret = -ENOMEM; - goto out; - } - binder_context_mgr_node->local_weak_refs++; - binder_context_mgr_node->local_strong_refs++; - binder_context_mgr_node->has_strong_ref = 1; - binder_context_mgr_node->has_weak_ref = 1; -out: - return ret; -} - -static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - int ret; - struct binder_proc *proc = filp->private_data; - struct binder_thread *thread; - unsigned int size = _IOC_SIZE(cmd); - void __user *ubuf = (void __user *)arg; - - /*pr_info("binder_ioctl: %d:%d %x %lx\n", - proc->pid, current->pid, cmd, arg);*/ - - trace_binder_ioctl(cmd, arg); - - ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); - if (ret) - goto err_unlocked; - - binder_lock(__func__); - thread = binder_get_thread(proc); - if (thread == NULL) { - ret = -ENOMEM; - goto err; - } - - switch (cmd) { - case BINDER_WRITE_READ: - ret = binder_ioctl_write_read(filp, cmd, arg, thread); - if (ret) - goto err; - break; - case BINDER_SET_MAX_THREADS: - if (copy_from_user(&proc->max_threads, ubuf, sizeof(proc->max_threads))) { - ret = -EINVAL; - goto err; - } - break; - case BINDER_SET_CONTEXT_MGR: - ret = binder_ioctl_set_ctx_mgr(filp); - if (ret) - goto err; - break; - case BINDER_THREAD_EXIT: - binder_debug(BINDER_DEBUG_THREADS, "%d:%d exit\n", - proc->pid, thread->pid); - binder_free_thread(proc, thread); - thread = NULL; - break; - case BINDER_VERSION: { - struct binder_version __user *ver = ubuf; - - if (size != sizeof(struct binder_version)) { - ret = -EINVAL; - goto err; - } - if (put_user(BINDER_CURRENT_PROTOCOL_VERSION, - &ver->protocol_version)) { - ret = -EINVAL; - goto err; - } - break; - } - default: - ret = -EINVAL; - goto err; - } - ret = 0; -err: - if (thread) - thread->looper &= ~BINDER_LOOPER_STATE_NEED_RETURN; - binder_unlock(__func__); - wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); - if (ret && ret != -ERESTARTSYS) - pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); -err_unlocked: - trace_binder_ioctl_done(ret); - return ret; -} - -static void binder_vma_open(struct vm_area_struct *vma) -{ - struct binder_proc *proc = vma->vm_private_data; - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "%d open vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", - proc->pid, vma->vm_start, vma->vm_end, - (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, - (unsigned long)pgprot_val(vma->vm_page_prot)); -} - -static void binder_vma_close(struct vm_area_struct *vma) -{ - struct binder_proc *proc = vma->vm_private_data; - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "%d close vm area %lx-%lx (%ld K) vma %lx pagep %lx\n", - proc->pid, vma->vm_start, vma->vm_end, - (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, - (unsigned long)pgprot_val(vma->vm_page_prot)); - proc->vma = NULL; - proc->vma_vm_mm = NULL; - binder_defer_work(proc, BINDER_DEFERRED_PUT_FILES); -} - -static int binder_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - return VM_FAULT_SIGBUS; -} - -static struct vm_operations_struct binder_vm_ops = { - .open = binder_vma_open, - .close = binder_vma_close, - .fault = binder_vm_fault, -}; - -static int binder_mmap(struct file *filp, struct vm_area_struct *vma) -{ - int ret; - struct vm_struct *area; - struct binder_proc *proc = filp->private_data; - const char *failure_string; - struct binder_buffer *buffer; - - if (proc->tsk != current) - return -EINVAL; - - if ((vma->vm_end - vma->vm_start) > SZ_4M) - vma->vm_end = vma->vm_start + SZ_4M; - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "binder_mmap: %d %lx-%lx (%ld K) vma %lx pagep %lx\n", - proc->pid, vma->vm_start, vma->vm_end, - (vma->vm_end - vma->vm_start) / SZ_1K, vma->vm_flags, - (unsigned long)pgprot_val(vma->vm_page_prot)); - - if (vma->vm_flags & FORBIDDEN_MMAP_FLAGS) { - ret = -EPERM; - failure_string = "bad vm_flags"; - goto err_bad_arg; - } - vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE; - - mutex_lock(&binder_mmap_lock); - if (proc->buffer) { - ret = -EBUSY; - failure_string = "already mapped"; - goto err_already_mapped; - } - - area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP); - if (area == NULL) { - ret = -ENOMEM; - failure_string = "get_vm_area"; - goto err_get_vm_area_failed; - } - proc->buffer = area->addr; - proc->user_buffer_offset = vma->vm_start - (uintptr_t)proc->buffer; - mutex_unlock(&binder_mmap_lock); - -#ifdef CONFIG_CPU_CACHE_VIPT - if (cache_is_vipt_aliasing()) { - while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) { - pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n", proc->pid, vma->vm_start, vma->vm_end, proc->buffer); - vma->vm_start += PAGE_SIZE; - } - } -#endif - proc->pages = kzalloc(sizeof(proc->pages[0]) * ((vma->vm_end - vma->vm_start) / PAGE_SIZE), GFP_KERNEL); - if (proc->pages == NULL) { - ret = -ENOMEM; - failure_string = "alloc page array"; - goto err_alloc_pages_failed; - } - proc->buffer_size = vma->vm_end - vma->vm_start; - - vma->vm_ops = &binder_vm_ops; - vma->vm_private_data = proc; - - if (binder_update_page_range(proc, 1, proc->buffer, proc->buffer + PAGE_SIZE, vma)) { - ret = -ENOMEM; - failure_string = "alloc small buf"; - goto err_alloc_small_buf_failed; - } - buffer = proc->buffer; - INIT_LIST_HEAD(&proc->buffers); - list_add(&buffer->entry, &proc->buffers); - buffer->free = 1; - binder_insert_free_buffer(proc, buffer); - proc->free_async_space = proc->buffer_size / 2; - barrier(); - proc->files = get_files_struct(current); - proc->vma = vma; - proc->vma_vm_mm = vma->vm_mm; - - /*pr_info("binder_mmap: %d %lx-%lx maps %p\n", - proc->pid, vma->vm_start, vma->vm_end, proc->buffer);*/ - return 0; - -err_alloc_small_buf_failed: - kfree(proc->pages); - proc->pages = NULL; -err_alloc_pages_failed: - mutex_lock(&binder_mmap_lock); - vfree(proc->buffer); - proc->buffer = NULL; -err_get_vm_area_failed: -err_already_mapped: - mutex_unlock(&binder_mmap_lock); -err_bad_arg: - pr_err("binder_mmap: %d %lx-%lx %s failed %d\n", - proc->pid, vma->vm_start, vma->vm_end, failure_string, ret); - return ret; -} - -static int binder_open(struct inode *nodp, struct file *filp) -{ - struct binder_proc *proc; - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, "binder_open: %d:%d\n", - current->group_leader->pid, current->pid); - - proc = kzalloc(sizeof(*proc), GFP_KERNEL); - if (proc == NULL) - return -ENOMEM; - get_task_struct(current); - proc->tsk = current; - INIT_LIST_HEAD(&proc->todo); - init_waitqueue_head(&proc->wait); - proc->default_priority = task_nice(current); - - binder_lock(__func__); - - binder_stats_created(BINDER_STAT_PROC); - hlist_add_head(&proc->proc_node, &binder_procs); - proc->pid = current->group_leader->pid; - INIT_LIST_HEAD(&proc->delivered_death); - filp->private_data = proc; - - binder_unlock(__func__); - - if (binder_debugfs_dir_entry_proc) { - char strbuf[11]; - - snprintf(strbuf, sizeof(strbuf), "%u", proc->pid); - proc->debugfs_entry = debugfs_create_file(strbuf, S_IRUGO, - binder_debugfs_dir_entry_proc, proc, &binder_proc_fops); - } - - return 0; -} - -static int binder_flush(struct file *filp, fl_owner_t id) -{ - struct binder_proc *proc = filp->private_data; - - binder_defer_work(proc, BINDER_DEFERRED_FLUSH); - - return 0; -} - -static void binder_deferred_flush(struct binder_proc *proc) -{ - struct rb_node *n; - int wake_count = 0; - - for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) { - struct binder_thread *thread = rb_entry(n, struct binder_thread, rb_node); - - thread->looper |= BINDER_LOOPER_STATE_NEED_RETURN; - if (thread->looper & BINDER_LOOPER_STATE_WAITING) { - wake_up_interruptible(&thread->wait); - wake_count++; - } - } - wake_up_interruptible_all(&proc->wait); - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "binder_flush: %d woke %d threads\n", proc->pid, - wake_count); -} - -static int binder_release(struct inode *nodp, struct file *filp) -{ - struct binder_proc *proc = filp->private_data; - - debugfs_remove(proc->debugfs_entry); - binder_defer_work(proc, BINDER_DEFERRED_RELEASE); - - return 0; -} - -static int binder_node_release(struct binder_node *node, int refs) -{ - struct binder_ref *ref; - int death = 0; - - list_del_init(&node->work.entry); - binder_release_work(&node->async_todo); - - if (hlist_empty(&node->refs)) { - kfree(node); - binder_stats_deleted(BINDER_STAT_NODE); - - return refs; - } - - node->proc = NULL; - node->local_strong_refs = 0; - node->local_weak_refs = 0; - hlist_add_head(&node->dead_node, &binder_dead_nodes); - - hlist_for_each_entry(ref, &node->refs, node_entry) { - refs++; - - if (!ref->death) - continue; - - death++; - - if (list_empty(&ref->death->work.entry)) { - ref->death->work.type = BINDER_WORK_DEAD_BINDER; - list_add_tail(&ref->death->work.entry, - &ref->proc->todo); - wake_up_interruptible(&ref->proc->wait); - } else - BUG(); - } - - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "node %d now dead, refs %d, death %d\n", - node->debug_id, refs, death); - - return refs; -} - -static void binder_deferred_release(struct binder_proc *proc) -{ - struct binder_transaction *t; - struct rb_node *n; - int threads, nodes, incoming_refs, outgoing_refs, buffers, - active_transactions, page_count; - - BUG_ON(proc->vma); - BUG_ON(proc->files); - - hlist_del(&proc->proc_node); - - if (binder_context_mgr_node && binder_context_mgr_node->proc == proc) { - binder_debug(BINDER_DEBUG_DEAD_BINDER, - "%s: %d context_mgr_node gone\n", - __func__, proc->pid); - binder_context_mgr_node = NULL; - } - - threads = 0; - active_transactions = 0; - while ((n = rb_first(&proc->threads))) { - struct binder_thread *thread; - - thread = rb_entry(n, struct binder_thread, rb_node); - threads++; - active_transactions += binder_free_thread(proc, thread); - } - - nodes = 0; - incoming_refs = 0; - while ((n = rb_first(&proc->nodes))) { - struct binder_node *node; - - node = rb_entry(n, struct binder_node, rb_node); - nodes++; - rb_erase(&node->rb_node, &proc->nodes); - incoming_refs = binder_node_release(node, incoming_refs); - } - - outgoing_refs = 0; - while ((n = rb_first(&proc->refs_by_desc))) { - struct binder_ref *ref; - - ref = rb_entry(n, struct binder_ref, rb_node_desc); - outgoing_refs++; - binder_delete_ref(ref); - } - - binder_release_work(&proc->todo); - binder_release_work(&proc->delivered_death); - - buffers = 0; - while ((n = rb_first(&proc->allocated_buffers))) { - struct binder_buffer *buffer; - - buffer = rb_entry(n, struct binder_buffer, rb_node); - - t = buffer->transaction; - if (t) { - t->buffer = NULL; - buffer->transaction = NULL; - pr_err("release proc %d, transaction %d, not freed\n", - proc->pid, t->debug_id); - /*BUG();*/ - } - - binder_free_buf(proc, buffer); - buffers++; - } - - binder_stats_deleted(BINDER_STAT_PROC); - - page_count = 0; - if (proc->pages) { - int i; - - for (i = 0; i < proc->buffer_size / PAGE_SIZE; i++) { - void *page_addr; - - if (!proc->pages[i]) - continue; - - page_addr = proc->buffer + i * PAGE_SIZE; - binder_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %p not freed\n", - __func__, proc->pid, i, page_addr); - unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(proc->pages[i]); - page_count++; - } - kfree(proc->pages); - vfree(proc->buffer); - } - - put_task_struct(proc->tsk); - - binder_debug(BINDER_DEBUG_OPEN_CLOSE, - "%s: %d threads %d, nodes %d (ref %d), refs %d, active transactions %d, buffers %d, pages %d\n", - __func__, proc->pid, threads, nodes, incoming_refs, - outgoing_refs, active_transactions, buffers, page_count); - - kfree(proc); -} - -static void binder_deferred_func(struct work_struct *work) -{ - struct binder_proc *proc; - struct files_struct *files; - - int defer; - - do { - binder_lock(__func__); - mutex_lock(&binder_deferred_lock); - if (!hlist_empty(&binder_deferred_list)) { - proc = hlist_entry(binder_deferred_list.first, - struct binder_proc, deferred_work_node); - hlist_del_init(&proc->deferred_work_node); - defer = proc->deferred_work; - proc->deferred_work = 0; - } else { - proc = NULL; - defer = 0; - } - mutex_unlock(&binder_deferred_lock); - - files = NULL; - if (defer & BINDER_DEFERRED_PUT_FILES) { - files = proc->files; - if (files) - proc->files = NULL; - } - - if (defer & BINDER_DEFERRED_FLUSH) - binder_deferred_flush(proc); - - if (defer & BINDER_DEFERRED_RELEASE) - binder_deferred_release(proc); /* frees proc */ - - binder_unlock(__func__); - if (files) - put_files_struct(files); - } while (proc); -} -static DECLARE_WORK(binder_deferred_work, binder_deferred_func); - -static void -binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer) -{ - mutex_lock(&binder_deferred_lock); - proc->deferred_work |= defer; - if (hlist_unhashed(&proc->deferred_work_node)) { - hlist_add_head(&proc->deferred_work_node, - &binder_deferred_list); - queue_work(binder_deferred_workqueue, &binder_deferred_work); - } - mutex_unlock(&binder_deferred_lock); -} - -static void print_binder_transaction(struct seq_file *m, const char *prefix, - struct binder_transaction *t) -{ - seq_printf(m, - "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d", - prefix, t->debug_id, t, - t->from ? t->from->proc->pid : 0, - t->from ? t->from->pid : 0, - t->to_proc ? t->to_proc->pid : 0, - t->to_thread ? t->to_thread->pid : 0, - t->code, t->flags, t->priority, t->need_reply); - if (t->buffer == NULL) { - seq_puts(m, " buffer free\n"); - return; - } - if (t->buffer->target_node) - seq_printf(m, " node %d", - t->buffer->target_node->debug_id); - seq_printf(m, " size %zd:%zd data %p\n", - t->buffer->data_size, t->buffer->offsets_size, - t->buffer->data); -} - -static void print_binder_buffer(struct seq_file *m, const char *prefix, - struct binder_buffer *buffer) -{ - seq_printf(m, "%s %d: %p size %zd:%zd %s\n", - prefix, buffer->debug_id, buffer->data, - buffer->data_size, buffer->offsets_size, - buffer->transaction ? "active" : "delivered"); -} - -static void print_binder_work(struct seq_file *m, const char *prefix, - const char *transaction_prefix, - struct binder_work *w) -{ - struct binder_node *node; - struct binder_transaction *t; - - switch (w->type) { - case BINDER_WORK_TRANSACTION: - t = container_of(w, struct binder_transaction, work); - print_binder_transaction(m, transaction_prefix, t); - break; - case BINDER_WORK_TRANSACTION_COMPLETE: - seq_printf(m, "%stransaction complete\n", prefix); - break; - case BINDER_WORK_NODE: - node = container_of(w, struct binder_node, work); - seq_printf(m, "%snode work %d: u%016llx c%016llx\n", - prefix, node->debug_id, - (u64)node->ptr, (u64)node->cookie); - break; - case BINDER_WORK_DEAD_BINDER: - seq_printf(m, "%shas dead binder\n", prefix); - break; - case BINDER_WORK_DEAD_BINDER_AND_CLEAR: - seq_printf(m, "%shas cleared dead binder\n", prefix); - break; - case BINDER_WORK_CLEAR_DEATH_NOTIFICATION: - seq_printf(m, "%shas cleared death notification\n", prefix); - break; - default: - seq_printf(m, "%sunknown work: type %d\n", prefix, w->type); - break; - } -} - -static void print_binder_thread(struct seq_file *m, - struct binder_thread *thread, - int print_always) -{ - struct binder_transaction *t; - struct binder_work *w; - size_t start_pos = m->count; - size_t header_pos; - - seq_printf(m, " thread %d: l %02x\n", thread->pid, thread->looper); - header_pos = m->count; - t = thread->transaction_stack; - while (t) { - if (t->from == thread) { - print_binder_transaction(m, - " outgoing transaction", t); - t = t->from_parent; - } else if (t->to_thread == thread) { - print_binder_transaction(m, - " incoming transaction", t); - t = t->to_parent; - } else { - print_binder_transaction(m, " bad transaction", t); - t = NULL; - } - } - list_for_each_entry(w, &thread->todo, entry) { - print_binder_work(m, " ", " pending transaction", w); - } - if (!print_always && m->count == header_pos) - m->count = start_pos; -} - -static void print_binder_node(struct seq_file *m, struct binder_node *node) -{ - struct binder_ref *ref; - struct binder_work *w; - int count; - - count = 0; - hlist_for_each_entry(ref, &node->refs, node_entry) - count++; - - seq_printf(m, " node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d", - node->debug_id, (u64)node->ptr, (u64)node->cookie, - node->has_strong_ref, node->has_weak_ref, - node->local_strong_refs, node->local_weak_refs, - node->internal_strong_refs, count); - if (count) { - seq_puts(m, " proc"); - hlist_for_each_entry(ref, &node->refs, node_entry) - seq_printf(m, " %d", ref->proc->pid); - } - seq_puts(m, "\n"); - list_for_each_entry(w, &node->async_todo, entry) - print_binder_work(m, " ", - " pending async transaction", w); -} - -static void print_binder_ref(struct seq_file *m, struct binder_ref *ref) -{ - seq_printf(m, " ref %d: desc %d %snode %d s %d w %d d %p\n", - ref->debug_id, ref->desc, ref->node->proc ? "" : "dead ", - ref->node->debug_id, ref->strong, ref->weak, ref->death); -} - -static void print_binder_proc(struct seq_file *m, - struct binder_proc *proc, int print_all) -{ - struct binder_work *w; - struct rb_node *n; - size_t start_pos = m->count; - size_t header_pos; - - seq_printf(m, "proc %d\n", proc->pid); - header_pos = m->count; - - for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) - print_binder_thread(m, rb_entry(n, struct binder_thread, - rb_node), print_all); - for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) { - struct binder_node *node = rb_entry(n, struct binder_node, - rb_node); - if (print_all || node->has_async_transaction) - print_binder_node(m, node); - } - if (print_all) { - for (n = rb_first(&proc->refs_by_desc); - n != NULL; - n = rb_next(n)) - print_binder_ref(m, rb_entry(n, struct binder_ref, - rb_node_desc)); - } - for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) - print_binder_buffer(m, " buffer", - rb_entry(n, struct binder_buffer, rb_node)); - list_for_each_entry(w, &proc->todo, entry) - print_binder_work(m, " ", " pending transaction", w); - list_for_each_entry(w, &proc->delivered_death, entry) { - seq_puts(m, " has delivered dead binder\n"); - break; - } - if (!print_all && m->count == header_pos) - m->count = start_pos; -} - -static const char * const binder_return_strings[] = { - "BR_ERROR", - "BR_OK", - "BR_TRANSACTION", - "BR_REPLY", - "BR_ACQUIRE_RESULT", - "BR_DEAD_REPLY", - "BR_TRANSACTION_COMPLETE", - "BR_INCREFS", - "BR_ACQUIRE", - "BR_RELEASE", - "BR_DECREFS", - "BR_ATTEMPT_ACQUIRE", - "BR_NOOP", - "BR_SPAWN_LOOPER", - "BR_FINISHED", - "BR_DEAD_BINDER", - "BR_CLEAR_DEATH_NOTIFICATION_DONE", - "BR_FAILED_REPLY" -}; - -static const char * const binder_command_strings[] = { - "BC_TRANSACTION", - "BC_REPLY", - "BC_ACQUIRE_RESULT", - "BC_FREE_BUFFER", - "BC_INCREFS", - "BC_ACQUIRE", - "BC_RELEASE", - "BC_DECREFS", - "BC_INCREFS_DONE", - "BC_ACQUIRE_DONE", - "BC_ATTEMPT_ACQUIRE", - "BC_REGISTER_LOOPER", - "BC_ENTER_LOOPER", - "BC_EXIT_LOOPER", - "BC_REQUEST_DEATH_NOTIFICATION", - "BC_CLEAR_DEATH_NOTIFICATION", - "BC_DEAD_BINDER_DONE" -}; - -static const char * const binder_objstat_strings[] = { - "proc", - "thread", - "node", - "ref", - "death", - "transaction", - "transaction_complete" -}; - -static void print_binder_stats(struct seq_file *m, const char *prefix, - struct binder_stats *stats) -{ - int i; - - BUILD_BUG_ON(ARRAY_SIZE(stats->bc) != - ARRAY_SIZE(binder_command_strings)); - for (i = 0; i < ARRAY_SIZE(stats->bc); i++) { - if (stats->bc[i]) - seq_printf(m, "%s%s: %d\n", prefix, - binder_command_strings[i], stats->bc[i]); - } - - BUILD_BUG_ON(ARRAY_SIZE(stats->br) != - ARRAY_SIZE(binder_return_strings)); - for (i = 0; i < ARRAY_SIZE(stats->br); i++) { - if (stats->br[i]) - seq_printf(m, "%s%s: %d\n", prefix, - binder_return_strings[i], stats->br[i]); - } - - BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != - ARRAY_SIZE(binder_objstat_strings)); - BUILD_BUG_ON(ARRAY_SIZE(stats->obj_created) != - ARRAY_SIZE(stats->obj_deleted)); - for (i = 0; i < ARRAY_SIZE(stats->obj_created); i++) { - if (stats->obj_created[i] || stats->obj_deleted[i]) - seq_printf(m, "%s%s: active %d total %d\n", prefix, - binder_objstat_strings[i], - stats->obj_created[i] - stats->obj_deleted[i], - stats->obj_created[i]); - } -} - -static void print_binder_proc_stats(struct seq_file *m, - struct binder_proc *proc) -{ - struct binder_work *w; - struct rb_node *n; - int count, strong, weak; - - seq_printf(m, "proc %d\n", proc->pid); - count = 0; - for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) - count++; - seq_printf(m, " threads: %d\n", count); - seq_printf(m, " requested threads: %d+%d/%d\n" - " ready threads %d\n" - " free async space %zd\n", proc->requested_threads, - proc->requested_threads_started, proc->max_threads, - proc->ready_threads, proc->free_async_space); - count = 0; - for (n = rb_first(&proc->nodes); n != NULL; n = rb_next(n)) - count++; - seq_printf(m, " nodes: %d\n", count); - count = 0; - strong = 0; - weak = 0; - for (n = rb_first(&proc->refs_by_desc); n != NULL; n = rb_next(n)) { - struct binder_ref *ref = rb_entry(n, struct binder_ref, - rb_node_desc); - count++; - strong += ref->strong; - weak += ref->weak; - } - seq_printf(m, " refs: %d s %d w %d\n", count, strong, weak); - - count = 0; - for (n = rb_first(&proc->allocated_buffers); n != NULL; n = rb_next(n)) - count++; - seq_printf(m, " buffers: %d\n", count); - - count = 0; - list_for_each_entry(w, &proc->todo, entry) { - switch (w->type) { - case BINDER_WORK_TRANSACTION: - count++; - break; - default: - break; - } - } - seq_printf(m, " pending transactions: %d\n", count); - - print_binder_stats(m, " ", &proc->stats); -} - - -static int binder_state_show(struct seq_file *m, void *unused) -{ - struct binder_proc *proc; - struct binder_node *node; - int do_lock = !binder_debug_no_lock; - - if (do_lock) - binder_lock(__func__); - - seq_puts(m, "binder state:\n"); - - if (!hlist_empty(&binder_dead_nodes)) - seq_puts(m, "dead nodes:\n"); - hlist_for_each_entry(node, &binder_dead_nodes, dead_node) - print_binder_node(m, node); - - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(__func__); - return 0; -} - -static int binder_stats_show(struct seq_file *m, void *unused) -{ - struct binder_proc *proc; - int do_lock = !binder_debug_no_lock; - - if (do_lock) - binder_lock(__func__); - - seq_puts(m, "binder stats:\n"); - - print_binder_stats(m, "", &binder_stats); - - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc_stats(m, proc); - if (do_lock) - binder_unlock(__func__); - return 0; -} - -static int binder_transactions_show(struct seq_file *m, void *unused) -{ - struct binder_proc *proc; - int do_lock = !binder_debug_no_lock; - - if (do_lock) - binder_lock(__func__); - - seq_puts(m, "binder transactions:\n"); - hlist_for_each_entry(proc, &binder_procs, proc_node) - print_binder_proc(m, proc, 0); - if (do_lock) - binder_unlock(__func__); - return 0; -} - -static int binder_proc_show(struct seq_file *m, void *unused) -{ - struct binder_proc *proc = m->private; - int do_lock = !binder_debug_no_lock; - - if (do_lock) - binder_lock(__func__); - seq_puts(m, "binder proc state:\n"); - print_binder_proc(m, proc, 1); - if (do_lock) - binder_unlock(__func__); - return 0; -} - -static void print_binder_transaction_log_entry(struct seq_file *m, - struct binder_transaction_log_entry *e) -{ - seq_printf(m, - "%d: %s from %d:%d to %d:%d node %d handle %d size %d:%d\n", - e->debug_id, (e->call_type == 2) ? "reply" : - ((e->call_type == 1) ? "async" : "call "), e->from_proc, - e->from_thread, e->to_proc, e->to_thread, e->to_node, - e->target_handle, e->data_size, e->offsets_size); -} - -static int binder_transaction_log_show(struct seq_file *m, void *unused) -{ - struct binder_transaction_log *log = m->private; - int i; - - if (log->full) { - for (i = log->next; i < ARRAY_SIZE(log->entry); i++) - print_binder_transaction_log_entry(m, &log->entry[i]); - } - for (i = 0; i < log->next; i++) - print_binder_transaction_log_entry(m, &log->entry[i]); - return 0; -} - -static const struct file_operations binder_fops = { - .owner = THIS_MODULE, - .poll = binder_poll, - .unlocked_ioctl = binder_ioctl, - .compat_ioctl = binder_ioctl, - .mmap = binder_mmap, - .open = binder_open, - .flush = binder_flush, - .release = binder_release, -}; - -static struct miscdevice binder_miscdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "binder", - .fops = &binder_fops -}; - -BINDER_DEBUG_ENTRY(state); -BINDER_DEBUG_ENTRY(stats); -BINDER_DEBUG_ENTRY(transactions); -BINDER_DEBUG_ENTRY(transaction_log); - -static int __init binder_init(void) -{ - int ret; - - binder_deferred_workqueue = create_singlethread_workqueue("binder"); - if (!binder_deferred_workqueue) - return -ENOMEM; - - binder_debugfs_dir_entry_root = debugfs_create_dir("binder", NULL); - if (binder_debugfs_dir_entry_root) - binder_debugfs_dir_entry_proc = debugfs_create_dir("proc", - binder_debugfs_dir_entry_root); - ret = misc_register(&binder_miscdev); - if (binder_debugfs_dir_entry_root) { - debugfs_create_file("state", - S_IRUGO, - binder_debugfs_dir_entry_root, - NULL, - &binder_state_fops); - debugfs_create_file("stats", - S_IRUGO, - binder_debugfs_dir_entry_root, - NULL, - &binder_stats_fops); - debugfs_create_file("transactions", - S_IRUGO, - binder_debugfs_dir_entry_root, - NULL, - &binder_transactions_fops); - debugfs_create_file("transaction_log", - S_IRUGO, - binder_debugfs_dir_entry_root, - &binder_transaction_log, - &binder_transaction_log_fops); - debugfs_create_file("failed_transaction_log", - S_IRUGO, - binder_debugfs_dir_entry_root, - &binder_transaction_log_failed, - &binder_transaction_log_fops); - } - return ret; -} - -device_initcall(binder_init); - -#define CREATE_TRACE_POINTS -#include "binder_trace.h" - -MODULE_LICENSE("GPL v2"); diff --git a/drivers/staging/android/binder.h b/drivers/staging/android/binder.h deleted file mode 100644 index eb0834656dfe..000000000000 --- a/drivers/staging/android/binder.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2008 Google, Inc. - * - * Based on, but no longer compatible with, the original - * OpenBinder.org binder driver interface, which is: - * - * Copyright (c) 2005 Palmsource, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _LINUX_BINDER_H -#define _LINUX_BINDER_H - -#ifdef CONFIG_ANDROID_BINDER_IPC_32BIT -#define BINDER_IPC_32BIT 1 -#endif - -#include "uapi/binder.h" - -#endif /* _LINUX_BINDER_H */ - diff --git a/drivers/staging/android/binder_trace.h b/drivers/staging/android/binder_trace.h deleted file mode 100644 index 7f20f3dc8369..000000000000 --- a/drivers/staging/android/binder_trace.h +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Copyright (C) 2012 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM binder - -#if !defined(_BINDER_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _BINDER_TRACE_H - -#include - -struct binder_buffer; -struct binder_node; -struct binder_proc; -struct binder_ref; -struct binder_thread; -struct binder_transaction; - -TRACE_EVENT(binder_ioctl, - TP_PROTO(unsigned int cmd, unsigned long arg), - TP_ARGS(cmd, arg), - - TP_STRUCT__entry( - __field(unsigned int, cmd) - __field(unsigned long, arg) - ), - TP_fast_assign( - __entry->cmd = cmd; - __entry->arg = arg; - ), - TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg) -); - -DECLARE_EVENT_CLASS(binder_lock_class, - TP_PROTO(const char *tag), - TP_ARGS(tag), - TP_STRUCT__entry( - __field(const char *, tag) - ), - TP_fast_assign( - __entry->tag = tag; - ), - TP_printk("tag=%s", __entry->tag) -); - -#define DEFINE_BINDER_LOCK_EVENT(name) \ -DEFINE_EVENT(binder_lock_class, name, \ - TP_PROTO(const char *func), \ - TP_ARGS(func)) - -DEFINE_BINDER_LOCK_EVENT(binder_lock); -DEFINE_BINDER_LOCK_EVENT(binder_locked); -DEFINE_BINDER_LOCK_EVENT(binder_unlock); - -DECLARE_EVENT_CLASS(binder_function_return_class, - TP_PROTO(int ret), - TP_ARGS(ret), - TP_STRUCT__entry( - __field(int, ret) - ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret=%d", __entry->ret) -); - -#define DEFINE_BINDER_FUNCTION_RETURN_EVENT(name) \ -DEFINE_EVENT(binder_function_return_class, name, \ - TP_PROTO(int ret), \ - TP_ARGS(ret)) - -DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); -DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); -DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); - -TRACE_EVENT(binder_wait_for_work, - TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), - TP_ARGS(proc_work, transaction_stack, thread_todo), - - TP_STRUCT__entry( - __field(bool, proc_work) - __field(bool, transaction_stack) - __field(bool, thread_todo) - ), - TP_fast_assign( - __entry->proc_work = proc_work; - __entry->transaction_stack = transaction_stack; - __entry->thread_todo = thread_todo; - ), - TP_printk("proc_work=%d transaction_stack=%d thread_todo=%d", - __entry->proc_work, __entry->transaction_stack, - __entry->thread_todo) -); - -TRACE_EVENT(binder_transaction, - TP_PROTO(bool reply, struct binder_transaction *t, - struct binder_node *target_node), - TP_ARGS(reply, t, target_node), - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, target_node) - __field(int, to_proc) - __field(int, to_thread) - __field(int, reply) - __field(unsigned int, code) - __field(unsigned int, flags) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->target_node = target_node ? target_node->debug_id : 0; - __entry->to_proc = t->to_proc->pid; - __entry->to_thread = t->to_thread ? t->to_thread->pid : 0; - __entry->reply = reply; - __entry->code = t->code; - __entry->flags = t->flags; - ), - TP_printk("transaction=%d dest_node=%d dest_proc=%d dest_thread=%d reply=%d flags=0x%x code=0x%x", - __entry->debug_id, __entry->target_node, - __entry->to_proc, __entry->to_thread, - __entry->reply, __entry->flags, __entry->code) -); - -TRACE_EVENT(binder_transaction_received, - TP_PROTO(struct binder_transaction *t), - TP_ARGS(t), - - TP_STRUCT__entry( - __field(int, debug_id) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - ), - TP_printk("transaction=%d", __entry->debug_id) -); - -TRACE_EVENT(binder_transaction_node_to_ref, - TP_PROTO(struct binder_transaction *t, struct binder_node *node, - struct binder_ref *ref), - TP_ARGS(t, node, ref), - - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, node_debug_id) - __field(binder_uintptr_t, node_ptr) - __field(int, ref_debug_id) - __field(uint32_t, ref_desc) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->node_debug_id = node->debug_id; - __entry->node_ptr = node->ptr; - __entry->ref_debug_id = ref->debug_id; - __entry->ref_desc = ref->desc; - ), - TP_printk("transaction=%d node=%d src_ptr=0x%016llx ==> dest_ref=%d dest_desc=%d", - __entry->debug_id, __entry->node_debug_id, - (u64)__entry->node_ptr, - __entry->ref_debug_id, __entry->ref_desc) -); - -TRACE_EVENT(binder_transaction_ref_to_node, - TP_PROTO(struct binder_transaction *t, struct binder_ref *ref), - TP_ARGS(t, ref), - - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, ref_debug_id) - __field(uint32_t, ref_desc) - __field(int, node_debug_id) - __field(binder_uintptr_t, node_ptr) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->ref_debug_id = ref->debug_id; - __entry->ref_desc = ref->desc; - __entry->node_debug_id = ref->node->debug_id; - __entry->node_ptr = ref->node->ptr; - ), - TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ptr=0x%016llx", - __entry->debug_id, __entry->node_debug_id, - __entry->ref_debug_id, __entry->ref_desc, - (u64)__entry->node_ptr) -); - -TRACE_EVENT(binder_transaction_ref_to_ref, - TP_PROTO(struct binder_transaction *t, struct binder_ref *src_ref, - struct binder_ref *dest_ref), - TP_ARGS(t, src_ref, dest_ref), - - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, node_debug_id) - __field(int, src_ref_debug_id) - __field(uint32_t, src_ref_desc) - __field(int, dest_ref_debug_id) - __field(uint32_t, dest_ref_desc) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->node_debug_id = src_ref->node->debug_id; - __entry->src_ref_debug_id = src_ref->debug_id; - __entry->src_ref_desc = src_ref->desc; - __entry->dest_ref_debug_id = dest_ref->debug_id; - __entry->dest_ref_desc = dest_ref->desc; - ), - TP_printk("transaction=%d node=%d src_ref=%d src_desc=%d ==> dest_ref=%d dest_desc=%d", - __entry->debug_id, __entry->node_debug_id, - __entry->src_ref_debug_id, __entry->src_ref_desc, - __entry->dest_ref_debug_id, __entry->dest_ref_desc) -); - -TRACE_EVENT(binder_transaction_fd, - TP_PROTO(struct binder_transaction *t, int src_fd, int dest_fd), - TP_ARGS(t, src_fd, dest_fd), - - TP_STRUCT__entry( - __field(int, debug_id) - __field(int, src_fd) - __field(int, dest_fd) - ), - TP_fast_assign( - __entry->debug_id = t->debug_id; - __entry->src_fd = src_fd; - __entry->dest_fd = dest_fd; - ), - TP_printk("transaction=%d src_fd=%d ==> dest_fd=%d", - __entry->debug_id, __entry->src_fd, __entry->dest_fd) -); - -DECLARE_EVENT_CLASS(binder_buffer_class, - TP_PROTO(struct binder_buffer *buf), - TP_ARGS(buf), - TP_STRUCT__entry( - __field(int, debug_id) - __field(size_t, data_size) - __field(size_t, offsets_size) - ), - TP_fast_assign( - __entry->debug_id = buf->debug_id; - __entry->data_size = buf->data_size; - __entry->offsets_size = buf->offsets_size; - ), - TP_printk("transaction=%d data_size=%zd offsets_size=%zd", - __entry->debug_id, __entry->data_size, __entry->offsets_size) -); - -DEFINE_EVENT(binder_buffer_class, binder_transaction_alloc_buf, - TP_PROTO(struct binder_buffer *buffer), - TP_ARGS(buffer)); - -DEFINE_EVENT(binder_buffer_class, binder_transaction_buffer_release, - TP_PROTO(struct binder_buffer *buffer), - TP_ARGS(buffer)); - -DEFINE_EVENT(binder_buffer_class, binder_transaction_failed_buffer_release, - TP_PROTO(struct binder_buffer *buffer), - TP_ARGS(buffer)); - -TRACE_EVENT(binder_update_page_range, - TP_PROTO(struct binder_proc *proc, bool allocate, - void *start, void *end), - TP_ARGS(proc, allocate, start, end), - TP_STRUCT__entry( - __field(int, proc) - __field(bool, allocate) - __field(size_t, offset) - __field(size_t, size) - ), - TP_fast_assign( - __entry->proc = proc->pid; - __entry->allocate = allocate; - __entry->offset = start - proc->buffer; - __entry->size = end - start; - ), - TP_printk("proc=%d allocate=%d offset=%zu size=%zu", - __entry->proc, __entry->allocate, - __entry->offset, __entry->size) -); - -TRACE_EVENT(binder_command, - TP_PROTO(uint32_t cmd), - TP_ARGS(cmd), - TP_STRUCT__entry( - __field(uint32_t, cmd) - ), - TP_fast_assign( - __entry->cmd = cmd; - ), - TP_printk("cmd=0x%x %s", - __entry->cmd, - _IOC_NR(__entry->cmd) < ARRAY_SIZE(binder_command_strings) ? - binder_command_strings[_IOC_NR(__entry->cmd)] : - "unknown") -); - -TRACE_EVENT(binder_return, - TP_PROTO(uint32_t cmd), - TP_ARGS(cmd), - TP_STRUCT__entry( - __field(uint32_t, cmd) - ), - TP_fast_assign( - __entry->cmd = cmd; - ), - TP_printk("cmd=0x%x %s", - __entry->cmd, - _IOC_NR(__entry->cmd) < ARRAY_SIZE(binder_return_strings) ? - binder_return_strings[_IOC_NR(__entry->cmd)] : - "unknown") -); - -#endif /* _BINDER_TRACE_H */ - -#undef TRACE_INCLUDE_PATH -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE binder_trace -#include diff --git a/drivers/staging/android/uapi/binder.h b/drivers/staging/android/uapi/binder.h deleted file mode 100644 index dba4cef3a8d3..000000000000 --- a/drivers/staging/android/uapi/binder.h +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Copyright (C) 2008 Google, Inc. - * - * Based on, but no longer compatible with, the original - * OpenBinder.org binder driver interface, which is: - * - * Copyright (c) 2005 Palmsource, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _UAPI_LINUX_BINDER_H -#define _UAPI_LINUX_BINDER_H - -#include - -#define B_PACK_CHARS(c1, c2, c3, c4) \ - ((((c1)<<24)) | (((c2)<<16)) | (((c3)<<8)) | (c4)) -#define B_TYPE_LARGE 0x85 - -enum { - BINDER_TYPE_BINDER = B_PACK_CHARS('s', 'b', '*', B_TYPE_LARGE), - BINDER_TYPE_WEAK_BINDER = B_PACK_CHARS('w', 'b', '*', B_TYPE_LARGE), - BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), - BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), - BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), -}; - -enum { - FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, - FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, -}; - -#ifdef BINDER_IPC_32BIT -typedef __u32 binder_size_t; -typedef __u32 binder_uintptr_t; -#else -typedef __u64 binder_size_t; -typedef __u64 binder_uintptr_t; -#endif - -/* - * This is the flattened representation of a Binder object for transfer - * between processes. The 'offsets' supplied as part of a binder transaction - * contains offsets into the data where these structures occur. The Binder - * driver takes care of re-writing the structure type and data as it moves - * between processes. - */ -struct flat_binder_object { - /* 8 bytes for large_flat_header. */ - __u32 type; - __u32 flags; - - /* 8 bytes of data. */ - union { - binder_uintptr_t binder; /* local object */ - __u32 handle; /* remote object */ - }; - - /* extra data associated with local object */ - binder_uintptr_t cookie; -}; - -/* - * On 64-bit platforms where user code may run in 32-bits the driver must - * translate the buffer (and local binder) addresses appropriately. - */ - -struct binder_write_read { - binder_size_t write_size; /* bytes to write */ - binder_size_t write_consumed; /* bytes consumed by driver */ - binder_uintptr_t write_buffer; - binder_size_t read_size; /* bytes to read */ - binder_size_t read_consumed; /* bytes consumed by driver */ - binder_uintptr_t read_buffer; -}; - -/* Use with BINDER_VERSION, driver fills in fields. */ -struct binder_version { - /* driver protocol version -- increment with incompatible change */ - __s32 protocol_version; -}; - -/* This is the current protocol version. */ -#ifdef BINDER_IPC_32BIT -#define BINDER_CURRENT_PROTOCOL_VERSION 7 -#else -#define BINDER_CURRENT_PROTOCOL_VERSION 8 -#endif - -#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) -#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) -#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) -#define BINDER_SET_IDLE_PRIORITY _IOW('b', 6, __s32) -#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) -#define BINDER_THREAD_EXIT _IOW('b', 8, __s32) -#define BINDER_VERSION _IOWR('b', 9, struct binder_version) - -/* - * NOTE: Two special error codes you should check for when calling - * in to the driver are: - * - * EINTR -- The operation has been interupted. This should be - * handled by retrying the ioctl() until a different error code - * is returned. - * - * ECONNREFUSED -- The driver is no longer accepting operations - * from your process. That is, the process is being destroyed. - * You should handle this by exiting from your process. Note - * that once this error code is returned, all further calls to - * the driver from any thread will return this same code. - */ - -enum transaction_flags { - TF_ONE_WAY = 0x01, /* this is a one-way call: async, no return */ - TF_ROOT_OBJECT = 0x04, /* contents are the component's root object */ - TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */ - TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */ -}; - -struct binder_transaction_data { - /* The first two are only used for bcTRANSACTION and brTRANSACTION, - * identifying the target and contents of the transaction. - */ - union { - /* target descriptor of command transaction */ - __u32 handle; - /* target descriptor of return transaction */ - binder_uintptr_t ptr; - } target; - binder_uintptr_t cookie; /* target object cookie */ - __u32 code; /* transaction command */ - - /* General information about the transaction. */ - __u32 flags; - pid_t sender_pid; - uid_t sender_euid; - binder_size_t data_size; /* number of bytes of data */ - binder_size_t offsets_size; /* number of bytes of offsets */ - - /* If this transaction is inline, the data immediately - * follows here; otherwise, it ends with a pointer to - * the data buffer. - */ - union { - struct { - /* transaction data */ - binder_uintptr_t buffer; - /* offsets from buffer to flat_binder_object structs */ - binder_uintptr_t offsets; - } ptr; - __u8 buf[8]; - } data; -}; - -struct binder_ptr_cookie { - binder_uintptr_t ptr; - binder_uintptr_t cookie; -}; - -struct binder_handle_cookie { - __u32 handle; - binder_uintptr_t cookie; -} __packed; - -struct binder_pri_desc { - __s32 priority; - __u32 desc; -}; - -struct binder_pri_ptr_cookie { - __s32 priority; - binder_uintptr_t ptr; - binder_uintptr_t cookie; -}; - -enum binder_driver_return_protocol { - BR_ERROR = _IOR('r', 0, __s32), - /* - * int: error code - */ - - BR_OK = _IO('r', 1), - /* No parameters! */ - - BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data), - BR_REPLY = _IOR('r', 3, struct binder_transaction_data), - /* - * binder_transaction_data: the received command. - */ - - BR_ACQUIRE_RESULT = _IOR('r', 4, __s32), - /* - * not currently supported - * int: 0 if the last bcATTEMPT_ACQUIRE was not successful. - * Else the remote object has acquired a primary reference. - */ - - BR_DEAD_REPLY = _IO('r', 5), - /* - * The target of the last transaction (either a bcTRANSACTION or - * a bcATTEMPT_ACQUIRE) is no longer with us. No parameters. - */ - - BR_TRANSACTION_COMPLETE = _IO('r', 6), - /* - * No parameters... always refers to the last transaction requested - * (including replies). Note that this will be sent even for - * asynchronous transactions. - */ - - BR_INCREFS = _IOR('r', 7, struct binder_ptr_cookie), - BR_ACQUIRE = _IOR('r', 8, struct binder_ptr_cookie), - BR_RELEASE = _IOR('r', 9, struct binder_ptr_cookie), - BR_DECREFS = _IOR('r', 10, struct binder_ptr_cookie), - /* - * void *: ptr to binder - * void *: cookie for binder - */ - - BR_ATTEMPT_ACQUIRE = _IOR('r', 11, struct binder_pri_ptr_cookie), - /* - * not currently supported - * int: priority - * void *: ptr to binder - * void *: cookie for binder - */ - - BR_NOOP = _IO('r', 12), - /* - * No parameters. Do nothing and examine the next command. It exists - * primarily so that we can replace it with a BR_SPAWN_LOOPER command. - */ - - BR_SPAWN_LOOPER = _IO('r', 13), - /* - * No parameters. The driver has determined that a process has no - * threads waiting to service incoming transactions. When a process - * receives this command, it must spawn a new service thread and - * register it via bcENTER_LOOPER. - */ - - BR_FINISHED = _IO('r', 14), - /* - * not currently supported - * stop threadpool thread - */ - - BR_DEAD_BINDER = _IOR('r', 15, binder_uintptr_t), - /* - * void *: cookie - */ - BR_CLEAR_DEATH_NOTIFICATION_DONE = _IOR('r', 16, binder_uintptr_t), - /* - * void *: cookie - */ - - BR_FAILED_REPLY = _IO('r', 17), - /* - * The the last transaction (either a bcTRANSACTION or - * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters. - */ -}; - -enum binder_driver_command_protocol { - BC_TRANSACTION = _IOW('c', 0, struct binder_transaction_data), - BC_REPLY = _IOW('c', 1, struct binder_transaction_data), - /* - * binder_transaction_data: the sent command. - */ - - BC_ACQUIRE_RESULT = _IOW('c', 2, __s32), - /* - * not currently supported - * int: 0 if the last BR_ATTEMPT_ACQUIRE was not successful. - * Else you have acquired a primary reference on the object. - */ - - BC_FREE_BUFFER = _IOW('c', 3, binder_uintptr_t), - /* - * void *: ptr to transaction data received on a read - */ - - BC_INCREFS = _IOW('c', 4, __u32), - BC_ACQUIRE = _IOW('c', 5, __u32), - BC_RELEASE = _IOW('c', 6, __u32), - BC_DECREFS = _IOW('c', 7, __u32), - /* - * int: descriptor - */ - - BC_INCREFS_DONE = _IOW('c', 8, struct binder_ptr_cookie), - BC_ACQUIRE_DONE = _IOW('c', 9, struct binder_ptr_cookie), - /* - * void *: ptr to binder - * void *: cookie for binder - */ - - BC_ATTEMPT_ACQUIRE = _IOW('c', 10, struct binder_pri_desc), - /* - * not currently supported - * int: priority - * int: descriptor - */ - - BC_REGISTER_LOOPER = _IO('c', 11), - /* - * No parameters. - * Register a spawned looper thread with the device. - */ - - BC_ENTER_LOOPER = _IO('c', 12), - BC_EXIT_LOOPER = _IO('c', 13), - /* - * No parameters. - * These two commands are sent as an application-level thread - * enters and exits the binder loop, respectively. They are - * used so the binder can have an accurate count of the number - * of looping threads it has available. - */ - - BC_REQUEST_DEATH_NOTIFICATION = _IOW('c', 14, - struct binder_handle_cookie), - /* - * int: handle - * void *: cookie - */ - - BC_CLEAR_DEATH_NOTIFICATION = _IOW('c', 15, - struct binder_handle_cookie), - /* - * int: handle - * void *: cookie - */ - - BC_DEAD_BINDER_DONE = _IOW('c', 16, binder_uintptr_t), - /* - * void *: cookie - */ -}; - -#endif /* _UAPI_LINUX_BINDER_H */ - diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6cad97485bad..5f17fa7bc153 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -1,4 +1,5 @@ # UAPI Header export list +header-y += android/ header-y += byteorder/ header-y += can/ header-y += caif/ diff --git a/include/uapi/linux/android/Kbuild b/include/uapi/linux/android/Kbuild new file mode 100644 index 000000000000..ca011eec252a --- /dev/null +++ b/include/uapi/linux/android/Kbuild @@ -0,0 +1,2 @@ +# UAPI Header export list +header-y += binder.h diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h new file mode 100644 index 000000000000..dba4cef3a8d3 --- /dev/null +++ b/include/uapi/linux/android/binder.h @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2008 Google, Inc. + * + * Based on, but no longer compatible with, the original + * OpenBinder.org binder driver interface, which is: + * + * Copyright (c) 2005 Palmsource, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _UAPI_LINUX_BINDER_H +#define _UAPI_LINUX_BINDER_H + +#include + +#define B_PACK_CHARS(c1, c2, c3, c4) \ + ((((c1)<<24)) | (((c2)<<16)) | (((c3)<<8)) | (c4)) +#define B_TYPE_LARGE 0x85 + +enum { + BINDER_TYPE_BINDER = B_PACK_CHARS('s', 'b', '*', B_TYPE_LARGE), + BINDER_TYPE_WEAK_BINDER = B_PACK_CHARS('w', 'b', '*', B_TYPE_LARGE), + BINDER_TYPE_HANDLE = B_PACK_CHARS('s', 'h', '*', B_TYPE_LARGE), + BINDER_TYPE_WEAK_HANDLE = B_PACK_CHARS('w', 'h', '*', B_TYPE_LARGE), + BINDER_TYPE_FD = B_PACK_CHARS('f', 'd', '*', B_TYPE_LARGE), +}; + +enum { + FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, + FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, +}; + +#ifdef BINDER_IPC_32BIT +typedef __u32 binder_size_t; +typedef __u32 binder_uintptr_t; +#else +typedef __u64 binder_size_t; +typedef __u64 binder_uintptr_t; +#endif + +/* + * This is the flattened representation of a Binder object for transfer + * between processes. The 'offsets' supplied as part of a binder transaction + * contains offsets into the data where these structures occur. The Binder + * driver takes care of re-writing the structure type and data as it moves + * between processes. + */ +struct flat_binder_object { + /* 8 bytes for large_flat_header. */ + __u32 type; + __u32 flags; + + /* 8 bytes of data. */ + union { + binder_uintptr_t binder; /* local object */ + __u32 handle; /* remote object */ + }; + + /* extra data associated with local object */ + binder_uintptr_t cookie; +}; + +/* + * On 64-bit platforms where user code may run in 32-bits the driver must + * translate the buffer (and local binder) addresses appropriately. + */ + +struct binder_write_read { + binder_size_t write_size; /* bytes to write */ + binder_size_t write_consumed; /* bytes consumed by driver */ + binder_uintptr_t write_buffer; + binder_size_t read_size; /* bytes to read */ + binder_size_t read_consumed; /* bytes consumed by driver */ + binder_uintptr_t read_buffer; +}; + +/* Use with BINDER_VERSION, driver fills in fields. */ +struct binder_version { + /* driver protocol version -- increment with incompatible change */ + __s32 protocol_version; +}; + +/* This is the current protocol version. */ +#ifdef BINDER_IPC_32BIT +#define BINDER_CURRENT_PROTOCOL_VERSION 7 +#else +#define BINDER_CURRENT_PROTOCOL_VERSION 8 +#endif + +#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) +#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) +#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) +#define BINDER_SET_IDLE_PRIORITY _IOW('b', 6, __s32) +#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) +#define BINDER_THREAD_EXIT _IOW('b', 8, __s32) +#define BINDER_VERSION _IOWR('b', 9, struct binder_version) + +/* + * NOTE: Two special error codes you should check for when calling + * in to the driver are: + * + * EINTR -- The operation has been interupted. This should be + * handled by retrying the ioctl() until a different error code + * is returned. + * + * ECONNREFUSED -- The driver is no longer accepting operations + * from your process. That is, the process is being destroyed. + * You should handle this by exiting from your process. Note + * that once this error code is returned, all further calls to + * the driver from any thread will return this same code. + */ + +enum transaction_flags { + TF_ONE_WAY = 0x01, /* this is a one-way call: async, no return */ + TF_ROOT_OBJECT = 0x04, /* contents are the component's root object */ + TF_STATUS_CODE = 0x08, /* contents are a 32-bit status code */ + TF_ACCEPT_FDS = 0x10, /* allow replies with file descriptors */ +}; + +struct binder_transaction_data { + /* The first two are only used for bcTRANSACTION and brTRANSACTION, + * identifying the target and contents of the transaction. + */ + union { + /* target descriptor of command transaction */ + __u32 handle; + /* target descriptor of return transaction */ + binder_uintptr_t ptr; + } target; + binder_uintptr_t cookie; /* target object cookie */ + __u32 code; /* transaction command */ + + /* General information about the transaction. */ + __u32 flags; + pid_t sender_pid; + uid_t sender_euid; + binder_size_t data_size; /* number of bytes of data */ + binder_size_t offsets_size; /* number of bytes of offsets */ + + /* If this transaction is inline, the data immediately + * follows here; otherwise, it ends with a pointer to + * the data buffer. + */ + union { + struct { + /* transaction data */ + binder_uintptr_t buffer; + /* offsets from buffer to flat_binder_object structs */ + binder_uintptr_t offsets; + } ptr; + __u8 buf[8]; + } data; +}; + +struct binder_ptr_cookie { + binder_uintptr_t ptr; + binder_uintptr_t cookie; +}; + +struct binder_handle_cookie { + __u32 handle; + binder_uintptr_t cookie; +} __packed; + +struct binder_pri_desc { + __s32 priority; + __u32 desc; +}; + +struct binder_pri_ptr_cookie { + __s32 priority; + binder_uintptr_t ptr; + binder_uintptr_t cookie; +}; + +enum binder_driver_return_protocol { + BR_ERROR = _IOR('r', 0, __s32), + /* + * int: error code + */ + + BR_OK = _IO('r', 1), + /* No parameters! */ + + BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data), + BR_REPLY = _IOR('r', 3, struct binder_transaction_data), + /* + * binder_transaction_data: the received command. + */ + + BR_ACQUIRE_RESULT = _IOR('r', 4, __s32), + /* + * not currently supported + * int: 0 if the last bcATTEMPT_ACQUIRE was not successful. + * Else the remote object has acquired a primary reference. + */ + + BR_DEAD_REPLY = _IO('r', 5), + /* + * The target of the last transaction (either a bcTRANSACTION or + * a bcATTEMPT_ACQUIRE) is no longer with us. No parameters. + */ + + BR_TRANSACTION_COMPLETE = _IO('r', 6), + /* + * No parameters... always refers to the last transaction requested + * (including replies). Note that this will be sent even for + * asynchronous transactions. + */ + + BR_INCREFS = _IOR('r', 7, struct binder_ptr_cookie), + BR_ACQUIRE = _IOR('r', 8, struct binder_ptr_cookie), + BR_RELEASE = _IOR('r', 9, struct binder_ptr_cookie), + BR_DECREFS = _IOR('r', 10, struct binder_ptr_cookie), + /* + * void *: ptr to binder + * void *: cookie for binder + */ + + BR_ATTEMPT_ACQUIRE = _IOR('r', 11, struct binder_pri_ptr_cookie), + /* + * not currently supported + * int: priority + * void *: ptr to binder + * void *: cookie for binder + */ + + BR_NOOP = _IO('r', 12), + /* + * No parameters. Do nothing and examine the next command. It exists + * primarily so that we can replace it with a BR_SPAWN_LOOPER command. + */ + + BR_SPAWN_LOOPER = _IO('r', 13), + /* + * No parameters. The driver has determined that a process has no + * threads waiting to service incoming transactions. When a process + * receives this command, it must spawn a new service thread and + * register it via bcENTER_LOOPER. + */ + + BR_FINISHED = _IO('r', 14), + /* + * not currently supported + * stop threadpool thread + */ + + BR_DEAD_BINDER = _IOR('r', 15, binder_uintptr_t), + /* + * void *: cookie + */ + BR_CLEAR_DEATH_NOTIFICATION_DONE = _IOR('r', 16, binder_uintptr_t), + /* + * void *: cookie + */ + + BR_FAILED_REPLY = _IO('r', 17), + /* + * The the last transaction (either a bcTRANSACTION or + * a bcATTEMPT_ACQUIRE) failed (e.g. out of memory). No parameters. + */ +}; + +enum binder_driver_command_protocol { + BC_TRANSACTION = _IOW('c', 0, struct binder_transaction_data), + BC_REPLY = _IOW('c', 1, struct binder_transaction_data), + /* + * binder_transaction_data: the sent command. + */ + + BC_ACQUIRE_RESULT = _IOW('c', 2, __s32), + /* + * not currently supported + * int: 0 if the last BR_ATTEMPT_ACQUIRE was not successful. + * Else you have acquired a primary reference on the object. + */ + + BC_FREE_BUFFER = _IOW('c', 3, binder_uintptr_t), + /* + * void *: ptr to transaction data received on a read + */ + + BC_INCREFS = _IOW('c', 4, __u32), + BC_ACQUIRE = _IOW('c', 5, __u32), + BC_RELEASE = _IOW('c', 6, __u32), + BC_DECREFS = _IOW('c', 7, __u32), + /* + * int: descriptor + */ + + BC_INCREFS_DONE = _IOW('c', 8, struct binder_ptr_cookie), + BC_ACQUIRE_DONE = _IOW('c', 9, struct binder_ptr_cookie), + /* + * void *: ptr to binder + * void *: cookie for binder + */ + + BC_ATTEMPT_ACQUIRE = _IOW('c', 10, struct binder_pri_desc), + /* + * not currently supported + * int: priority + * int: descriptor + */ + + BC_REGISTER_LOOPER = _IO('c', 11), + /* + * No parameters. + * Register a spawned looper thread with the device. + */ + + BC_ENTER_LOOPER = _IO('c', 12), + BC_EXIT_LOOPER = _IO('c', 13), + /* + * No parameters. + * These two commands are sent as an application-level thread + * enters and exits the binder loop, respectively. They are + * used so the binder can have an accurate count of the number + * of looping threads it has available. + */ + + BC_REQUEST_DEATH_NOTIFICATION = _IOW('c', 14, + struct binder_handle_cookie), + /* + * int: handle + * void *: cookie + */ + + BC_CLEAR_DEATH_NOTIFICATION = _IOW('c', 15, + struct binder_handle_cookie), + /* + * int: handle + * void *: cookie + */ + + BC_DEAD_BINDER_DONE = _IOW('c', 16, binder_uintptr_t), + /* + * void *: cookie + */ +}; + +#endif /* _UAPI_LINUX_BINDER_H */ + -- cgit v1.2.3 From bc2d62a01b336983b8abd1cb2337f305352f29ef Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 16 Oct 2014 15:22:30 +0200 Subject: android: uapi: binder.h add types.h to .h file to pass the checker scripts, and provide a proper uapi .h file. Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/android/binder.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index dba4cef3a8d3..41420e341e75 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -20,6 +20,7 @@ #ifndef _UAPI_LINUX_BINDER_H #define _UAPI_LINUX_BINDER_H +#include #include #define B_PACK_CHARS(c1, c2, c3, c4) \ -- cgit v1.2.3 From 988568669d171774b96e59fe35ef575df7f8cffd Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Mon, 20 Oct 2014 13:20:45 +0300 Subject: cfg80211: Specify frame and reason code for NL80211_CMD_DEL_STATION The optional NL80211_ATTR_MGMT_SUBTYPE and NL80211_ATTR_REASON_CODE attributes can now be included in NL80211_CMD_DEL_STATION to indicate to the driver which frame (Deauthentication/Disassociation) and reason code in that frame should be used to indicate removal to the specific station. This is used by drivers that implement AP SME and generate those frames internally. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ include/uapi/linux/nl80211.h | 6 +++++- net/wireless/nl80211.c | 21 +++++++++++++++++++++ net/wireless/trace.h | 10 ++++++++-- 4 files changed, 39 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ebb69f671979..ed896c0b5b8b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -804,9 +804,14 @@ struct station_parameters { * Used to delete a station entry (or all stations). * * @mac: MAC address of the station to remove or NULL to remove all stations + * @subtype: Management frame subtype to use for indicating removal + * (10 = Disassociation, 12 = Deauthentication) + * @reason_code: Reason code for the Disassociation/Deauthentication frame */ struct station_del_parameters { const u8 *mac; + u8 subtype; + u16 reason_code; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 846071b0cde9..b553c48404d3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -227,7 +227,11 @@ * the interface identified by %NL80211_ATTR_IFINDEX. * @NL80211_CMD_DEL_STATION: Remove a station identified by %NL80211_ATTR_MAC * or, if no MAC address given, all stations, on the interface identified - * by %NL80211_ATTR_IFINDEX. + * by %NL80211_ATTR_IFINDEX. %NL80211_ATTR_MGMT_SUBTYPE and + * %NL80211_ATTR_REASON_CODE can optionally be used to specify which type + * of disconnection indication should be sent to the station + * (Deauthentication or Disassociation frame and reason code for that + * frame). * * @NL80211_CMD_GET_MPATH: Get mesh path attributes for mesh path to * destination %NL80211_ATTR_MAC on the interface identified by diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 40cf7b937926..0c0f2045e1f8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4414,6 +4414,27 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->del_station) return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_MGMT_SUBTYPE]) { + params.subtype = + nla_get_u8(info->attrs[NL80211_ATTR_MGMT_SUBTYPE]); + if (params.subtype != IEEE80211_STYPE_DISASSOC >> 4 && + params.subtype != IEEE80211_STYPE_DEAUTH >> 4) + return -EINVAL; + } else { + /* Default to Deauthentication frame */ + params.subtype = IEEE80211_STYPE_DEAUTH >> 4; + } + + if (info->attrs[NL80211_ATTR_REASON_CODE]) { + params.reason_code = + nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); + if (params.reason_code == 0) + return -EINVAL; /* 0 is reserved */ + } else { + /* Default to reason code 2 */ + params.reason_code = WLAN_REASON_PREV_AUTH_NOT_VALID; + } + return rdev_del_station(rdev, dev, ¶ms); } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index b1339400631d..cdb2c2ef1ae1 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -688,14 +688,20 @@ DECLARE_EVENT_CLASS(station_del, WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(sta_mac) + __field(u8, subtype) + __field(u16, reason_code) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, params->mac); + __entry->subtype = params->subtype; + __entry->reason_code = params->reason_code; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac)) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT + ", subtype: %u, reason_code: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), + __entry->subtype, __entry->reason_code) ); DEFINE_EVENT(station_del, rdev_del_station, -- cgit v1.2.3 From f974008f07a62171a9dede08250c9a35c2b2b986 Mon Sep 17 00:00:00 2001 From: Olivier Gay Date: Sat, 18 Oct 2014 01:53:39 +0200 Subject: HID: add keyboard input assist hid usages Add keyboard input assist controls usages from approved hid usage table request HUTTR42: http://www.usb.org/developers/hidpage/HUTRR42c.pdf Signed-off-by: Olivier Gay Acked-by: Dmitry Torokhov Signed-off-by: Jiri Kosina --- drivers/hid/hid-debug.c | 6 ++++++ drivers/hid/hid-input.c | 7 +++++++ include/uapi/linux/input.h | 7 +++++++ 3 files changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 84c3cb15ccdd..8bf61d295ffd 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -946,6 +946,12 @@ static const char *keys[KEY_MAX + 1] = { [KEY_BRIGHTNESS_MIN] = "BrightnessMin", [KEY_BRIGHTNESS_MAX] = "BrightnessMax", [KEY_BRIGHTNESS_AUTO] = "BrightnessAuto", + [KEY_KBDINPUTASSIST_PREV] = "KbdInputAssistPrev", + [KEY_KBDINPUTASSIST_NEXT] = "KbdInputAssistNext", + [KEY_KBDINPUTASSIST_PREVGROUP] = "KbdInputAssistPrevGroup", + [KEY_KBDINPUTASSIST_NEXTGROUP] = "KbdInputAssistNextGroup", + [KEY_KBDINPUTASSIST_ACCEPT] = "KbdInputAssistAccept", + [KEY_KBDINPUTASSIST_CANCEL] = "KbdInputAssistCancel", }; static const char *relatives[REL_MAX + 1] = { diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 2df7fddbd119..56c6c30f2c7d 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -862,6 +862,13 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x28b: map_key_clear(KEY_FORWARDMAIL); break; case 0x28c: map_key_clear(KEY_SEND); break; + case 0x2c7: map_key_clear(KEY_KBDINPUTASSIST_PREV); break; + case 0x2c8: map_key_clear(KEY_KBDINPUTASSIST_NEXT); break; + case 0x2c9: map_key_clear(KEY_KBDINPUTASSIST_PREVGROUP); break; + case 0x2ca: map_key_clear(KEY_KBDINPUTASSIST_NEXTGROUP); break; + case 0x2cb: map_key_clear(KEY_KBDINPUTASSIST_ACCEPT); break; + case 0x2cc: map_key_clear(KEY_KBDINPUTASSIST_CANCEL); break; + default: goto ignore; } break; diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 1874ebe9ac1e..a1d7e931ab72 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -739,6 +739,13 @@ struct input_keymap_entry { #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ +#define KEY_KBDINPUTASSIST_PREV 0x260 +#define KEY_KBDINPUTASSIST_NEXT 0x261 +#define KEY_KBDINPUTASSIST_PREVGROUP 0x262 +#define KEY_KBDINPUTASSIST_NEXTGROUP 0x263 +#define KEY_KBDINPUTASSIST_ACCEPT 0x264 +#define KEY_KBDINPUTASSIST_CANCEL 0x265 + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 -- cgit v1.2.3 From 723e73acd16d2ea08cdbd8b449b7bc69389b94d4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 22 Oct 2014 09:25:06 +0200 Subject: cfg80211: make WMM TSPEC support flag an nl80211 feature flag During the review of the corresponding wpa_supplicant patches we noticed that the only way for it to detect that this functionality is supported currently is to check for the command support. This can be misleading though, as the command was also designed to, in the future, support pure 802.11 TSPECs. Expose the WMM-TSPEC feature flag to nl80211 so later we can also expose an 802.11-TSPEC feature flag (if needed) to differentiate the two cases. Note: this change isn't needed in 3.18 as there's no driver there yet that supports the functionality at all. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 +----- include/uapi/linux/nl80211.h | 5 +++++ net/wireless/nl80211.c | 14 ++++++-------- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ed896c0b5b8b..77aa805d7e7c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2646,13 +2646,9 @@ struct cfg80211_ops { * @WIPHY_FLAG_SUPPORTS_5_10_MHZ: Device supports 5 MHz and 10 MHz channels. * @WIPHY_FLAG_HAS_CHANNEL_SWITCH: Device supports channel switch in * beaconing mode (AP, IBSS, Mesh, ...). - * @WIPHY_FLAG_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM - * TSPEC sessions (TID aka TSID 0-7) with the NL80211_CMD_ADD_TX_TS - * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it - * needs to be able to handle Block-Ack agreements and other things. */ enum wiphy_flags { - WIPHY_FLAG_SUPPORTS_WMM_ADMISSION = BIT(0), + /* use hole at 0 */ /* use hole at 1 */ /* use hole at 2 */ WIPHY_FLAG_NETNS_OK = BIT(3), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index b553c48404d3..be1d5def304d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4052,6 +4052,10 @@ enum nl80211_ap_sme_features { * multiplexing powersave, ie. can turn off all but one chain * and then wake the rest up as required after, for example, * rts/cts handshake. + * @NL80211_FEATURE_SUPPORTS_WMM_ADMISSION: the device supports setting up WMM + * TSPEC sessions (TID aka TSID 0-7) with the %NL80211_CMD_ADD_TX_TS + * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it + * needs to be able to handle Block-Ack agreements and other things. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4080,6 +4084,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_ACKTO_ESTIMATION = 1 << 23, NL80211_FEATURE_STATIC_SMPS = 1 << 24, NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, + NL80211_FEATURE_SUPPORTS_WMM_ADMISSION = 1 << 26, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d05fe6d6481d..d98d4ea27819 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1514,8 +1514,8 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (rdev->wiphy.flags & WIPHY_FLAG_HAS_CHANNEL_SWITCH) CMD(channel_switch, CHANNEL_SWITCH); CMD(set_qos_map, SET_QOS_MAP); - if (rdev->wiphy.flags & - WIPHY_FLAG_SUPPORTS_WMM_ADMISSION) + if (rdev->wiphy.features & + NL80211_FEATURE_SUPPORTS_WMM_ADMISSION) CMD(add_tx_ts, ADD_TX_TS); } /* add into the if now */ @@ -9557,7 +9557,7 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) u16 admitted_time = 0; int err; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) + if (!(rdev->wiphy.features & NL80211_FEATURE_SUPPORTS_WMM_ADMISSION)) return -EOPNOTSUPP; if (!info->attrs[NL80211_ATTR_TSID] || !info->attrs[NL80211_ATTR_MAC] || @@ -9573,12 +9573,10 @@ static int nl80211_add_tx_ts(struct sk_buff *skb, struct genl_info *info) return -EINVAL; /* WMM uses TIDs 0-7 even for TSPEC */ - if (tsid < IEEE80211_FIRST_TSPEC_TSID) { - if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_WMM_ADMISSION)) - return -EINVAL; - } else { + if (tsid >= IEEE80211_FIRST_TSPEC_TSID) { /* TODO: handle 802.11 TSPEC/admission control - * need more attributes for that (e.g. BA session requirement) + * need more attributes for that (e.g. BA session requirement); + * change the WMM adminssion test above to allow both then */ return -EINVAL; } -- cgit v1.2.3 From 0d7a855526dd672e114aff2ac22b60fc6f155b08 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:37 +0200 Subject: vfs: add RENAME_WHITEOUT This adds a new RENAME_WHITEOUT flag. This flag makes rename() create a whiteout of source. The whiteout creation is atomic relative to the rename. Signed-off-by: Miklos Szeredi --- fs/namei.c | 8 ++++++-- include/uapi/linux/fs.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/fs/namei.c b/fs/namei.c index d20191c0ebf5..42df664e95e5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4209,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ca1a11bb4443..3735fa0a6784 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -37,6 +37,7 @@ #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ struct fstrim_range { __u64 start; -- cgit v1.2.3 From 607ec6a5abb637642e5b8199828f39ceae83cfb7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 24 Oct 2014 08:57:01 -0200 Subject: Revert "[media] v4l2-dv-timings: fix a sparse warning" Sparse got a fix for that. Also, it is suspected that reverting this patch might cause compilation breakages on userspace. So, revert it. This reverts commit 5c2cacc1028917168b0f7650008dceaa6f7e3fe2. Requested-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-dv-timings.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 6a0764c89fcb..6c8f159e416e 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -21,8 +21,17 @@ #ifndef _V4L2_DV_TIMINGS_H #define _V4L2_DV_TIMINGS_H +#if __GNUC__ < 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ < 6)) +/* Sadly gcc versions older than 4.6 have a bug in how they initialize + anonymous unions where they require additional curly brackets. + This violates the C1x standard. This workaround adds the curly brackets + if needed. */ #define V4L2_INIT_BT_TIMINGS(_width, args...) \ { .bt = { _width , ## args } } +#else +#define V4L2_INIT_BT_TIMINGS(_width, args...) \ + .bt = { _width , ## args } +#endif /* CEA-861-E timings (i.e. standard HDTV timings) */ -- cgit v1.2.3 From e8f479b11268af3f206d1580f6b0d572d6ecb4f7 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 22 Oct 2014 12:23:05 -0700 Subject: cfg80211: support configuring vif mac addr on create This is useful when creating virtual interfaces. Keeps udev from mucking with things it shouldn't, since the default MAC is never seen by udev when specified on the cmd-line during creation. Signed-off-by: Ben Greear [check for feature flag in nl80211 to force drivers to set it] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 ++++++--- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 4 +++- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 39d7996b0609..f67948e18600 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -319,9 +319,12 @@ struct ieee80211_supported_band { /** * struct vif_params - describes virtual interface parameters * @use_4addr: use 4-address frames - * @macaddr: address to use for this virtual interface. This will only - * be used for non-netdevice interfaces. If this parameter is set - * to zero address the driver may determine the address as needed. + * @macaddr: address to use for this virtual interface. + * If this parameter is set to zero address the driver may + * determine the address as needed. + * This feature is only fully supported by drivers that enable the + * %NL80211_FEATURE_MAC_ON_CREATE flag. Others may support creating + ** only p2p devices with specified MAC. */ struct vif_params { int use_4addr; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index be1d5def304d..f7daae59248e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4056,6 +4056,9 @@ enum nl80211_ap_sme_features { * TSPEC sessions (TID aka TSID 0-7) with the %NL80211_CMD_ADD_TX_TS * command. Standard IEEE 802.11 TSPEC setup is not yet supported, it * needs to be able to handle Block-Ack agreements and other things. + * @NL80211_FEATURE_MAC_ON_CREATE: Device supports configuring + * the vif's MAC address upon creation. + * See 'macaddr' field in the vif_params (cfg80211.h). */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4085,6 +4088,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_STATIC_SMPS = 1 << 24, NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, NL80211_FEATURE_SUPPORTS_WMM_ADMISSION = 1 << 26, + NL80211_FEATURE_MAC_ON_CREATE = 1 << 27, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d98d4ea27819..12736a7cd506 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2605,7 +2605,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) !(rdev->wiphy.interface_modes & (1 << type))) return -EOPNOTSUPP; - if (type == NL80211_IFTYPE_P2P_DEVICE && info->attrs[NL80211_ATTR_MAC]) { + if ((type == NL80211_IFTYPE_P2P_DEVICE || + rdev->wiphy.features & NL80211_FEATURE_MAC_ON_CREATE) && + info->attrs[NL80211_ATTR_MAC]) { nla_memcpy(params.macaddr, info->attrs[NL80211_ATTR_MAC], ETH_ALEN); if (!is_valid_ether_addr(params.macaddr)) -- cgit v1.2.3 From e9105f1bead4ec3f64904564c7c6268185d6b363 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Fri, 17 Oct 2014 12:39:09 +0200 Subject: netfilter: nf_tables: add new expression nft_redir This new expression provides NAT in the redirect flavour, which is to redirect packets to local machine. Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_redir.h | 21 +++++++ include/uapi/linux/netfilter/nf_tables.h | 16 ++++++ net/ipv4/netfilter/Kconfig | 9 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nft_redir_ipv4.c | 77 +++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 9 +++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nft_redir_ipv6.c | 77 +++++++++++++++++++++++++ net/netfilter/Kconfig | 9 +++ net/netfilter/Makefile | 1 + net/netfilter/nft_redir.c | 98 ++++++++++++++++++++++++++++++++ 11 files changed, 319 insertions(+) create mode 100644 include/net/netfilter/nft_redir.h create mode 100644 net/ipv4/netfilter/nft_redir_ipv4.c create mode 100644 net/ipv6/netfilter/nft_redir_ipv6.c create mode 100644 net/netfilter/nft_redir.c (limited to 'include/uapi') diff --git a/include/net/netfilter/nft_redir.h b/include/net/netfilter/nft_redir.h new file mode 100644 index 000000000000..a2d67546afab --- /dev/null +++ b/include/net/netfilter/nft_redir.h @@ -0,0 +1,21 @@ +#ifndef _NFT_REDIR_H_ +#define _NFT_REDIR_H_ + +struct nft_redir { + enum nft_registers sreg_proto_min:8; + enum nft_registers sreg_proto_max:8; + u16 flags; +}; + +extern const struct nla_policy nft_redir_policy[]; + +int nft_redir_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]); + +int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr); + +int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data); + +#endif /* _NFT_REDIR_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index f31fe7b660a5..16f62a5cf04d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -837,6 +837,22 @@ enum nft_masq_attributes { }; #define NFTA_MASQ_MAX (__NFTA_MASQ_MAX - 1) +/** + * enum nft_redir_attributes - nf_tables redirect expression netlink attributes + * + * @NFTA_REDIR_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers) + * @NFTA_REDIR_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers) + * @NFTA_REDIR_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32) + */ +enum nft_redir_attributes { + NFTA_REDIR_UNSPEC, + NFTA_REDIR_REG_PROTO_MIN, + NFTA_REDIR_REG_PROTO_MAX, + NFTA_REDIR_FLAGS, + __NFTA_REDIR_MAX +}; +#define NFTA_REDIR_MAX (__NFTA_REDIR_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index a300e2c32b26..8358b2da1549 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -119,6 +119,15 @@ config NFT_MASQ_IPV4 This is the expression that provides IPv4 masquerading support for nf_tables. +config NFT_REDIR_IPV4 + tristate "IPv4 redirect support for nf_tables" + depends on NF_TABLES_IPV4 + depends on NFT_REDIR + select NF_NAT_REDIRECT_IPV4 + help + This is the expression that provides IPv4 redirect support for + nf_tables. + config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" depends on NF_CONNTRACK_SNMP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 34e436c92015..902bcd1597bb 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o +obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c new file mode 100644 index 000000000000..643c5967aa27 --- /dev/null +++ b/net/ipv4/netfilter/nft_redir_ipv4.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_redir_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_ipv4_multi_range_compat mr; + unsigned int verdict; + + memset(&mr, 0, sizeof(mr)); + if (priv->sreg_proto_min) { + mr.range[0].min.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + mr.range[0].max.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + mr.range[0].flags |= priv->flags; + + verdict = nf_nat_redirect_ipv4(pkt->skb, &mr, pkt->ops->hooknum); + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_redir_ipv4_type; +static const struct nft_expr_ops nft_redir_ipv4_ops = { + .type = &nft_redir_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_ipv4_eval, + .init = nft_redir_init, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "redir", + .ops = &nft_redir_ipv4_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_ipv4_module_init(void) +{ + return nft_register_expr(&nft_redir_ipv4_type); +} + +static void __exit nft_redir_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_redir_ipv4_type); +} + +module_init(nft_redir_ipv4_module_init); +module_exit(nft_redir_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 462eebbf4377..0dbe5c7953e5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -97,6 +97,15 @@ config NFT_MASQ_IPV6 This is the expression that provides IPv4 masquerading support for nf_tables. +config NFT_REDIR_IPV6 + tristate "IPv6 redirect support for nf_tables" + depends on NF_TABLES_IPV6 + depends on NFT_REDIR + select NF_NAT_REDIRECT_IPV6 + help + This is the expression that provides IPv4 redirect support for + nf_tables. + endif # NF_NAT_IPV6 config IP6_NF_IPTABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 6c2baab10f21..d2ac9f5f212c 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o +obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c new file mode 100644 index 000000000000..83420eeaad1c --- /dev/null +++ b/net/ipv6/netfilter/nft_redir_ipv6.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nft_redir_ipv6_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_redir *priv = nft_expr_priv(expr); + struct nf_nat_range range; + unsigned int verdict; + + memset(&range, 0, sizeof(range)); + if (priv->sreg_proto_min) { + range.min_proto.all = (__force __be16) + data[priv->sreg_proto_min].data[0]; + range.max_proto.all = (__force __be16) + data[priv->sreg_proto_max].data[0]; + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + range.flags |= priv->flags; + + verdict = nf_nat_redirect_ipv6(pkt->skb, &range, pkt->ops->hooknum); + data[NFT_REG_VERDICT].verdict = verdict; +} + +static struct nft_expr_type nft_redir_ipv6_type; +static const struct nft_expr_ops nft_redir_ipv6_ops = { + .type = &nft_redir_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), + .eval = nft_redir_ipv6_eval, + .init = nft_redir_init, + .dump = nft_redir_dump, + .validate = nft_redir_validate, +}; + +static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "redir", + .ops = &nft_redir_ipv6_ops, + .policy = nft_redir_policy, + .maxattr = NFTA_REDIR_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_redir_ipv6_module_init(void) +{ + return nft_register_expr(&nft_redir_ipv6_type); +} + +static void __exit nft_redir_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_redir_ipv6_type); +} + +module_init(nft_redir_ipv6_module_init); +module_exit(nft_redir_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 49deb4edbac6..373486ae4159 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -505,6 +505,15 @@ config NFT_MASQ This option adds the "masquerade" expression that you can use to perform NAT in the masquerade flavour. +config NFT_REDIR + depends on NF_TABLES + depends on NF_CONNTRACK + depends on NF_NAT + tristate "Netfilter nf_tables redirect support" + help + This options adds the "redirect" expression that you can use + to perform NAT in the redirect flavour. + config NFT_NAT depends on NF_TABLES depends on NF_CONNTRACK diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index a9571be3f791..f3eb4680f2ec 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o +obj-$(CONFIG_NFT_REDIR) += nft_redir.o # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c new file mode 100644 index 000000000000..e27b4e35718a --- /dev/null +++ b/net/netfilter/nft_redir.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014 Arturo Borrero Gonzalez + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { + [NFTA_REDIR_REG_PROTO_MIN] = { .type = NLA_U32 }, + [NFTA_REDIR_REG_PROTO_MAX] = { .type = NLA_U32 }, + [NFTA_REDIR_FLAGS] = { .type = NLA_U32 }, +}; +EXPORT_SYMBOL_GPL(nft_redir_policy); + +int nft_redir_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_redir *priv = nft_expr_priv(expr); + u32 nla_be32; + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + + if (tb[NFTA_REDIR_REG_PROTO_MIN]) { + nla_be32 = nla_get_be32(tb[NFTA_REDIR_REG_PROTO_MIN]); + priv->sreg_proto_min = ntohl(nla_be32); + err = nft_validate_input_register(priv->sreg_proto_min); + if (err < 0) + return err; + + if (tb[NFTA_REDIR_REG_PROTO_MAX]) { + nla_be32 = nla_get_be32(tb[NFTA_REDIR_REG_PROTO_MAX]); + priv->sreg_proto_max = ntohl(nla_be32); + err = nft_validate_input_register(priv->sreg_proto_max); + if (err < 0) + return err; + } else { + priv->sreg_proto_max = priv->sreg_proto_min; + } + } + + if (tb[NFTA_REDIR_FLAGS]) { + priv->flags = ntohl(nla_get_be32(tb[NFTA_REDIR_FLAGS])); + if (priv->flags & ~NF_NAT_RANGE_MASK) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_redir_init); + +int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_redir *priv = nft_expr_priv(expr); + + if (priv->sreg_proto_min) { + if (nla_put_be32(skb, NFTA_REDIR_REG_PROTO_MIN, + htonl(priv->sreg_proto_min))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_REDIR_REG_PROTO_MAX, + htonl(priv->sreg_proto_max))) + goto nla_put_failure; + } + + if (priv->flags != 0 && + nla_put_be32(skb, NFTA_REDIR_FLAGS, htonl(priv->flags))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} +EXPORT_SYMBOL_GPL(nft_redir_dump); + +int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); +} +EXPORT_SYMBOL_GPL(nft_redir_validate); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arturo Borrero Gonzalez "); -- cgit v1.2.3 From 958501163ddd6ea22a98f94fa0e7ce6d4734e5c4 Mon Sep 17 00:00:00 2001 From: Kyeyoon Park Date: Thu, 23 Oct 2014 14:49:17 -0700 Subject: bridge: Add support for IEEE 802.11 Proxy ARP This feature is defined in IEEE Std 802.11-2012, 10.23.13. It allows the AP devices to keep track of the hardware-address-to-IP-address mapping of the mobile devices within the WLAN network. The AP will learn this mapping via observing DHCP, ARP, and NS/NA frames. When a request for such information is made (i.e. ARP request, Neighbor Solicitation), the AP will respond on behalf of the associated mobile device. In the process of doing so, the AP will drop the multicast request frame that was intended to go out to the wireless medium. It was recommended at the LKS workshop to do this implementation in the bridge layer. vxlan.c is already doing something very similar. The DHCP snooping code will be added to the userspace application (hostapd) per the recommendation. This RFC commit is only for IPv4. A similar approach in the bridge layer will be taken for IPv6 as well. Signed-off-by: Kyeyoon Park Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 5 ++++ net/bridge/br_input.c | 60 ++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_netlink.c | 4 ++- net/bridge/br_private.h | 1 + net/bridge/br_sysfs_if.c | 2 ++ 6 files changed, 72 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 0bdb77e16875..7072d8325016 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -243,6 +243,7 @@ enum { IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ IFLA_BRPORT_LEARNING, /* mac learning */ IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 992ec49a96aa..1510b54e6a2e 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -183,6 +183,11 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood unicast traffic to ports that turn it off */ if (unicast && !(p->flags & BR_FLOOD)) continue; + + /* Do not flood to ports that enable proxy ARP */ + if (p->flags & BR_PROXYARP) + continue; + prev = maybe_deliver(prev, p, skb, __packet_hook); if (IS_ERR(prev)) goto out; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6fd5522df696..1f1de715197c 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include "br_private.h" @@ -57,6 +59,60 @@ static int br_pass_frame_up(struct sk_buff *skb) netif_receive_skb); } +static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid) +{ + struct net_device *dev = br->dev; + struct neighbour *n; + struct arphdr *parp; + u8 *arpptr, *sha; + __be32 sip, tip; + + if (dev->flags & IFF_NOARP) + return; + + if (!pskb_may_pull(skb, arp_hdr_len(dev))) { + dev->stats.tx_dropped++; + return; + } + parp = arp_hdr(skb); + + if (parp->ar_pro != htons(ETH_P_IP) || + parp->ar_op != htons(ARPOP_REQUEST) || + parp->ar_hln != dev->addr_len || + parp->ar_pln != 4) + return; + + arpptr = (u8 *)parp + sizeof(struct arphdr); + sha = arpptr; + arpptr += dev->addr_len; /* sha */ + memcpy(&sip, arpptr, sizeof(sip)); + arpptr += sizeof(sip); + arpptr += dev->addr_len; /* tha */ + memcpy(&tip, arpptr, sizeof(tip)); + + if (ipv4_is_loopback(tip) || + ipv4_is_multicast(tip)) + return; + + n = neigh_lookup(&arp_tbl, &tip, dev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = __br_fdb_get(br, n->ha, vid); + if (f) + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, + sha, n->ha, sha); + + neigh_release(n); + } +} + /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct sk_buff *skb) { @@ -98,6 +154,10 @@ int br_handle_frame_finish(struct sk_buff *skb) dst = NULL; if (is_broadcast_ether_addr(dest)) { + if (p->flags & BR_PROXYARP && + skb->protocol == htons(ETH_P_ARP)) + br_do_proxy_arp(skb, br, vid); + skb2 = skb; unicast = false; } else if (is_multicast_ether_addr(dest)) { diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ff9706647f2..86c239b06f6e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -60,7 +60,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_PROTECT, !!(p->flags & BR_ROOT_BLOCK)) || nla_put_u8(skb, IFLA_BRPORT_FAST_LEAVE, !!(p->flags & BR_MULTICAST_FAST_LEAVE)) || nla_put_u8(skb, IFLA_BRPORT_LEARNING, !!(p->flags & BR_LEARNING)) || - nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD))) + nla_put_u8(skb, IFLA_BRPORT_UNICAST_FLOOD, !!(p->flags & BR_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP))) return -EMSGSIZE; return 0; @@ -332,6 +333,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK); br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING); br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); + br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); if (tb[IFLA_BRPORT_COST]) { err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST])); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4d783d071305..8f3f08140258 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -172,6 +172,7 @@ struct net_bridge_port #define BR_FLOOD 0x00000040 #define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING) #define BR_PROMISC 0x00000080 +#define BR_PROXYARP 0x00000100 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING struct bridge_mcast_own_query ip4_own_query; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index e561cd59b8a6..2de5d91199e8 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -170,6 +170,7 @@ BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD); BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK); BRPORT_ATTR_FLAG(learning, BR_LEARNING); BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); +BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -213,6 +214,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_router, &brport_attr_multicast_fast_leave, #endif + &brport_attr_proxyarp, NULL }; -- cgit v1.2.3 From fcd964dda5ece2fa77f78f843bc3455348787282 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Tue, 7 Oct 2014 17:29:07 +0800 Subject: sched: Update comments for CLONE_NEWNS Signed-off-by: Chen Hanxiao Acked-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1412674147-8941-1-git-send-email-chenhanxiao@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 34f9d7387d13..b932be9f5c5b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -13,7 +13,7 @@ #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ #define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_NEWNS 0x00020000 /* New mount namespace group */ #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ -- cgit v1.2.3 From b438b1ab35507bbccc28d13f0b8286ffcf24019d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Oct 2014 22:16:36 -0700 Subject: perf: Fix typos in sample code in the perf_event.h header struct perf_event_mmap_page has members called "index" and "cap_user_rdpmc". Spell them correctly in the examples. Signed-off-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/320ba26391a8123cc16e5f02d24d34bd404332fd.1412313343.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9269de254874..9d845404d875 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -364,7 +364,7 @@ struct perf_event_mmap_page { /* * Bits needed to read the hw events in user-space. * - * u32 seq, time_mult, time_shift, idx, width; + * u32 seq, time_mult, time_shift, index, width; * u64 count, enabled, running; * u64 cyc, time_offset; * s64 pmc = 0; @@ -383,11 +383,11 @@ struct perf_event_mmap_page { * time_shift = pc->time_shift; * } * - * idx = pc->index; + * index = pc->index; * count = pc->offset; - * if (pc->cap_usr_rdpmc && idx) { + * if (pc->cap_user_rdpmc && index) { * width = pc->pmc_width; - * pmc = rdpmc(idx - 1); + * pmc = rdpmc(index - 1); * } * * barrier(); @@ -415,7 +415,7 @@ struct perf_event_mmap_page { }; /* - * If cap_usr_rdpmc this field provides the bit-width of the value + * If cap_user_rdpmc this field provides the bit-width of the value * read using the rdpmc() or equivalent instruction. This can be used * to sign extend the result like: * @@ -439,10 +439,10 @@ struct perf_event_mmap_page { * * Where time_offset,time_mult,time_shift and cyc are read in the * seqcount loop described above. This delta can then be added to - * enabled and possible running (if idx), improving the scaling: + * enabled and possible running (if index), improving the scaling: * * enabled += delta; - * if (idx) + * if (index) * running += delta; * * quot = count / running; -- cgit v1.2.3 From 7202da8b7f7131d25411d81aa557e28cd941c5b6 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 27 Oct 2014 11:37:36 +0200 Subject: ethtool, net/mlx4_en: Cable info, get_module_info/eeprom ethtool support Added support for get_module_info/get_module_eeprom ethtool support for cable info reading. Added new cable types enum in include/uapi/linux/ethtool.h for ethtool use. +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 83 +++++++++++++++++++++++++ include/uapi/linux/ethtool.h | 4 ++ 2 files changed, 87 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index ae83da9cd18a..279f4233de59 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1309,6 +1310,86 @@ static int mlx4_en_set_tunable(struct net_device *dev, return ret; } +static int mlx4_en_get_module_info(struct net_device *dev, + struct ethtool_modinfo *modinfo) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int ret; + u8 data[4]; + + /* Read first 2 bytes to get Module & REV ID */ + ret = mlx4_get_module_info(mdev->dev, priv->port, + 0/*offset*/, 2/*size*/, data); + if (ret < 2) + return -EIO; + + switch (data[0] /* identifier */) { + case MLX4_MODULE_ID_QSFP: + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + break; + case MLX4_MODULE_ID_QSFP_PLUS: + if (data[1] >= 0x3) { /* revision id */ + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + } else { + modinfo->type = ETH_MODULE_SFF_8436; + modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; + } + break; + case MLX4_MODULE_ID_QSFP28: + modinfo->type = ETH_MODULE_SFF_8636; + modinfo->eeprom_len = ETH_MODULE_SFF_8636_LEN; + break; + case MLX4_MODULE_ID_SFP: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; + default: + return -ENOSYS; + } + + return 0; +} + +static int mlx4_en_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int offset = ee->offset; + int i = 0, ret; + + if (ee->len == 0) + return -EINVAL; + + memset(data, 0, ee->len); + + while (i < ee->len) { + en_dbg(DRV, priv, + "mlx4_get_module_info i(%d) offset(%d) len(%d)\n", + i, offset, ee->len - i); + + ret = mlx4_get_module_info(mdev->dev, priv->port, + offset, ee->len - i, data + i); + + if (!ret) /* Done reading */ + return 0; + + if (ret < 0) { + en_err(priv, + "mlx4_get_module_info i(%d) offset(%d) bytes_to_read(%d) - FAILED (0x%x)\n", + i, offset, ee->len - i, ret); + return 0; + } + + i += ret; + offset += ret; + } + return 0; +} const struct ethtool_ops mlx4_en_ethtool_ops = { .get_drvinfo = mlx4_en_get_drvinfo, @@ -1341,6 +1422,8 @@ const struct ethtool_ops mlx4_en_ethtool_ops = { .get_priv_flags = mlx4_en_get_priv_flags, .get_tunable = mlx4_en_get_tunable, .set_tunable = mlx4_en_set_tunable, + .get_module_info = mlx4_en_get_module_info, + .get_module_eeprom = mlx4_en_get_module_eeprom }; diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 99b43056a6fe..b6acd78b821c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1343,6 +1343,10 @@ enum ethtool_sfeatures_retval_bits { #define ETH_MODULE_SFF_8079_LEN 256 #define ETH_MODULE_SFF_8472 0x2 #define ETH_MODULE_SFF_8472_LEN 512 +#define ETH_MODULE_SFF_8636 0x3 +#define ETH_MODULE_SFF_8636_LEN 256 +#define ETH_MODULE_SFF_8436 0x4 +#define ETH_MODULE_SFF_8436_LEN 256 /* Reset flags */ /* The reset() operation must clear the flags for the components which -- cgit v1.2.3 From dcf972a334dd76975bf144ca57350c1f3132c947 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 27 Oct 2014 11:37:39 +0200 Subject: ethtool, net/mlx4_en: Add 100M, 20G, 56G speeds ethtool reporting support Added 100M, 20G and 56G ethtool speed reporting support. Update mlx4_en_test_speed self test with the new speeds. Defined new link speeds in include/uapi/linux/ethtool.h: +#define SPEED_20000 20000 +#define SPEED_40000 40000 +#define SPEED_56000 56000 Signed-off-by: Saeed Mahameed Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_port.c | 15 ++++++++++++--- drivers/net/ethernet/mellanox/mlx4/en_port.h | 9 ++++++--- drivers/net/ethernet/mellanox/mlx4/en_selftest.c | 12 ++++++++---- include/uapi/linux/ethtool.h | 14 +++++++++++++- 4 files changed, 39 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.c b/drivers/net/ethernet/mellanox/mlx4/en_port.c index 0a0261d128b9..afd303662d17 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_port.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.c @@ -91,15 +91,24 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) * already synchronized, no need in locking */ state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK); switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) { + case MLX4_EN_100M_SPEED: + state->link_speed = SPEED_100; + break; case MLX4_EN_1G_SPEED: - state->link_speed = 1000; + state->link_speed = SPEED_1000; break; case MLX4_EN_10G_SPEED_XAUI: case MLX4_EN_10G_SPEED_XFI: - state->link_speed = 10000; + state->link_speed = SPEED_10000; + break; + case MLX4_EN_20G_SPEED: + state->link_speed = SPEED_20000; break; case MLX4_EN_40G_SPEED: - state->link_speed = 40000; + state->link_speed = SPEED_40000; + break; + case MLX4_EN_56G_SPEED: + state->link_speed = SPEED_56000; break; default: state->link_speed = -1; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_port.h b/drivers/net/ethernet/mellanox/mlx4/en_port.h index 745090b49d9e..a5fc93bfe5da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_port.h +++ b/drivers/net/ethernet/mellanox/mlx4/en_port.h @@ -54,10 +54,13 @@ enum { }; enum { - MLX4_EN_1G_SPEED = 0x02, - MLX4_EN_10G_SPEED_XFI = 0x01, + MLX4_EN_100M_SPEED = 0x04, MLX4_EN_10G_SPEED_XAUI = 0x00, + MLX4_EN_10G_SPEED_XFI = 0x01, + MLX4_EN_1G_SPEED = 0x02, + MLX4_EN_20G_SPEED = 0x08, MLX4_EN_40G_SPEED = 0x40, + MLX4_EN_56G_SPEED = 0x20, MLX4_EN_OTHER_SPEED = 0x0f, }; @@ -68,7 +71,7 @@ struct mlx4_en_query_port_context { __be16 mtu; u8 reserved2; u8 link_speed; -#define MLX4_EN_SPEED_MASK 0x43 +#define MLX4_EN_SPEED_MASK 0x6f u16 reserved3[5]; __be64 mac; u8 transceiver; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c index 49d5afc7cfb8..2d8ee66138e8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_selftest.c @@ -129,11 +129,15 @@ static int mlx4_en_test_speed(struct mlx4_en_priv *priv) if (mlx4_en_QUERY_PORT(priv->mdev, priv->port)) return -ENOMEM; - /* The device supports 1G, 10G and 40G speeds */ - if (priv->port_state.link_speed != 1000 && - priv->port_state.link_speed != 10000 && - priv->port_state.link_speed != 40000) + /* The device supports 100M, 1G, 10G, 20G, 40G and 56G speed */ + if (priv->port_state.link_speed != SPEED_100 && + priv->port_state.link_speed != SPEED_1000 && + priv->port_state.link_speed != SPEED_10000 && + priv->port_state.link_speed != SPEED_20000 && + priv->port_state.link_speed != SPEED_40000 && + priv->port_state.link_speed != SPEED_56000) return priv->port_state.link_speed; + return 0; } diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b6acd78b821c..eb2095b42fbb 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1213,6 +1213,10 @@ enum ethtool_sfeatures_retval_bits { #define SUPPORTED_40000baseCR4_Full (1 << 24) #define SUPPORTED_40000baseSR4_Full (1 << 25) #define SUPPORTED_40000baseLR4_Full (1 << 26) +#define SUPPORTED_56000baseKR4_Full (1 << 27) +#define SUPPORTED_56000baseCR4_Full (1 << 28) +#define SUPPORTED_56000baseSR4_Full (1 << 29) +#define SUPPORTED_56000baseLR4_Full (1 << 30) #define ADVERTISED_10baseT_Half (1 << 0) #define ADVERTISED_10baseT_Full (1 << 1) @@ -1241,6 +1245,10 @@ enum ethtool_sfeatures_retval_bits { #define ADVERTISED_40000baseCR4_Full (1 << 24) #define ADVERTISED_40000baseSR4_Full (1 << 25) #define ADVERTISED_40000baseLR4_Full (1 << 26) +#define ADVERTISED_56000baseKR4_Full (1 << 27) +#define ADVERTISED_56000baseCR4_Full (1 << 28) +#define ADVERTISED_56000baseSR4_Full (1 << 29) +#define ADVERTISED_56000baseLR4_Full (1 << 30) /* The following are all involved in forcing a particular link * mode for the device for setting things. When getting the @@ -1248,12 +1256,16 @@ enum ethtool_sfeatures_retval_bits { * it was forced up into this mode or autonegotiated. */ -/* The forced speed, 10Mb, 100Mb, gigabit, 2.5Gb, 10GbE. */ +/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|10|20|40|56]GbE. */ #define SPEED_10 10 #define SPEED_100 100 #define SPEED_1000 1000 #define SPEED_2500 2500 #define SPEED_10000 10000 +#define SPEED_20000 20000 +#define SPEED_40000 40000 +#define SPEED_56000 56000 + #define SPEED_UNKNOWN -1 /* Duplex, half or full. */ -- cgit v1.2.3 From f533ccb61edf008e14c9e1b91b48cd2c7397f33d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 29 Oct 2014 08:22:05 +0100 Subject: ALSA: doc: Fix uapi/sound/compress_offload.h kerneldoc comments so that make htmldocs works properly. Since kerneldoc can't handle noname enum properly, name enum sndrv_compress_encoder. Signed-off-by: Takashi Iwai --- include/uapi/sound/compress_offload.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/compress_offload.h b/include/uapi/sound/compress_offload.h index 1964026b5e09..22ed8cb7800b 100644 --- a/include/uapi/sound/compress_offload.h +++ b/include/uapi/sound/compress_offload.h @@ -32,7 +32,7 @@ #define SNDRV_COMPRESS_VERSION SNDRV_PROTOCOL_VERSION(0, 1, 2) /** - * struct snd_compressed_buffer: compressed buffer + * struct snd_compressed_buffer - compressed buffer * @fragment_size: size of buffer fragment in bytes * @fragments: number of such fragments */ @@ -42,7 +42,7 @@ struct snd_compressed_buffer { } __attribute__((packed, aligned(4))); /** - * struct snd_compr_params: compressed stream params + * struct snd_compr_params - compressed stream params * @buffer: buffer description * @codec: codec parameters * @no_wake_mode: dont wake on fragment elapsed @@ -54,7 +54,7 @@ struct snd_compr_params { } __attribute__((packed, aligned(4))); /** - * struct snd_compr_tstamp: timestamp descriptor + * struct snd_compr_tstamp - timestamp descriptor * @byte_offset: Byte offset in ring buffer to DSP * @copied_total: Total number of bytes copied from/to ring buffer to/by DSP * @pcm_frames: Frames decoded or encoded by DSP. This field will evolve by @@ -73,7 +73,7 @@ struct snd_compr_tstamp { } __attribute__((packed, aligned(4))); /** - * struct snd_compr_avail: avail descriptor + * struct snd_compr_avail - avail descriptor * @avail: Number of bytes available in ring buffer for writing/reading * @tstamp: timestamp infomation */ @@ -88,7 +88,7 @@ enum snd_compr_direction { }; /** - * struct snd_compr_caps: caps descriptor + * struct snd_compr_caps - caps descriptor * @codecs: pointer to array of codecs * @direction: direction supported. Of type snd_compr_direction * @min_fragment_size: minimum fragment supported by DSP @@ -110,7 +110,7 @@ struct snd_compr_caps { } __attribute__((packed, aligned(4))); /** - * struct snd_compr_codec_caps: query capability of codec + * struct snd_compr_codec_caps - query capability of codec * @codec: codec for which capability is queried * @num_descriptors: number of codec descriptors * @descriptor: array of codec capability descriptor @@ -122,18 +122,19 @@ struct snd_compr_codec_caps { } __attribute__((packed, aligned(4))); /** + * enum sndrv_compress_encoder * @SNDRV_COMPRESS_ENCODER_PADDING: no of samples appended by the encoder at the * end of the track * @SNDRV_COMPRESS_ENCODER_DELAY: no of samples inserted by the encoder at the * beginning of the track */ -enum { +enum sndrv_compress_encoder { SNDRV_COMPRESS_ENCODER_PADDING = 1, SNDRV_COMPRESS_ENCODER_DELAY = 2, }; /** - * struct snd_compr_metadata: compressed stream metadata + * struct snd_compr_metadata - compressed stream metadata * @key: key id * @value: key value */ -- cgit v1.2.3 From 7fd2561e4ebdd070ebba6d3326c4c5b13942323f Mon Sep 17 00:00:00 2001 From: Erik Kline Date: Tue, 28 Oct 2014 18:11:14 +0900 Subject: net: ipv6: Add a sysctl to make optimistic addresses useful candidates Add a sysctl that causes an interface's optimistic addresses to be considered equivalent to other non-deprecated addresses for source address selection purposes. Preferred addresses will still take precedence over optimistic addresses, subject to other ranking in the source address selection algorithm. This is useful where different interfaces are connected to different networks from different ISPs (e.g., a cell network and a home wifi network). The current behaviour complies with RFC 3484/6724, and it makes sense if the host has only one interface, or has multiple interfaces on the same network (same or cooperating administrative domain(s), but not in the multiple distinct networks case. For example, if a mobile device has an IPv6 address on an LTE network and then connects to IPv6-enabled wifi, while the wifi IPv6 address is undergoing DAD, IPv6 connections will try use the wifi default route with the LTE IPv6 address, and will get stuck until they time out. Also, because optimistic nodes can receive frames, issue an RTM_NEWADDR as soon as DAD starts (with the IFA_F_OPTIMSTIC flag appropriately set). A second RTM_NEWADDR is sent if DAD completes (the address flags have changed), otherwise an RTM_DELADDR is sent. Also: add an entry in ip-sysctl.txt for optimistic_dad. Signed-off-by: Erik Kline Acked-by: Lorenzo Colitti Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 ++++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 46 ++++++++++++++++++++++++++++++++-- 4 files changed, 59 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 9028b879a97b..368e3251c553 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1460,6 +1460,19 @@ suppress_frag_ndisc - INTEGER 1 - (default) discard fragmented neighbor discovery packets 0 - allow fragmented neighbor discovery packets +optimistic_dad - BOOLEAN + Whether to perform Optimistic Duplicate Address Detection (RFC 4429). + 0: disabled (default) + 1: enabled + +use_optimistic - BOOLEAN + If enabled, do not classify optimistic addresses as deprecated during + source address selection. Preferred addresses will still be chosen + before optimistic addresses, subject to other ranking in the source + address selection algorithm. + 0: disabled (default) + 1: enabled + icmp/*: ratelimit - INTEGER Limit the maximal rates for sending ICMPv6 packets. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ff560537dd61..7121a2e97ce2 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -42,6 +42,7 @@ struct ipv6_devconf { __s32 accept_ra_from_local; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD __s32 optimistic_dad; + __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE __s32 mc_forwarding; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index efa2666f4b8a..e863d088b9a5 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -164,6 +164,7 @@ enum { DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL, DEVCONF_SUPPRESS_FRAG_NDISC, DEVCONF_ACCEPT_RA_FROM_LOCAL, + DEVCONF_USE_OPTIMISTIC, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 50b95b2db87c..8d12b7c7018f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1170,6 +1170,9 @@ enum { IPV6_SADDR_RULE_PRIVACY, IPV6_SADDR_RULE_ORCHID, IPV6_SADDR_RULE_PREFIX, +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + IPV6_SADDR_RULE_NOT_OPTIMISTIC, +#endif IPV6_SADDR_RULE_MAX }; @@ -1197,6 +1200,15 @@ static inline int ipv6_saddr_preferred(int type) return 0; } +static inline bool ipv6_use_optimistic_addr(struct inet6_dev *idev) +{ +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + return idev && idev->cnf.optimistic_dad && idev->cnf.use_optimistic; +#else + return false; +#endif +} + static int ipv6_get_saddr_eval(struct net *net, struct ipv6_saddr_score *score, struct ipv6_saddr_dst *dst, @@ -1257,10 +1269,16 @@ static int ipv6_get_saddr_eval(struct net *net, score->scopedist = ret; break; case IPV6_SADDR_RULE_PREFERRED: + { /* Rule 3: Avoid deprecated and optimistic addresses */ + u8 avoid = IFA_F_DEPRECATED; + + if (!ipv6_use_optimistic_addr(score->ifa->idev)) + avoid |= IFA_F_OPTIMISTIC; ret = ipv6_saddr_preferred(score->addr_type) || - !(score->ifa->flags & (IFA_F_DEPRECATED|IFA_F_OPTIMISTIC)); + !(score->ifa->flags & avoid); break; + } #ifdef CONFIG_IPV6_MIP6 case IPV6_SADDR_RULE_HOA: { @@ -1306,6 +1324,14 @@ static int ipv6_get_saddr_eval(struct net *net, ret = score->ifa->prefix_len; score->matchlen = ret; break; +#ifdef CONFIG_IPV6_OPTIMISTIC_DAD + case IPV6_SADDR_RULE_NOT_OPTIMISTIC: + /* Optimistic addresses still have lower precedence than other + * preferred addresses. + */ + ret = !(score->ifa->flags & IFA_F_OPTIMISTIC); + break; +#endif default: ret = 0; } @@ -3222,8 +3248,15 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) * Optimistic nodes can start receiving * Frames right away */ - if (ifp->flags & IFA_F_OPTIMISTIC) + if (ifp->flags & IFA_F_OPTIMISTIC) { ip6_ins_rt(ifp->rt); + if (ipv6_use_optimistic_addr(idev)) { + /* Because optimistic nodes can use this address, + * notify listeners. If DAD fails, RTM_DELADDR is sent. + */ + ipv6_ifa_notify(RTM_NEWADDR, ifp); + } + } addrconf_dad_kick(ifp); out: @@ -4330,6 +4363,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad; + array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; @@ -5155,6 +5189,14 @@ static struct addrconf_sysctl_table .proc_handler = proc_dointvec, }, + { + .procname = "use_optimistic", + .data = &ipv6_devconf.use_optimistic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, #endif #ifdef CONFIG_IPV6_MROUTE { -- cgit v1.2.3 From 7f05db6a20fe4d85bada20d365c78029831b9de1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 12 Oct 2014 11:34:00 +0300 Subject: kvm: drop unsupported capabilities, fix documentation No kernel ever reported KVM_CAP_DEVICE_MSIX, KVM_CAP_DEVICE_MSI, KVM_CAP_DEVICE_ASSIGNMENT, KVM_CAP_DEVICE_DEASSIGNMENT. This makes the documentation wrong, and no application ever written to use these capabilities has a chance to work correctly. The only way to detect support is to try, and test errno for ENOTTY. That's unfortunate, but we can't fix the past. Document the actual semantics, and drop the definitions from the exported header to make it easier for application developers to note and fix the bug. Signed-off-by: Michael S. Tsirkin Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 40 ++++++++++++++++++++++++++++++++------- include/uapi/linux/kvm.h | 8 -------- 2 files changed, 33 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7610eaa4d491..7a943c23db1c 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -68,9 +68,12 @@ description: Capability: which KVM extension provides this ioctl. Can be 'basic', which means that is will be provided by any kernel that supports - API version 12 (see section 4.1), or a KVM_CAP_xyz constant, which + API version 12 (see section 4.1), a KVM_CAP_xyz constant, which means availability needs to be checked with KVM_CHECK_EXTENSION - (see section 4.4). + (see section 4.4), or 'none' which means that while not all kernels + support this ioctl, there's no capability bit to check its + availability: for kernels that don't support the ioctl, + the ioctl returns -ENOTTY. Architectures: which instruction set architectures provide this ioctl. x86 includes both i386 and x86_64. @@ -1257,7 +1260,7 @@ The flags bitmap is defined as: 4.48 KVM_ASSIGN_PCI_DEVICE -Capability: KVM_CAP_DEVICE_ASSIGNMENT +Capability: none Architectures: x86 ia64 Type: vm ioctl Parameters: struct kvm_assigned_pci_dev (in) @@ -1298,10 +1301,16 @@ Only PCI header type 0 devices with PCI BAR resources are supported by device assignment. The user requesting this ioctl must have read/write access to the PCI sysfs resource files associated with the device. +Errors: + ENOTTY: kernel does not support this ioctl + + Other error conditions may be defined by individual device types or + have their standard meanings. + 4.49 KVM_DEASSIGN_PCI_DEVICE -Capability: KVM_CAP_DEVICE_DEASSIGNMENT +Capability: none Architectures: x86 ia64 Type: vm ioctl Parameters: struct kvm_assigned_pci_dev (in) @@ -1309,9 +1318,14 @@ Returns: 0 on success, -1 on error Ends PCI device assignment, releasing all associated resources. -See KVM_CAP_DEVICE_ASSIGNMENT for the data structure. Only assigned_dev_id is +See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is used in kvm_assigned_pci_dev to identify the device. +Errors: + ENOTTY: kernel does not support this ioctl + + Other error conditions may be defined by individual device types or + have their standard meanings. 4.50 KVM_ASSIGN_DEV_IRQ @@ -1346,6 +1360,12 @@ The following flags are defined: It is not valid to specify multiple types per host or guest IRQ. However, the IRQ type of host and guest can differ or can even be null. +Errors: + ENOTTY: kernel does not support this ioctl + + Other error conditions may be defined by individual device types or + have their standard meanings. + 4.51 KVM_DEASSIGN_DEV_IRQ @@ -1423,7 +1443,7 @@ struct kvm_irq_routing_s390_adapter { 4.53 KVM_ASSIGN_SET_MSIX_NR -Capability: KVM_CAP_DEVICE_MSIX +Capability: none Architectures: x86 ia64 Type: vm ioctl Parameters: struct kvm_assigned_msix_nr (in) @@ -1445,7 +1465,7 @@ struct kvm_assigned_msix_nr { 4.54 KVM_ASSIGN_SET_MSIX_ENTRY -Capability: KVM_CAP_DEVICE_MSIX +Capability: none Architectures: x86 ia64 Type: vm ioctl Parameters: struct kvm_assigned_msix_entry (in) @@ -1461,6 +1481,12 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +Errors: + ENOTTY: kernel does not support this ioctl + + Other error conditions may be defined by individual device types or + have their standard meanings. + 4.55 KVM_SET_TSC_KHZ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 60768822b140..6d59e5b39c9c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -647,11 +647,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MP_STATE 14 #define KVM_CAP_COALESCED_MMIO 15 #define KVM_CAP_SYNC_MMU 16 /* Changes to host mmap are reflected in guest */ -#define KVM_CAP_DEVICE_ASSIGNMENT 17 #define KVM_CAP_IOMMU 18 -#ifdef __KVM_HAVE_MSI -#define KVM_CAP_DEVICE_MSI 20 -#endif /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21 #define KVM_CAP_USER_NMI 22 @@ -663,10 +659,6 @@ struct kvm_ppc_smmu_info { #endif #define KVM_CAP_IRQ_ROUTING 25 #define KVM_CAP_IRQ_INJECT_STATUS 26 -#define KVM_CAP_DEVICE_DEASSIGNMENT 27 -#ifdef __KVM_HAVE_MSIX -#define KVM_CAP_DEVICE_MSIX 28 -#endif #define KVM_CAP_ASSIGN_DEV_IRQ 29 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 -- cgit v1.2.3 From 7071cf7fc435ab84df872613f613a9a055964fc1 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sun, 2 Nov 2014 11:31:41 -0800 Subject: uapi: add missing network related headers to kbuild The makefile for sanitizing kernel headers uses the kbuild file to determine which files to do. Several networking related headers were missing. Without these headers iproute2 build would break. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index b70237e8bc37..4c94f31a8c99 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -125,6 +125,7 @@ header-y += filter.h header-y += firewire-cdev.h header-y += firewire-constants.h header-y += flat.h +header-y += fou.h header-y += fs.h header-y += fsl_hypervisor.h header-y += fuse.h @@ -141,6 +142,7 @@ header-y += hid.h header-y += hiddev.h header-y += hidraw.h header-y += hpet.h +header-y += hsr_netlink.h header-y += hyperv.h header-y += hysdn_if.h header-y += i2c-dev.h @@ -251,6 +253,7 @@ header-y += mii.h header-y += minix_fs.h header-y += mman.h header-y += mmtimer.h +header-y += mpls.h header-y += mqueue.h header-y += mroute.h header-y += mroute6.h @@ -424,6 +427,7 @@ header-y += virtio_net.h header-y += virtio_pci.h header-y += virtio_ring.h header-y += virtio_rng.h +header=y += vm_sockets.h header-y += vt.h header-y += wait.h header-y += wanrouter.h -- cgit v1.2.3 From 6e0bd6c35b021dc73a81ebd1ef79761233c48b50 Mon Sep 17 00:00:00 2001 From: Rostislav Lisovy Date: Mon, 3 Nov 2014 10:33:18 +0100 Subject: cfg80211: 802.11p OCB mode handling This patch adds new iface type (NL80211_IFTYPE_OCB) representing the OCB (Outside the Context of a BSS) mode. When establishing a connection to the network a cfg80211_join_ocb function is called (particular nl80211_command is added as well). A mandatory parameters during the ocb_join operation are 'center frequency' and 'channel width (5/10 MHz)'. Changes done in mac80211 are minimal possible required to avoid many warnings (warning: enumeration value 'NL80211_IFTYPE_OCB' not handled in switch) during compilation. Full functionality (where needed) is added in the following patch. Signed-off-by: Rostislav Lisovy Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 19 ++++++++++ include/uapi/linux/nl80211.h | 11 ++++++ net/mac80211/cfg.c | 1 + net/mac80211/chan.c | 2 + net/mac80211/iface.c | 5 +++ net/mac80211/util.c | 3 ++ net/wireless/Makefile | 2 +- net/wireless/chan.c | 8 ++++ net/wireless/core.c | 3 ++ net/wireless/core.h | 12 ++++++ net/wireless/nl80211.c | 39 ++++++++++++++++++++ net/wireless/ocb.c | 88 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 21 +++++++++++ net/wireless/trace.h | 21 +++++++++++ net/wireless/util.c | 5 ++- 15 files changed, 238 insertions(+), 2 deletions(-) create mode 100644 net/wireless/ocb.c (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f67948e18600..5c3acd07acd9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1358,6 +1358,16 @@ struct mesh_setup { u32 basic_rates; }; +/** + * struct ocb_setup - 802.11p OCB mode setup configuration + * @chandef: defines the channel to use + * + * These parameters are fixed when connecting to the network + */ +struct ocb_setup { + struct cfg80211_chan_def chandef; +}; + /** * struct ieee80211_txq_params - TX queue parameters * @ac: AC identifier @@ -2352,6 +2362,11 @@ struct cfg80211_qos_map { * with the peer followed by immediate teardown when the addition is later * rejected) * @del_tx_ts: remove an existing TX TS + * + * @join_ocb: join the OCB network with the specified parameters + * (invoked with the wireless_dev mutex held) + * @leave_ocb: leave the current OCB network + * (invoked with the wireless_dev mutex held) */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2433,6 +2448,10 @@ struct cfg80211_ops { const struct mesh_setup *setup); int (*leave_mesh)(struct wiphy *wiphy, struct net_device *dev); + int (*join_ocb)(struct wiphy *wiphy, struct net_device *dev, + struct ocb_setup *setup); + int (*leave_ocb)(struct wiphy *wiphy, struct net_device *dev); + int (*change_bss)(struct wiphy *wiphy, struct net_device *dev, struct bss_parameters *params); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f7daae59248e..9b3025e4377a 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -746,6 +746,11 @@ * destination %NL80211_ATTR_MAC on the interface identified by * %NL80211_ATTR_IFINDEX. * + * @NL80211_CMD_JOIN_OCB: Join the OCB network. The center frequency and + * bandwidth of a channel must be given. + * @NL80211_CMD_LEAVE_OCB: Leave the OCB network -- no special arguments, the + * network is determined by the network interface. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -922,6 +927,9 @@ enum nl80211_commands { NL80211_CMD_GET_MPP, + NL80211_CMD_JOIN_OCB, + NL80211_CMD_LEAVE_OCB, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2074,6 +2082,8 @@ enum nl80211_attrs { * and therefore can't be created in the normal ways, use the * %NL80211_CMD_START_P2P_DEVICE and %NL80211_CMD_STOP_P2P_DEVICE * commands to create and destroy one + * @NL80211_IF_TYPE_OCB: Outside Context of a BSS + * This mode corresponds to the MIB variable dot11OCBActivated=true * @NL80211_IFTYPE_MAX: highest interface type number currently defined * @NUM_NL80211_IFTYPES: number of defined interface types * @@ -2093,6 +2103,7 @@ enum nl80211_iftype { NL80211_IFTYPE_P2P_CLIENT, NL80211_IFTYPE_P2P_GO, NL80211_IFTYPE_P2P_DEVICE, + NL80211_IFTYPE_OCB, /* keep last */ NUM_NL80211_IFTYPES, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b9659b8b70f8..1e2afc95ad09 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -230,6 +230,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_OCB: /* shouldn't happen */ WARN_ON_ONCE(1); break; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index ee71bb6f64f7..ff1f877e3b63 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -270,6 +270,7 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: width = vif->bss_conf.chandef.width; break; case NL80211_IFTYPE_UNSPECIFIED: @@ -909,6 +910,7 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata) case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: ieee80211_queue_work(&sdata->local->hw, &sdata->csa_finalize_work); break; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1ffcc0701244..d69e7532095f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -521,6 +521,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_P2P_DEVICE: + case NL80211_IFTYPE_OCB: /* no special treatment */ break; case NL80211_IFTYPE_UNSPECIFIED: @@ -631,6 +632,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: netif_carrier_off(dev); break; case NL80211_IFTYPE_WDS: @@ -1351,6 +1353,9 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.bssid = sdata->u.mgd.bssid; ieee80211_sta_setup_sdata(sdata); break; + case NL80211_IFTYPE_OCB: + /* to be implemented in the future */ + break; case NL80211_IFTYPE_ADHOC: sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; ieee80211_ibss_setup_sdata(sdata); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 666aa1306c45..d7d69c89ff34 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1841,6 +1841,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_bss_info_change_notify(sdata, changed); sdata_unlock(sdata); break; + case NL80211_IFTYPE_OCB: + /* to be implemented in the future */ + break; case NL80211_IFTYPE_ADHOC: changed |= BSS_CHANGED_IBSS; /* fall through */ diff --git a/net/wireless/Makefile b/net/wireless/Makefile index a761670af31d..4c9e39f04ef8 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_WEXT_SPY) += wext-spy.o obj-$(CONFIG_WEXT_PRIV) += wext-priv.o cfg80211-y += core.o sysfs.o radiotap.o util.o reg.o scan.o nl80211.o -cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o +cfg80211-y += mlme.o ibss.o sme.o chan.o ethtool.o mesh.o ap.o trace.o ocb.o cfg80211-$(CONFIG_CFG80211_DEBUGFS) += debugfs.o cfg80211-$(CONFIG_CFG80211_WEXT) += wext-compat.o wext-sme.o cfg80211-$(CONFIG_CFG80211_INTERNAL_REGDB) += regdb.o diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 8f39e33e71bb..85506f1d0789 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -366,6 +366,7 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, break; case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: @@ -892,6 +893,13 @@ cfg80211_get_chan_state(struct wireless_dev *wdev, *radar_detect |= BIT(wdev->chandef.width); } return; + case NL80211_IFTYPE_OCB: + if (wdev->chandef.chan) { + *chan = wdev->chandef.chan; + *chanmode = CHAN_MODE_SHARED; + return; + } + break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: diff --git a/net/wireless/core.c b/net/wireless/core.c index da4dcb65ade4..a4d27927aba2 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -869,6 +869,9 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, case NL80211_IFTYPE_P2P_GO: __cfg80211_stop_ap(rdev, dev, true); break; + case NL80211_IFTYPE_OCB: + __cfg80211_leave_ocb(rdev, dev); + break; case NL80211_IFTYPE_WDS: /* must be handled by mac80211/driver, has no APIs */ break; diff --git a/net/wireless/core.h b/net/wireless/core.h index 7e3a3cef7df9..61ee664cf2bd 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -290,6 +290,18 @@ int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); +/* OCB */ +int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ocb_setup *setup); +int cfg80211_join_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ocb_setup *setup); +int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev); +int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev); + /* AP */ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, bool notify); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f7d918858d32..1a31736914e5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -885,6 +885,7 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) return -ENOLINK; break; case NL80211_IFTYPE_UNSPECIFIED: + case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_P2P_DEVICE: case NL80211_IFTYPE_WDS: @@ -8275,6 +8276,28 @@ static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } +static int nl80211_join_ocb(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct ocb_setup setup = {}; + int err; + + err = nl80211_parse_chandef(rdev, info, &setup.chandef); + if (err) + return err; + + return cfg80211_join_ocb(rdev, dev, &setup); +} + +static int nl80211_leave_ocb(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + + return cfg80211_leave_ocb(rdev, dev); +} + static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -10218,6 +10241,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_JOIN_OCB, + .doit = nl80211_join_ocb, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_LEAVE_OCB, + .doit = nl80211_leave_ocb, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, #ifdef CONFIG_PM { .cmd = NL80211_CMD_GET_WOWLAN, diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c new file mode 100644 index 000000000000..c00d4a792319 --- /dev/null +++ b/net/wireless/ocb.c @@ -0,0 +1,88 @@ +/* + * OCB mode implementation + * + * Copyright: (c) 2014 Czech Technical University in Prague + * (c) 2014 Volkswagen Group Research + * Author: Rostislav Lisovy + * Funded by: Volkswagen Group Research + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include "nl80211.h" +#include "core.h" +#include "rdev-ops.h" + +int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ocb_setup *setup) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + ASSERT_WDEV_LOCK(wdev); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) + return -EOPNOTSUPP; + + if (WARN_ON(!setup->chandef.chan)) + return -EINVAL; + + err = rdev_join_ocb(rdev, dev, setup); + if (!err) + wdev->chandef = setup->chandef; + + return err; +} + +int cfg80211_join_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ocb_setup *setup) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_join_ocb(rdev, dev, setup); + wdev_unlock(wdev); + + return err; +} + +int __cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + ASSERT_WDEV_LOCK(wdev); + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) + return -EOPNOTSUPP; + + if (!rdev->ops->leave_ocb) + return -EOPNOTSUPP; + + err = rdev_leave_ocb(rdev, dev); + if (!err) + memset(&wdev->chandef, 0, sizeof(wdev->chandef)); + + return err; +} + +int cfg80211_leave_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + int err; + + wdev_lock(wdev); + err = __cfg80211_leave_ocb(rdev, dev); + wdev_unlock(wdev); + + return err; +} diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 71b1db3cc645..1b3864cd50ca 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -348,6 +348,27 @@ static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ocb_setup *setup) +{ + int ret; + trace_rdev_join_ocb(&rdev->wiphy, dev, setup); + ret = rdev->ops->join_ocb(&rdev->wiphy, dev, setup); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + int ret; + trace_rdev_leave_ocb(&rdev->wiphy, dev); + ret = rdev->ops->leave_ocb(&rdev->wiphy, dev); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_change_bss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct bss_parameters *params) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index cdb2c2ef1ae1..277a85df910e 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -600,6 +600,11 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss, TP_ARGS(wiphy, netdev) ); +DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), + TP_ARGS(wiphy, netdev) +); + DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) @@ -1316,6 +1321,22 @@ TRACE_EVENT(rdev_join_ibss, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid) ); +TRACE_EVENT(rdev_join_ocb, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const struct ocb_setup *setup), + TP_ARGS(wiphy, netdev, setup), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG) +); + TRACE_EVENT(rdev_set_wiphy_params, TP_PROTO(struct wiphy *wiphy, u32 changed), TP_ARGS(wiphy, changed), diff --git a/net/wireless/util.c b/net/wireless/util.c index 5e233a577d0f..d0ac795445b7 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -442,7 +442,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, break; case cpu_to_le16(0): if (iftype != NL80211_IFTYPE_ADHOC && - iftype != NL80211_IFTYPE_STATION) + iftype != NL80211_IFTYPE_STATION && + iftype != NL80211_IFTYPE_OCB) return -1; break; } @@ -519,6 +520,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, memcpy(hdr.addr3, skb->data, ETH_ALEN); hdrlen = 24; break; + case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_ADHOC: /* DA SA BSSID */ memcpy(hdr.addr1, skb->data, ETH_ALEN); @@ -937,6 +939,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, if (dev->ieee80211_ptr->use_4addr) break; /* fall through */ + case NL80211_IFTYPE_OCB: case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; -- cgit v1.2.3 From a7dd7957acf798ac406afd6631e64a27ac4a5bf1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 28 Jun 2014 09:00:27 -0400 Subject: NVMe: Update list of status codes Taken from the draft NVMe 1.1b specification. Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- include/uapi/linux/nvme.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 29a7d8619d8d..134518b0100b 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -440,9 +440,15 @@ enum { NVME_SC_FUSED_MISSING = 0xa, NVME_SC_INVALID_NS = 0xb, NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -454,7 +460,15 @@ enum { NVME_SC_INVALID_VECTOR = 0x108, NVME_SC_INVALID_LOG_PAGE = 0x109, NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FIRMWARE_NEEDS_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, -- cgit v1.2.3 From 7963e521811ed19396d47793cc77d87c643ab891 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 12 Sep 2014 16:07:20 -0600 Subject: NVMe: Passthrough IOCTL for IO commands The NVME_IOCTL_SUBMIT_IO only works for IO commands with block data transfers and isn't usable for other NVMe commands like flush, data set management, or any sort of vendor unique command. The NVME_IOCTL_ADMIN_CMD, however, can easily be modified to accept arbitrary IO commands in addition to arbitrary admin commands without breaking backward compatibility. This patch just adds a new IOCTL to distinguish if the driver should submit the command on an IO or Admin queue. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 17 ++++++++++++----- include/uapi/linux/nvme.h | 5 ++++- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 93ee55ee3775..70be3a151eb7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1732,10 +1732,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) +static int nvme_user_cmd(struct nvme_dev *dev, + struct nvme_passthru_cmd __user *ucmd, bool ioq) { - struct nvme_admin_cmd cmd; + struct nvme_passthru_cmd cmd; struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); @@ -1774,6 +1774,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, ADMIN_TIMEOUT; if (length != cmd.data_len) status = -ENOMEM; + else if (ioq) + status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, + &cmd.result, timeout); else status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); @@ -1799,7 +1802,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + return nvme_user_cmd(ns->dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, (void __user *)arg, true); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -2743,7 +2748,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) struct nvme_dev *dev = f->private_data; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(dev, (void __user *)arg); + return nvme_user_cmd(dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(dev, (void __user *)arg, true); default: return -ENOTTY; } diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 134518b0100b..97106556408f 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -503,7 +503,7 @@ struct nvme_user_io { __u16 appmask; }; -struct nvme_admin_cmd { +struct nvme_passthru_cmd { __u8 opcode; __u8 flags; __u16 rsvd1; @@ -524,8 +524,11 @@ struct nvme_admin_cmd { __u32 result; }; +#define nvme_admin_cmd nvme_passthru_cmd + #define NVME_IOCTL_ID _IO('N', 0x40) #define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) #endif /* _UAPI_LINUX_NVME_H */ -- cgit v1.2.3 From 60e0545cc92ce4172c7069d559568717f64af332 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Sep 2014 14:08:38 -0600 Subject: NVMe: Updates for 1.1 spec Updating commands and structures for NVMe 1.1 updates, mostly for nvme reservations. There are no additional in-kernel uses, but this is for the uapi. While doing this, I noticed that the software progress features was using the wrong value, so updating that value as well. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- include/uapi/linux/nvme.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 97106556408f..26386cf3db44 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -181,6 +181,22 @@ enum { NVME_LBART_ATTRIB_HIDE = 1 << 1, }; +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + /* I/O commands */ enum nvme_opcode { @@ -189,7 +205,12 @@ enum nvme_opcode { nvme_cmd_read = 0x02, nvme_cmd_write_uncor = 0x04, nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, }; struct nvme_common_command { @@ -305,7 +326,11 @@ enum { NVME_FEAT_IRQ_CONFIG = 0x09, NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, - NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, -- cgit v1.2.3 From b17f709a24013fcbb257f6f89b4d81ac9fdf0d18 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 4 Nov 2014 09:06:56 -0800 Subject: gue: TX support for using remote checksum offload option Add if_tunnel flag TUNNEL_ENCAP_FLAG_REMCSUM to configure remote checksum offload on an IP tunnel. Add logic in gue_build_header to insert remote checksum offload option. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/fou.h | 14 +++++++++++++- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/fou.c | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/net/fou.h b/include/net/fou.h index cf4ce8874f92..25b26ffcf1df 100644 --- a/include/net/fou.h +++ b/include/net/fou.h @@ -20,7 +20,19 @@ static size_t fou_encap_hlen(struct ip_tunnel_encap *e) static size_t gue_encap_hlen(struct ip_tunnel_encap *e) { - return sizeof(struct udphdr) + sizeof(struct guehdr); + size_t len; + bool need_priv = false; + + len = sizeof(struct udphdr) + sizeof(struct guehdr); + + if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { + len += GUE_PLEN_REMCSUM; + need_priv = true; + } + + len += need_priv ? GUE_LEN_PRIV : 0; + + return len; } #endif diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 280d9e092283..bd3cc11a431f 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -69,6 +69,7 @@ enum tunnel_encap_types { #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) #define TUNNEL_ENCAP_FLAG_CSUM6 (1<<1) +#define TUNNEL_ENCAP_FLAG_REMCSUM (1<<2) /* SIT-mode i_flags */ #define SIT_ISATAP 0x0001 diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a3b8c5b36303..fb0db99adf9e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -562,11 +562,19 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; - size_t optlen = 0; + size_t hdrlen, optlen = 0; __be16 sport; void *data; bool need_priv = false; + if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && + skb->ip_summed == CHECKSUM_PARTIAL) { + csum = false; + optlen += GUE_PLEN_REMCSUM; + type |= SKB_GSO_TUNNEL_REMCSUM; + need_priv = true; + } + optlen += need_priv ? GUE_LEN_PRIV : 0; skb = iptunnel_handle_offloads(skb, csum, type); @@ -578,7 +586,9 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); - skb_push(skb, sizeof(struct guehdr) + optlen); + hdrlen = sizeof(struct guehdr) + optlen; + + skb_push(skb, hdrlen); guehdr = (struct guehdr *)skb->data; @@ -597,7 +607,26 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, *flags = 0; data += GUE_LEN_PRIV; - /* Add private flags */ + if (type & SKB_GSO_TUNNEL_REMCSUM) { + u16 csum_start = skb_checksum_start_offset(skb); + __be16 *pd = data; + + if (csum_start < hdrlen) + return -EINVAL; + + csum_start -= hdrlen; + pd[0] = htons(csum_start); + pd[1] = htons(csum_start + skb->csum_offset); + + if (!skb_is_gso(skb)) { + skb->ip_summed = CHECKSUM_NONE; + skb->encapsulation = 0; + } + + *flags |= GUE_PFLAG_REMCSUM; + data += GUE_PLEN_REMCSUM; + } + } fou_build_udp(skb, e, fl4, protocol, sport); -- cgit v1.2.3 From 66f1c44887ba4f47d617f8ae21cf8e04e1892bd7 Mon Sep 17 00:00:00 2001 From: Gregory Fong Date: Tue, 4 Nov 2014 11:21:21 -0800 Subject: bridge: include in6.h in if_bridge.h for struct in6_addr if_bridge.h uses struct in6_addr ip6, but wasn't including the in6.h header. Thomas Backlund originally sent a patch to do this, but this revealed a redefinition issue: https://lkml.org/lkml/2013/1/13/116 The redefinition issue should have been fixed by the following Linux commits: ee262ad827f89e2dc7851ec2986953b5b125c6bc inet: defines IPPROTO_* needed for module alias generation cfd280c91253cc28e4919e349fa7a813b63e71e8 net: sync some IP headers with glibc and the following glibc commit: 6c82a2f8d7c8e21e39237225c819f182ae438db3 Coordinate IPv6 definitions for Linux and glibc so actually include the header now. Reported-by: Colin Guthrie Reported-by: Christiaan Welvaart Reported-by: Thomas Backlund Cc: Florian Fainelli Cc: Cong Wang Cc: David Miller Signed-off-by: Gregory Fong Acked-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 39f621a9fe82..da17e456908d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -15,6 +15,7 @@ #include #include +#include #define SYSFS_BRIDGE_ATTR "bridge" #define SYSFS_BRIDGE_FDB "brforward" -- cgit v1.2.3 From 904326ecac022ebaeb39cdfb206fd3b6551cdfca Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Thu, 16 Oct 2014 16:54:21 -0400 Subject: tty,serial: Unify UPF_* and ASYNC_* flag definitions The userspace-defined ASYNC_* flags in include/uapi/linux/tty_flags.h are the authoritative bit definitions for the serial_struct flags, and thus for any derivative values or fields. Although the serial core provides the TIOCSSERIAL and TIOCGSERIAL ioctls to set and retrieve these flags from userspace, it defines these bits independently, as UPF_* macros. Define the UPF_* macros which are userspace-modifiable directly from the ASYNC_* symbolic constants. Add compile-time test to ensure the bits changeable by TIOCSSERIAL match the defined range in the uapi header. Add ASYNCB_MAGIC_MULTIPLIER to the uapi header since this bit is programmable by userspace. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 47 ++++++++++++++++++++++++++++-------------- include/uapi/linux/tty_flags.h | 6 +++++- 2 files changed, 37 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index bccf4bac22f5..ad9329669aba 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -160,21 +160,33 @@ struct uart_port { /* flags must be updated while holding port mutex */ upf_t flags; -#define UPF_FOURPORT ((__force upf_t) (1 << 1)) -#define UPF_SAK ((__force upf_t) (1 << 2)) -#define UPF_SPD_MASK ((__force upf_t) (0x1030)) -#define UPF_SPD_HI ((__force upf_t) (0x0010)) -#define UPF_SPD_VHI ((__force upf_t) (0x0020)) -#define UPF_SPD_CUST ((__force upf_t) (0x0030)) -#define UPF_SPD_SHI ((__force upf_t) (0x1000)) -#define UPF_SPD_WARP ((__force upf_t) (0x1010)) -#define UPF_SKIP_TEST ((__force upf_t) (1 << 6)) -#define UPF_AUTO_IRQ ((__force upf_t) (1 << 7)) -#define UPF_HARDPPS_CD ((__force upf_t) (1 << 11)) -#define UPF_LOW_LATENCY ((__force upf_t) (1 << 13)) -#define UPF_BUGGY_UART ((__force upf_t) (1 << 14)) + /* + * These flags must be equivalent to the flags defined in + * include/uapi/linux/tty_flags.h which are the userspace definitions + * assigned from the serial_struct flags in uart_set_info() + * [for bit definitions in the UPF_CHANGE_MASK] + * + * Bits [0..UPF_LAST_USER] are userspace defined/visible/changeable + * except bit 15 (UPF_NO_TXEN_TEST) which is masked off. + * The remaining bits are serial-core specific and not modifiable by + * userspace. + */ +#define UPF_FOURPORT ((__force upf_t) ASYNC_FOURPORT /* 1 */ ) +#define UPF_SAK ((__force upf_t) ASYNC_SAK /* 2 */ ) +#define UPF_SPD_HI ((__force upf_t) ASYNC_SPD_HI /* 4 */ ) +#define UPF_SPD_VHI ((__force upf_t) ASYNC_SPD_VHI /* 5 */ ) +#define UPF_SPD_CUST ((__force upf_t) ASYNC_SPD_CUST /* 0x0030 */ ) +#define UPF_SPD_WARP ((__force upf_t) ASYNC_SPD_WARP /* 0x1010 */ ) +#define UPF_SPD_MASK ((__force upf_t) ASYNC_SPD_MASK /* 0x1030 */ ) +#define UPF_SKIP_TEST ((__force upf_t) ASYNC_SKIP_TEST /* 6 */ ) +#define UPF_AUTO_IRQ ((__force upf_t) ASYNC_AUTO_IRQ /* 7 */ ) +#define UPF_HARDPPS_CD ((__force upf_t) ASYNC_HARDPPS_CD /* 11 */ ) +#define UPF_SPD_SHI ((__force upf_t) ASYNC_SPD_SHI /* 12 */ ) +#define UPF_LOW_LATENCY ((__force upf_t) ASYNC_LOW_LATENCY /* 13 */ ) +#define UPF_BUGGY_UART ((__force upf_t) ASYNC_BUGGY_UART /* 14 */ ) #define UPF_NO_TXEN_TEST ((__force upf_t) (1 << 15)) -#define UPF_MAGIC_MULTIPLIER ((__force upf_t) (1 << 16)) +#define UPF_MAGIC_MULTIPLIER ((__force upf_t) ASYNC_MAGIC_MULTIPLIER /* 16 */ ) + /* Port has hardware-assisted h/w flow control (iow, auto-RTS *not* auto-CTS) */ #define UPF_HARD_FLOW ((__force upf_t) (1 << 21)) /* Port has hardware-assisted s/w flow control */ @@ -190,9 +202,14 @@ struct uart_port { #define UPF_DEAD ((__force upf_t) (1 << 30)) #define UPF_IOREMAP ((__force upf_t) (1 << 31)) -#define UPF_CHANGE_MASK ((__force upf_t) (0x17fff)) +#define __UPF_CHANGE_MASK 0x17fff +#define UPF_CHANGE_MASK ((__force upf_t) __UPF_CHANGE_MASK) #define UPF_USR_MASK ((__force upf_t) (UPF_SPD_MASK|UPF_LOW_LATENCY)) +#if __UPF_CHANGE_MASK > ASYNC_FLAGS +#error Change mask not equivalent to userspace-visible bit defines +#endif + /* status must be updated while holding port lock */ upstat_t status; diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h index eefcb483a2c0..7b516f7ee7e6 100644 --- a/include/uapi/linux/tty_flags.h +++ b/include/uapi/linux/tty_flags.h @@ -6,6 +6,8 @@ * shared by the tty_port flags structures. * * Define ASYNCB_* for convenient use with {test,set,clear}_bit. + * + * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable */ #define ASYNCB_HUP_NOTIFY 0 /* Notify getty on hangups and closes * on the callout port */ @@ -26,7 +28,8 @@ #define ASYNCB_BUGGY_UART 14 /* This is a buggy UART, skip some safety * checks. Note: can be dangerous! */ #define ASYNCB_AUTOPROBE 15 /* Port was autoprobed by PCI or PNP code */ -#define ASYNCB_LAST_USER 15 +#define ASYNCB_MAGIC_MULTIPLIER 16 /* Use special CLK or divisor */ +#define ASYNCB_LAST_USER 16 /* Internal flags used only by kernel */ #define ASYNCB_INITIALIZED 31 /* Serial port was initialized */ @@ -57,6 +60,7 @@ #define ASYNC_LOW_LATENCY (1U << ASYNCB_LOW_LATENCY) #define ASYNC_BUGGY_UART (1U << ASYNCB_BUGGY_UART) #define ASYNC_AUTOPROBE (1U << ASYNCB_AUTOPROBE) +#define ASYNC_MAGIC_MULTIPLIER (1U << ASYNCB_MAGIC_MULTIPLIER) #define ASYNC_FLAGS ((1U << (ASYNCB_LAST_USER + 1)) - 1) #define ASYNC_USR_MASK (ASYNC_SPD_MASK|ASYNC_CALLOUT_NOHUP| \ -- cgit v1.2.3 From 352f86187e1d3e9e27d3be237103dc83a48f882c Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Thu, 16 Oct 2014 16:54:22 -0400 Subject: tty: Document defunct ASYNC_* bits in uapi header Note the serial_struct flags for which the kernel ignores and performs no action. The flags cannot be removed since they form part of the userspace interface via the TIOCSSERIAL/TIOCGSERIAL ioctls. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty_flags.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h index 7b516f7ee7e6..4e021e31c219 100644 --- a/include/uapi/linux/tty_flags.h +++ b/include/uapi/linux/tty_flags.h @@ -8,6 +8,7 @@ * Define ASYNCB_* for convenient use with {test,set,clear}_bit. * * Bits [0..ASYNCB_LAST_USER] are userspace defined/visible/changeable + * [x] in the bit comments indicates the flag is defunct and no longer used. */ #define ASYNCB_HUP_NOTIFY 0 /* Notify getty on hangups and closes * on the callout port */ @@ -19,15 +20,15 @@ #define ASYNCB_SKIP_TEST 6 /* Skip UART test during autoconfiguration */ #define ASYNCB_AUTO_IRQ 7 /* Do automatic IRQ during * autoconfiguration */ -#define ASYNCB_SESSION_LOCKOUT 8 /* Lock out cua opens based on session */ -#define ASYNCB_PGRP_LOCKOUT 9 /* Lock out cua opens based on pgrp */ -#define ASYNCB_CALLOUT_NOHUP 10 /* Don't do hangups for cua device */ +#define ASYNCB_SESSION_LOCKOUT 8 /* [x] Lock out cua opens based on session */ +#define ASYNCB_PGRP_LOCKOUT 9 /* [x] Lock out cua opens based on pgrp */ +#define ASYNCB_CALLOUT_NOHUP 10 /* [x] Don't do hangups for cua device */ #define ASYNCB_HARDPPS_CD 11 /* Call hardpps when CD goes high */ #define ASYNCB_SPD_SHI 12 /* Use 230400 instead of 38400 bps */ #define ASYNCB_LOW_LATENCY 13 /* Request low latency behaviour */ #define ASYNCB_BUGGY_UART 14 /* This is a buggy UART, skip some safety * checks. Note: can be dangerous! */ -#define ASYNCB_AUTOPROBE 15 /* Port was autoprobed by PCI or PNP code */ +#define ASYNCB_AUTOPROBE 15 /* [x] Port was autoprobed by PCI/PNP code */ #define ASYNCB_MAGIC_MULTIPLIER 16 /* Use special CLK or divisor */ #define ASYNCB_LAST_USER 16 -- cgit v1.2.3 From 31a171328e870c9f65d01191e51a75cde5f78ffd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Sep 2014 20:06:43 +0200 Subject: tty: serial: 8250_omap: add custom DMA-TX callback This patch provides mostly a copy of serial8250_tx_dma() + __dma_tx_complete() with the following extensions: - DMA bug At least on AM335x the following problem exists: Even if the TX FIFO is empty and a TX transfer is programmed (and started) the UART does not trigger the DMA transfer. After $TRESHOLD number of bytes have been written to the FIFO manually the UART reevaluates the whole situation and decides that now there is enough room in the FIFO and so the transfer begins. This problem has not been seen on DRA7 or beagle board xm (OMAP3). I am not sure if this is UART-IP core specific or DMA engine. The workaround is to use a threshold of one byte, program the DMA transfer minus one byte and then to put the first byte into the FIFO to kick start the transfer. - support for runtime PM RPM is enabled on start_tx(). We can't disable RPM on DMA complete callback because there is still data in the FIFO which is being sent. We have to wait until the FIFO is empty before we disable it. For this to happen we fake a TX sent error and enable THRI. Once the FIFO is empty we receive an interrupt and since the TTY-buffer is still empty we "put RPM" via __stop_tx(). Should it been filed then in the start_tx() path we should program the DMA transfer and remove the error flag and the THRI bit. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_omap.c | 144 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/serial_reg.h | 1 + 2 files changed, 145 insertions(+) (limited to 'include/uapi') diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index 2f653c48639d..5f183d197dfa 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "8250.h" @@ -29,6 +30,7 @@ #define UART_ERRATA_i202_MDR1_ACCESS (1 << 0) #define OMAP_UART_WER_HAS_TX_WAKEUP (1 << 1) +#define OMAP_DMA_TX_KICK (1 << 2) #define OMAP_UART_FCR_RX_TRIG 6 #define OMAP_UART_FCR_TX_TRIG 4 @@ -616,6 +618,148 @@ static void omap_8250_unthrottle(struct uart_port *port) pm_runtime_put_autosuspend(port->dev); } +#ifdef CONFIG_SERIAL_8250_DMA +static int omap_8250_tx_dma(struct uart_8250_port *p); + +static void omap_8250_dma_tx_complete(void *param) +{ + struct uart_8250_port *p = param; + struct uart_8250_dma *dma = p->dma; + struct circ_buf *xmit = &p->port.state->xmit; + unsigned long flags; + bool en_thri = false; + + dma_sync_single_for_cpu(dma->txchan->device->dev, dma->tx_addr, + UART_XMIT_SIZE, DMA_TO_DEVICE); + + spin_lock_irqsave(&p->port.lock, flags); + + dma->tx_running = 0; + + xmit->tail += dma->tx_size; + xmit->tail &= UART_XMIT_SIZE - 1; + p->port.icount.tx += dma->tx_size; + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(&p->port); + + if (!uart_circ_empty(xmit) && !uart_tx_stopped(&p->port)) { + int ret; + + ret = omap_8250_tx_dma(p); + if (ret) + en_thri = true; + + } else if (p->capabilities & UART_CAP_RPM) { + en_thri = true; + } + + if (en_thri) { + dma->tx_err = 1; + p->ier |= UART_IER_THRI; + serial_port_out(&p->port, UART_IER, p->ier); + } + + spin_unlock_irqrestore(&p->port.lock, flags); +} + +static int omap_8250_tx_dma(struct uart_8250_port *p) +{ + struct uart_8250_dma *dma = p->dma; + struct omap8250_priv *priv = p->port.private_data; + struct circ_buf *xmit = &p->port.state->xmit; + struct dma_async_tx_descriptor *desc; + unsigned int skip_byte = 0; + int ret; + + if (dma->tx_running) + return 0; + if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) { + + /* + * Even if no data, we need to return an error for the two cases + * below so serial8250_tx_chars() is invoked and properly clears + * THRI and/or runtime suspend. + */ + if (dma->tx_err || p->capabilities & UART_CAP_RPM) { + ret = -EBUSY; + goto err; + } + if (p->ier & UART_IER_THRI) { + p->ier &= ~UART_IER_THRI; + serial_out(p, UART_IER, p->ier); + } + return 0; + } + + dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); + if (priv->habit & OMAP_DMA_TX_KICK) { + u8 tx_lvl; + + /* + * We need to put the first byte into the FIFO in order to start + * the DMA transfer. For transfers smaller than four bytes we + * don't bother doing DMA at all. It seem not matter if there + * are still bytes in the FIFO from the last transfer (in case + * we got here directly from omap_8250_dma_tx_complete()). Bytes + * leaving the FIFO seem not to trigger the DMA transfer. It is + * really the byte that we put into the FIFO. + * If the FIFO is already full then we most likely got here from + * omap_8250_dma_tx_complete(). And this means the DMA engine + * just completed its work. We don't have to wait the complete + * 86us at 115200,8n1 but around 60us (not to mention lower + * baudrates). So in that case we take the interrupt and try + * again with an empty FIFO. + */ + tx_lvl = serial_in(p, UART_OMAP_TX_LVL); + if (tx_lvl == p->tx_loadsz) { + ret = -EBUSY; + goto err; + } + if (dma->tx_size < 4) { + ret = -EINVAL; + goto err; + } + skip_byte = 1; + } + + desc = dmaengine_prep_slave_single(dma->txchan, + dma->tx_addr + xmit->tail + skip_byte, + dma->tx_size - skip_byte, DMA_MEM_TO_DEV, + DMA_PREP_INTERRUPT | DMA_CTRL_ACK); + if (!desc) { + ret = -EBUSY; + goto err; + } + + dma->tx_running = 1; + + desc->callback = omap_8250_dma_tx_complete; + desc->callback_param = p; + + dma->tx_cookie = dmaengine_submit(desc); + + dma_sync_single_for_device(dma->txchan->device->dev, dma->tx_addr, + UART_XMIT_SIZE, DMA_TO_DEVICE); + + dma_async_issue_pending(dma->txchan); + if (dma->tx_err) + dma->tx_err = 0; + + if (p->ier & UART_IER_THRI) { + p->ier &= ~UART_IER_THRI; + serial_out(p, UART_IER, p->ier); + } + if (skip_byte) + serial_out(p, UART_TX, xmit->buf[xmit->tail]); + return 0; +err: + dma->tx_err = 1; + return ret; +} + +#endif + static int omap8250_probe(struct platform_device *pdev) { struct resource *regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); diff --git a/include/uapi/linux/serial_reg.h b/include/uapi/linux/serial_reg.h index df6c9ab6b0cd..53af3b790129 100644 --- a/include/uapi/linux/serial_reg.h +++ b/include/uapi/linux/serial_reg.h @@ -359,6 +359,7 @@ #define UART_OMAP_SYSC 0x15 /* System configuration register */ #define UART_OMAP_SYSS 0x16 /* System status register */ #define UART_OMAP_WER 0x17 /* Wake-up enable register */ +#define UART_OMAP_TX_LVL 0x1a /* TX FIFO level register */ /* * These are the definitions for the MDR1 register -- cgit v1.2.3 From 5ac9c05d789044d30735402f387355a394ec8e18 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Wed, 5 Nov 2014 12:26:30 -0500 Subject: tty: Document defunct ASYNC_SPLIT_TERMIOS flag in uapi header The last vestige of ASYNC_SPLIT_TERMIOS was removed by commit 'cris: Remove obsolete ASYNC_SPLIT_TERMIOS behavior'. Mark the flag as defunct in the uapi header. Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/tty_flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h index 4e021e31c219..d2bc4ff94f43 100644 --- a/include/uapi/linux/tty_flags.h +++ b/include/uapi/linux/tty_flags.h @@ -14,7 +14,7 @@ * on the callout port */ #define ASYNCB_FOURPORT 1 /* Set OU1, OUT2 per AST Fourport settings */ #define ASYNCB_SAK 2 /* Secure Attention Key (Orange book) */ -#define ASYNCB_SPLIT_TERMIOS 3 /* Separate termios for dialin/callout */ +#define ASYNCB_SPLIT_TERMIOS 3 /* [x] Separate termios for dialin/callout */ #define ASYNCB_SPD_HI 4 /* Use 56000 instead of 38400 bps */ #define ASYNCB_SPD_VHI 5 /* Use 115200 instead of 38400 bps */ #define ASYNCB_SKIP_TEST 6 /* Skip UART test during autoconfiguration */ -- cgit v1.2.3 From 68952076e9226cc23ebce66d3fc2fdb8b6c04c30 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Wed, 5 Nov 2014 12:26:31 -0500 Subject: vt: Remove vt_get_kmsg_redirect() from uapi header vt_get_kmsg_redirect() only has meaning to the console driver as an alias for calling vt_kmsg_redirect(). Move the macro definition to the only source file which uses it; remove from uapi/linux/vt.h Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 2 ++ include/uapi/linux/vt.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index b33b00b386de..3dc5d56261a5 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -2509,6 +2509,8 @@ int vt_kmsg_redirect(int new) return kmsg_con; } +#define vt_get_kmsg_redirect() vt_kmsg_redirect(-1) + /* * Console on virtual terminal * diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h index 4b59a26799a3..978578bd1895 100644 --- a/include/uapi/linux/vt.h +++ b/include/uapi/linux/vt.h @@ -84,7 +84,4 @@ struct vt_setactivate { #define VT_SETACTIVATE 0x560F /* Activate and set the mode of a console */ - -#define vt_get_kmsg_redirect() vt_kmsg_redirect(-1) - #endif /* _UAPI_LINUX_VT_H */ -- cgit v1.2.3 From 25cd9ba0abc0749e5cb78e6493c6f6b3311ec6c5 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 6 Oct 2014 05:05:13 -0700 Subject: openvswitch: Add basic MPLS support to kernel Allow datapath to recognize and extract MPLS labels into flow keys and execute actions which push, pop, and set labels on packets. Based heavily on work by Leo Alterman, Ravi K, Isaku Yamahata and Joe Stringer. Cc: Ravi K Cc: Leo Alterman Cc: Isaku Yamahata Cc: Joe Stringer Signed-off-by: Simon Horman Signed-off-by: Jesse Gross Signed-off-by: Pravin B Shelar --- include/net/mpls.h | 39 +++++++++++ include/uapi/linux/openvswitch.h | 32 +++++++++ net/core/dev.c | 3 +- net/openvswitch/Kconfig | 1 + net/openvswitch/actions.c | 106 ++++++++++++++++++++++++++++- net/openvswitch/datapath.c | 6 +- net/openvswitch/flow.c | 30 +++++++++ net/openvswitch/flow.h | 17 +++-- net/openvswitch/flow_netlink.c | 139 ++++++++++++++++++++++++++++++++++----- net/openvswitch/flow_netlink.h | 2 +- 10 files changed, 345 insertions(+), 30 deletions(-) create mode 100644 include/net/mpls.h (limited to 'include/uapi') diff --git a/include/net/mpls.h b/include/net/mpls.h new file mode 100644 index 000000000000..5b3b5addfb08 --- /dev/null +++ b/include/net/mpls.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_H +#define _NET_MPLS_H 1 + +#include +#include + +#define MPLS_HLEN 4 + +static inline bool eth_p_mpls(__be16 eth_type) +{ + return eth_type == htons(ETH_P_MPLS_UC) || + eth_type == htons(ETH_P_MPLS_MC); +} + +/* + * For non-MPLS skbs this will correspond to the network header. + * For MPLS skbs it will be before the network_header as the MPLS + * label stack lies between the end of the mac header and the network + * header. That is, for MPLS skbs the end of the mac header + * is the top of the MPLS label stack. + */ +static inline unsigned char *skb_mpls_header(struct sk_buff *skb) +{ + return skb_mac_header(skb) + skb->mac_len; +} +#endif diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 435eabc5ffaa..631056b66f80 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -293,6 +293,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_DP_HASH, /* u32 hash value. Value 0 indicates the hash is not computed by the datapath. */ OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ + OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls. + * The implementation may restrict + * the accepted length of the array. */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ @@ -340,6 +343,10 @@ struct ovs_key_ethernet { __u8 eth_dst[ETH_ALEN]; }; +struct ovs_key_mpls { + __be32 mpls_lse; +}; + struct ovs_key_ipv4 { __be32 ipv4_src; __be32 ipv4_dst; @@ -483,6 +490,19 @@ enum ovs_userspace_attr { #define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) +/** + * struct ovs_action_push_mpls - %OVS_ACTION_ATTR_PUSH_MPLS action argument. + * @mpls_lse: MPLS label stack entry to push. + * @mpls_ethertype: Ethertype to set in the encapsulating ethernet frame. + * + * The only values @mpls_ethertype should ever be given are %ETH_P_MPLS_UC and + * %ETH_P_MPLS_MC, indicating MPLS unicast or multicast. Other are rejected. + */ +struct ovs_action_push_mpls { + __be32 mpls_lse; + __be16 mpls_ethertype; /* Either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC */ +}; + /** * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. * @vlan_tpid: Tag protocol identifier (TPID) to push. @@ -534,6 +554,15 @@ struct ovs_action_hash { * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in * the nested %OVS_SAMPLE_ATTR_* attributes. + * @OVS_ACTION_ATTR_PUSH_MPLS: Push a new MPLS label stack entry onto the + * top of the packets MPLS label stack. Set the ethertype of the + * encapsulating frame to either %ETH_P_MPLS_UC or %ETH_P_MPLS_MC to + * indicate the new packet contents. + * @OVS_ACTION_ATTR_POP_MPLS: Pop an MPLS label stack entry off of the + * packet's MPLS label stack. Set the encapsulating frame's ethertype to + * indicate the new packet contents. This could potentially still be + * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there + * is no MPLS label stack, as determined by ethertype, no action is taken. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -550,6 +579,9 @@ enum ovs_action_attr { OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ OVS_ACTION_ATTR_RECIRC, /* u32 recirc_id. */ OVS_ACTION_ATTR_HASH, /* struct ovs_action_hash. */ + OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ + OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. */ + __OVS_ACTION_ATTR_MAX }; diff --git a/net/core/dev.c b/net/core/dev.c index 40be481268de..70bb609c283d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -118,6 +118,7 @@ #include #include #include +#include #include #include #include @@ -2530,7 +2531,7 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb, netdev_features_t features, __be16 type) { - if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC)) + if (eth_p_mpls(type)) features &= skb->dev->mpls_features; return features; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2a9673e39ca1..454ce12efbbf 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -30,6 +30,7 @@ config OPENVSWITCH config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" + select NET_MPLS_GSO depends on INET depends on OPENVSWITCH depends on NET_IPGRE_DEMUX diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 922c133b1933..930b1b6e4cef 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -28,10 +28,12 @@ #include #include #include + #include #include #include #include +#include #include #include "datapath.h" @@ -118,6 +120,92 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } +static int push_mpls(struct sk_buff *skb, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + struct ethhdr *hdr; + + /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + hdr = eth_hdr(skb); + hdr->h_proto = mpls->mpls_ethertype; + + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + return 0; +} + +static int pop_mpls(struct sk_buff *skb, const __be16 ethertype) +{ + struct ethhdr *hdr; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, + csum_partial(skb_mpls_header(skb), + MPLS_HLEN, 0)); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. + */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + hdr->h_proto = ethertype; + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + return 0; +} + +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) +{ + __be32 *stack; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + stack = (__be32 *)skb_mpls_header(skb); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), *mpls_lse }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = *mpls_lse; + + return 0; +} + /* remove VLAN header from packet and update csum accordingly. */ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { @@ -140,10 +228,12 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) skb_set_network_header(skb, ETH_HLEN); - skb_reset_mac_len(skb); + /* Update mac_len for subsequent MPLS actions */ + skb_reset_mac_len(skb); return 0; } @@ -186,6 +276,8 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) return -ENOMEM; + /* Update mac_len for subsequent MPLS actions */ + skb->mac_len += VLAN_HLEN; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data @@ -612,6 +704,10 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_SCTP: err = set_sctp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, nla_data(nested_attr)); + break; } return err; @@ -690,6 +786,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, execute_hash(skb, key, a); break; + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, nla_get_be16(a)); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f18302f32049..688cb9bc0ef1 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -560,7 +560,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, 0, &acts); + &flow->key, &acts); if (err) goto err_flow_free; @@ -846,7 +846,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - 0, &acts); + &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); goto err_kfree_acts; @@ -953,7 +953,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, return acts; ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, 0, &acts); + error = ovs_nla_copy_actions(a, &masked_key, &acts); if (error) { OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); kfree(acts); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 2b78789ea7c5..90a21010fc8f 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include "datapath.h" @@ -480,6 +482,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) return -ENOMEM; skb_reset_network_header(skb); + skb_reset_mac_len(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); /* Network layer. */ @@ -584,6 +587,33 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ip, 0, sizeof(key->ip)); memset(&key->ipv4, 0, sizeof(key->ipv4)); } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. mac_len corresponds to the end of the L2 header. + */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 71813318c8c7..4962bee81a11 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -102,12 +102,17 @@ struct sw_flow_key { __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ __be16 type; /* Ethernet frame type. */ } eth; - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; struct { __be16 src; /* TCP/UDP/SCTP source port. */ __be16 dst; /* TCP/UDP/SCTP destination port. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 939bcb32100f..569309c49cc0 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "flow_netlink.h" @@ -134,7 +135,8 @@ static bool match_validate(const struct sw_flow_match *match, | (1 << OVS_KEY_ATTR_ICMP) | (1 << OVS_KEY_ATTR_ICMPV6) | (1 << OVS_KEY_ATTR_ARP) - | (1 << OVS_KEY_ATTR_ND)); + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); /* Always allowed mask fields. */ mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) @@ -149,6 +151,12 @@ static bool match_validate(const struct sw_flow_match *match, mask_allowed |= 1 << OVS_KEY_ATTR_ARP; } + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + if (match->key->eth.type == htons(ETH_P_IP)) { key_expected |= 1 << OVS_KEY_ATTR_IPV4; if (match->mask && (match->mask->key.eth.type == htons(0xffff))) @@ -266,6 +274,7 @@ static const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32), [OVS_KEY_ATTR_DP_HASH] = sizeof(u32), [OVS_KEY_ATTR_TUNNEL] = -1, + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), }; static bool is_all_zero(const u8 *fp, size_t size) @@ -735,6 +744,16 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, attrs &= ~(1 << OVS_KEY_ATTR_ARP); } + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { const struct ovs_key_tcp *tcp_key; @@ -1140,6 +1159,14 @@ int ovs_nla_put_flow(const struct sw_flow_key *swkey, arp_key->arp_op = htons(output->ip.proto); ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; } if ((swkey->eth.type == htons(ETH_P_IP) || @@ -1336,9 +1363,15 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } +static int ovs_nla_copy_actions__(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci); + static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -1375,7 +1408,8 @@ static int validate_and_copy_sample(const struct nlattr *attr, if (st_acts < 0) return st_acts; - err = ovs_nla_copy_actions(actions, key, depth + 1, sfa); + err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa, + eth_type, vlan_tci); if (err) return err; @@ -1385,10 +1419,10 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key) +static int validate_tp_port(const struct sw_flow_key *flow_key, + __be16 eth_type) { - if ((flow_key->eth.type == htons(ETH_P_IP) || - flow_key->eth.type == htons(ETH_P_IPV6)) && + if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) && (flow_key->tp.src || flow_key->tp.dst)) return 0; @@ -1483,7 +1517,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun) + bool *set_tun, __be16 eth_type) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -1508,6 +1542,9 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + *set_tun = true; err = validate_and_copy_set_tun(a, sfa); if (err) @@ -1515,7 +1552,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) + if (eth_type != htons(ETH_P_IP)) return -EINVAL; if (!flow_key->ip.proto) @@ -1531,7 +1568,7 @@ static int validate_set(const struct nlattr *a, break; case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) + if (eth_type != htons(ETH_P_IPV6)) return -EINVAL; if (!flow_key->ip.proto) @@ -1553,19 +1590,24 @@ static int validate_set(const struct nlattr *a, if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); case OVS_KEY_ATTR_UDP: if (flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; case OVS_KEY_ATTR_SCTP: if (flow_key->ip.proto != IPPROTO_SCTP) return -EINVAL; - return validate_tp_port(flow_key); + return validate_tp_port(flow_key, eth_type); default: return -EINVAL; @@ -1609,12 +1651,13 @@ static int copy_action(const struct nlattr *from, return 0; } -int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) +static int ovs_nla_copy_actions__(const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci) { const struct nlattr *a; + bool out_tnl_port = false; int rem, err; if (depth >= SAMPLE_ACTION_DEPTH) @@ -1626,6 +1669,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, @@ -1655,6 +1700,8 @@ int ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_OUTPUT: if (nla_get_u32(a) >= DP_MAX_PORTS) return -EINVAL; + out_tnl_port = false; + break; case OVS_ACTION_ATTR_HASH: { @@ -1671,6 +1718,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, } case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); break; case OVS_ACTION_ATTR_PUSH_VLAN: @@ -1679,19 +1727,66 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return -EINVAL; if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) return -EINVAL; + vlan_tci = vlan->vlan_tci; break; case OVS_ACTION_ATTR_RECIRC: break; + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + /* Networking stack do not allow simultaneous Tunnel + * and MPLS GSO. + */ + if (out_tnl_port) + return -EINVAL; + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. + */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. + */ + eth_type = htons(0); + break; + case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); + err = validate_set(a, key, sfa, + &out_tnl_port, eth_type); if (err) return err; + + skip_copy = out_tnl_port; break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); + err = validate_and_copy_sample(a, key, depth, sfa, + eth_type, vlan_tci); if (err) return err; skip_copy = true; @@ -1713,6 +1808,14 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return 0; } +int ovs_nla_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa) +{ + return ovs_nla_copy_actions__(attr, key, 0, sfa, key->eth.type, + key->eth.tci); +} + static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { const struct nlattr *a; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 206e45add888..6355b1d01329 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -49,7 +49,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *); int ovs_nla_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, + const struct sw_flow_key *key, struct sw_flow_actions **sfa); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); -- cgit v1.2.3 From 1a4e96a0e989200f7180264e27b22e9a85f3fcc8 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Tue, 30 Sep 2014 10:52:32 -0700 Subject: openvswitch: Fix the type of struct ovs_key_nd nd_target field. Should be the same as other IPv6 address fields. Current master produces sparse warnings without this change. Signed-off-by: Jarno Rajahalme Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 631056b66f80..26c36c4cf7e2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -400,9 +400,9 @@ struct ovs_key_arp { }; struct ovs_key_nd { - __u32 nd_target[4]; - __u8 nd_sll[ETH_ALEN]; - __u8 nd_tll[ETH_ALEN]; + __be32 nd_target[4]; + __u8 nd_sll[ETH_ALEN]; + __u8 nd_tll[ETH_ALEN]; }; /** -- cgit v1.2.3 From 9b8777e3473e31b2aabd669e5f34866d4a3afb6a Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 16 Oct 2014 21:48:21 +0200 Subject: serial: of: add a PORT_RT2880 definition The Ralink RT2880 SoC and its successors have an internal 8250 core. This core needs the same quirks applied as the AMD AU1xxx uart. In addition to these quirks, the ports memory region is only 0x100 unlike the AU1xxx which has a size of 0x1000. Signed-off-by: John Crispin Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 5 ++++- drivers/tty/serial/of_serial.c | 10 +++++++++- include/uapi/linux/serial_core.h | 3 ++- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 7e78f3077b5d..223503299e99 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -2640,8 +2640,11 @@ serial8250_pm(struct uart_port *port, unsigned int state, static unsigned int serial8250_port_size(struct uart_8250_port *pt) { - if (pt->port.iotype == UPIO_AU) + if (pt->port.iotype == UPIO_AU) { + if (pt->port.type == PORT_RT2880) + return 0x100; return 0x1000; + } if (is_omap1_8250(pt)) return 0x16 << pt->port.regshift; diff --git a/drivers/tty/serial/of_serial.c b/drivers/tty/serial/of_serial.c index 9c64ad2ac1a8..8749fb849803 100644 --- a/drivers/tty/serial/of_serial.c +++ b/drivers/tty/serial/of_serial.c @@ -130,8 +130,15 @@ static int of_platform_serial_setup(struct platform_device *ofdev, port->dev = &ofdev->dev; - if (type == PORT_TEGRA) + switch (type) { + case PORT_TEGRA: port->handle_break = tegra_serial_handle_break; + break; + + case PORT_RT2880: + port->iotype = UPIO_AU; + break; + } return 0; out: @@ -317,6 +324,7 @@ static struct of_device_id of_platform_serial_table[] = { { .compatible = "ns16850", .data = (void *)PORT_16850, }, { .compatible = "nvidia,tegra20-uart", .data = (void *)PORT_TEGRA, }, { .compatible = "nxp,lpc3220-uart", .data = (void *)PORT_LPC3220, }, + { .compatible = "ralink,rt2880-uart", .data = (void *)PORT_RT2880, }, { .compatible = "altr,16550-FIFO32", .data = (void *)PORT_ALTR_16550_F32, }, { .compatible = "altr,16550-FIFO64", diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index 16ad8521af6a..c17218094f18 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -54,7 +54,8 @@ #define PORT_ALTR_16550_F32 26 /* Altera 16550 UART with 32 FIFOs */ #define PORT_ALTR_16550_F64 27 /* Altera 16550 UART with 64 FIFOs */ #define PORT_ALTR_16550_F128 28 /* Altera 16550 UART with 128 FIFOs */ -#define PORT_MAX_8250 28 /* max port ID */ +#define PORT_RT2880 29 /* Ralink RT2880 internal UART */ +#define PORT_MAX_8250 29 /* max port ID */ /* * ARM specific type numbers. These are not currently guaranteed -- cgit v1.2.3 From 8a8ae62f8296760a2a1eee7009a1444c327603e0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 6 Nov 2014 16:56:33 +0100 Subject: tty: warn on deprecated serial flags When somebody calls TIOCSSERIAL ioctl with serial flags to set one of * ASYNC_SESSION_LOCKOUT * ASYNC_PGRP_LOCKOUT * ASYNC_CALLOUT_NOHUP * ASYNC_AUTOPROBE nothing happens. We actually ignore the flags for over a decade at least (I checked 2.6.0). So start yelling at users who use those flags, that they shouldn't. Signed-off-by: Jiri Slaby Cc: Peter Hurley Cc: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 21 +++++++++++++++++++++ include/uapi/linux/tty_flags.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include/uapi') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 01d45fd7d359..705885891b87 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2760,6 +2760,24 @@ static int tty_tiocgicount(struct tty_struct *tty, void __user *arg) return 0; } +static void tty_warn_deprecated_flags(struct serial_struct __user *ss) +{ + static DEFINE_RATELIMIT_STATE(depr_flags, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + char comm[TASK_COMM_LEN]; + int flags; + + if (get_user(flags, &ss->flags)) + return; + + flags &= ASYNC_DEPRECATED; + + if (flags && __ratelimit(&depr_flags)) + pr_warning("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", + __func__, get_task_comm(comm, current), flags); +} + /* * if pty, return the slave side (real_tty) * otherwise, return self @@ -2903,6 +2921,9 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; } break; + case TIOCSSERIAL: + tty_warn_deprecated_flags(p); + break; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); diff --git a/include/uapi/linux/tty_flags.h b/include/uapi/linux/tty_flags.h index d2bc4ff94f43..fae4864737fa 100644 --- a/include/uapi/linux/tty_flags.h +++ b/include/uapi/linux/tty_flags.h @@ -64,6 +64,8 @@ #define ASYNC_MAGIC_MULTIPLIER (1U << ASYNCB_MAGIC_MULTIPLIER) #define ASYNC_FLAGS ((1U << (ASYNCB_LAST_USER + 1)) - 1) +#define ASYNC_DEPRECATED (ASYNC_SESSION_LOCKOUT | ASYNC_PGRP_LOCKOUT | \ + ASYNC_CALLOUT_NOHUP | ASYNC_AUTOPROBE) #define ASYNC_USR_MASK (ASYNC_SPD_MASK|ASYNC_CALLOUT_NOHUP| \ ASYNC_LOW_LATENCY) #define ASYNC_SPD_CUST (ASYNC_SPD_HI|ASYNC_SPD_VHI) -- cgit v1.2.3 From 70f2f5c70440feff01d9ba7c8b4432eb72bd69eb Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 24 Oct 2014 12:11:11 +0100 Subject: drm/i915: Report the actual swizzling back to userspace Userspace cares about whether or not swizzling depends on the page address for its direct access into bound objects. Extend the get_tiling ioctl to report the physical swizzling value in addition to the logical swizzling value so that userspace can accurately determine when it is possible for manual detiling. Signed-off-by: Chris Wilson Cc: Akash Goel Cc: Daniel Vetter Testcase: igt/gem_tiled_wc Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_gem_tiling.c | 1 + include/uapi/drm/i915_drm.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index d1e7a3e088aa..749ab485569e 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -458,6 +458,7 @@ i915_gem_get_tiling(struct drm_device *dev, void *data, } /* Hide bit 17 from the user -- see comment in i915_gem_set_tiling */ + args->phys_swizzle_mode = args->swizzle_mode; if (args->swizzle_mode == I915_BIT_6_SWIZZLE_9_17) args->swizzle_mode = I915_BIT_6_SWIZZLE_9; if (args->swizzle_mode == I915_BIT_6_SWIZZLE_9_10_17) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index ff57f07c3249..2ec0efcaa719 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -876,6 +876,12 @@ struct drm_i915_gem_get_tiling { * mmap mapping. */ __u32 swizzle_mode; + + /** + * Returned address bit 6 swizzling required for CPU access through + * mmap mapping whilst bound. + */ + __u32 phys_swizzle_mode; }; struct drm_i915_gem_get_aperture { -- cgit v1.2.3 From 31d4ea1a093fcf668d5f95af44b8d41488bdb7ec Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 24 Oct 2014 12:20:27 +0200 Subject: Drivers: hv: util: make struct hv_do_fcopy match Hyper-V host messages An attempt to fix fcopy on i586 (bc5a5b0 Drivers: hv: util: Properly pack the data for file copy functionality) led to a regression on x86_64 (and actually didn't fix i586 breakage). Fcopy messages from Hyper-V host come in the following format: struct do_fcopy_hdr | 36 bytes 0000 | 4 bytes offset | 8 bytes size | 4 bytes data | 6144 bytes On x86_64 struct hv_do_fcopy matched this format without ' __attribute__((packed))' and on i586 adding ' __attribute__((packed))' to it doesn't change anything. Keep the structure packed and add padding to match re reality. Tested both i586 and x86_64 on Hyper-V Server 2012 R2. Signed-off-by: Vitaly Kuznetsov Signed-off-by: K. Y. Srinivasan Cc: Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/hyperv.h b/include/uapi/linux/hyperv.h index 0a8e6badb29b..bb1cb73c927a 100644 --- a/include/uapi/linux/hyperv.h +++ b/include/uapi/linux/hyperv.h @@ -134,6 +134,7 @@ struct hv_start_fcopy { struct hv_do_fcopy { struct hv_fcopy_hdr hdr; + __u32 pad; __u64 offset; __u32 size; __u8 data[DATA_FRAGMENT]; -- cgit v1.2.3 From df32dd2054b6edcbdfd3a31aec24e7fd0edba2ac Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 3 Nov 2014 12:42:34 -0800 Subject: uapi: resort Kbuild entries The entries in the Kbuild files are incorrectly sorted. Matters for aesthetics only. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 88 +++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 44 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 4c94f31a8c99..72298b6887ac 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -37,27 +37,27 @@ header-y += aio_abi.h header-y += apm_bios.h header-y += arcfb.h header-y += atalk.h -header-y += atm.h -header-y += atm_eni.h -header-y += atm_he.h -header-y += atm_idt77105.h -header-y += atm_nicstar.h -header-y += atm_tcp.h -header-y += atm_zatm.h header-y += atmapi.h header-y += atmarp.h header-y += atmbr2684.h header-y += atmclip.h header-y += atmdev.h +header-y += atm_eni.h +header-y += atm.h +header-y += atm_he.h +header-y += atm_idt77105.h header-y += atmioc.h header-y += atmlec.h header-y += atmmpc.h +header-y += atm_nicstar.h header-y += atmppp.h header-y += atmsap.h header-y += atmsvc.h +header-y += atm_tcp.h +header-y += atm_zatm.h header-y += audit.h -header-y += auto_fs.h header-y += auto_fs4.h +header-y += auto_fs.h header-y += auxvec.h header-y += ax25.h header-y += b1lli.h @@ -67,8 +67,8 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h -header-y += bpf.h header-y += bpf_common.h +header-y += bpf.h header-y += bpqether.h header-y += bsg.h header-y += btrfs.h @@ -93,21 +93,21 @@ header-y += cyclades.h header-y += cycx_cfm.h header-y += dcbnl.h header-y += dccp.h -header-y += dlm.h +header-y += dlmconstants.h header-y += dlm_device.h +header-y += dlm.h header-y += dlm_netlink.h header-y += dlm_plock.h -header-y += dlmconstants.h header-y += dm-ioctl.h header-y += dm-log-userspace.h header-y += dn.h header-y += dqblk_xfs.h header-y += edd.h header-y += efs_fs_sb.h +header-y += elfcore.h header-y += elf-em.h header-y += elf-fdpic.h header-y += elf.h -header-y += elfcore.h header-y += errno.h header-y += errqueue.h header-y += ethtool.h @@ -131,15 +131,15 @@ header-y += fsl_hypervisor.h header-y += fuse.h header-y += futex.h header-y += gameport.h -header-y += gen_stats.h header-y += genetlink.h +header-y += gen_stats.h header-y += gfs2_ondisk.h header-y += gigaset_dev.h -header-y += hdlc.h header-y += hdlcdrv.h +header-y += hdlc.h header-y += hdreg.h -header-y += hid.h header-y += hiddev.h +header-y += hid.h header-y += hidraw.h header-y += hpet.h header-y += hsr_netlink.h @@ -151,7 +151,6 @@ header-y += i2o-dev.h header-y += i8k.h header-y += icmp.h header-y += icmpv6.h -header-y += if.h header-y += if_addr.h header-y += if_addrlabel.h header-y += if_alg.h @@ -165,6 +164,7 @@ header-y += if_ether.h header-y += if_fc.h header-y += if_fddi.h header-y += if_frad.h +header-y += if.h header-y += if_hippi.h header-y += if_infiniband.h header-y += if_link.h @@ -182,40 +182,40 @@ header-y += if_tunnel.h header-y += if_vlan.h header-y += if_x25.h header-y += igmp.h -header-y += in.h header-y += in6.h -header-y += in_route.h header-y += inet_diag.h +header-y += in.h header-y += inotify.h header-y += input.h +header-y += in_route.h header-y += ioctl.h -header-y += ip.h header-y += ip6_tunnel.h -header-y += ip_vs.h header-y += ipc.h +header-y += ip.h header-y += ipmi.h header-y += ipmi_msgdefs.h header-y += ipsec.h header-y += ipv6.h header-y += ipv6_route.h +header-y += ip_vs.h header-y += ipx.h header-y += irda.h header-y += irqnr.h -header-y += isdn.h header-y += isdn_divertif.h -header-y += isdn_ppp.h +header-y += isdn.h header-y += isdnif.h +header-y += isdn_ppp.h header-y += iso_fs.h -header-y += ivtv.h header-y += ivtvfb.h +header-y += ivtv.h header-y += ixjuser.h header-y += jffs2.h header-y += joystick.h -header-y += kd.h header-y += kdev_t.h -header-y += kernel-page-flags.h -header-y += kernel.h +header-y += kd.h header-y += kernelcapi.h +header-y += kernel.h +header-y += kernel-page-flags.h header-y += kexec.h header-y += keyboard.h header-y += keyctl.h @@ -231,6 +231,7 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h \ header-y += kvm_para.h endif +header-y += hw_breakpoint.h header-y += l2tp.h header-y += libc-compat.h header-y += limits.h @@ -255,43 +256,43 @@ header-y += mman.h header-y += mmtimer.h header-y += mpls.h header-y += mqueue.h -header-y += mroute.h header-y += mroute6.h +header-y += mroute.h header-y += msdos_fs.h header-y += msg.h header-y += mtio.h -header-y += n_r3964.h header-y += nbd.h -header-y += ncp.h header-y += ncp_fs.h +header-y += ncp.h header-y += ncp_mount.h header-y += ncp_no.h header-y += neighbour.h -header-y += net.h -header-y += net_dropmon.h -header-y += net_tstamp.h header-y += netconf.h header-y += netdevice.h -header-y += netlink_diag.h -header-y += netfilter.h +header-y += net_dropmon.h header-y += netfilter_arp.h header-y += netfilter_bridge.h header-y += netfilter_decnet.h +header-y += netfilter.h header-y += netfilter_ipv4.h header-y += netfilter_ipv6.h +header-y += net.h +header-y += netlink_diag.h header-y += netlink.h header-y += netrom.h +header-y += net_tstamp.h header-y += nfc.h -header-y += nfs.h header-y += nfs2.h header-y += nfs3.h header-y += nfs4.h header-y += nfs4_mount.h +header-y += nfsacl.h header-y += nfs_fs.h +header-y += nfs.h header-y += nfs_idmap.h header-y += nfs_mount.h -header-y += nfsacl.h header-y += nl80211.h +header-y += n_r3964.h header-y += nubus.h header-y += nvme.h header-y += nvram.h @@ -311,16 +312,16 @@ header-y += pfkeyv2.h header-y += pg.h header-y += phantom.h header-y += phonet.h +header-y += pktcdvd.h header-y += pkt_cls.h header-y += pkt_sched.h -header-y += pktcdvd.h header-y += pmu.h header-y += poll.h header-y += posix_types.h header-y += ppdev.h header-y += ppp-comp.h -header-y += ppp-ioctl.h header-y += ppp_defs.h +header-y += ppp-ioctl.h header-y += pps.h header-y += prctl.h header-y += psci.h @@ -352,13 +353,13 @@ header-y += seccomp.h header-y += securebits.h header-y += selinux_netlink.h header-y += sem.h -header-y += serial.h header-y += serial_core.h +header-y += serial.h header-y += serial_reg.h header-y += serio.h header-y += shm.h -header-y += signal.h header-y += signalfd.h +header-y += signal.h header-y += smiapp.h header-y += snmp.h header-y += sock_diag.h @@ -367,8 +368,8 @@ header-y += sockios.h header-y += som.h header-y += sonet.h header-y += sonypi.h -header-y += sound.h header-y += soundcard.h +header-y += sound.h header-y += stat.h header-y += stddef.h header-y += string.h @@ -387,11 +388,11 @@ header-y += time.h header-y += times.h header-y += timex.h header-y += tiocl.h -header-y += tipc.h header-y += tipc_config.h +header-y += tipc.h header-y += toshiba.h -header-y += tty.h header-y += tty_flags.h +header-y += tty.h header-y += types.h header-y += udf_fs_i.h header-y += udp.h @@ -437,6 +438,5 @@ header-y += wireless.h header-y += x25.h header-y += xattr.h header-y += xfrm.h -header-y += hw_breakpoint.h header-y += zorro.h header-y += zorro_ids.h -- cgit v1.2.3 From 36cbb2452cbafca64dcdd3578047433787900cf0 Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Thu, 6 Nov 2014 10:37:54 -0800 Subject: udp: Increment UDP_MIB_IGNOREDMULTI for arriving unmatched multicasts As NIC multicast filtering isn't perfect, and some platforms are quite content to spew broadcasts, we should not trigger an event for skb:kfree_skb when we do not have a match for such an incoming datagram. We do though want to avoid sweeping the matter under the rug entirely, so increment a suitable statistic. This incorporates feedback from David L. Stevens, Karl Neiss and Eric Dumazet. V3 - use bool per David Miller Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/udp.c | 12 +++++++++--- net/ipv6/proc.c | 1 + net/ipv6/udp.c | 11 ++++++++--- 5 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df40137f33dd..30f541b32895 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -156,6 +156,7 @@ enum UDP_MIB_RCVBUFERRORS, /* RcvbufErrors */ UDP_MIB_SNDBUFERRORS, /* SndbufErrors */ UDP_MIB_CSUMERRORS, /* InCsumErrors */ + UDP_MIB_IGNOREDMULTI, /* IgnoredMulti */ __UDP_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f0d4eb8b99b9..6513ade8d6dc 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -181,6 +181,7 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index df19027f44f3..5d0fdca8e965 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1647,7 +1647,8 @@ static void udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, - struct udp_table *udptable) + struct udp_table *udptable, + int proto) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; struct hlist_nulls_node *node; @@ -1656,6 +1657,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif = skb->dev->ifindex; unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1674,6 +1676,7 @@ start_lookup: dif, hnum)) { if (unlikely(count == ARRAY_SIZE(stack))) { flush_stack(stack, count, skb, ~0); + inner_flushed = true; count = 0; } stack[count++] = sk; @@ -1695,7 +1698,10 @@ start_lookup: if (count) { flush_stack(stack, count, skb, count - 1); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -1781,7 +1787,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, - saddr, daddr, udptable); + saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk != NULL) { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 1752cd0b4882..679253d0af84 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -136,6 +136,7 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6RcvbufErrors", UDP_MIB_RCVBUFERRORS), SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), + SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_SENTINEL }; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9b6809232b17..b756355e9739 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -771,7 +771,7 @@ static void udp6_csum_zero_error(struct sk_buff *skb) */ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, - struct udp_table *udptable) + struct udp_table *udptable, int proto) { struct sock *sk, *stack[256 / sizeof(struct sock *)]; const struct udphdr *uh = udp_hdr(skb); @@ -781,6 +781,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, int dif = inet6_iif(skb); unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); + bool inner_flushed = false; if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & @@ -803,6 +804,7 @@ start_lookup: (uh->check || udp_sk(sk)->no_check6_rx)) { if (unlikely(count == ARRAY_SIZE(stack))) { flush_stack(stack, count, skb, ~0); + inner_flushed = true; count = 0; } stack[count++] = sk; @@ -821,7 +823,10 @@ start_lookup: if (count) { flush_stack(stack, count, skb, count - 1); } else { - kfree_skb(skb); + if (!inner_flushed) + UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); + consume_skb(skb); } return 0; } @@ -873,7 +878,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, */ if (ipv6_addr_is_multicast(daddr)) return __udp6_lib_mcast_deliver(net, skb, - saddr, daddr, udptable); + saddr, daddr, udptable, proto); /* Unicast */ -- cgit v1.2.3 From ce674173e9f4ef7fd0dc04ea0773cdedfbf8e366 Mon Sep 17 00:00:00 2001 From: Ana Rey Date: Mon, 3 Nov 2014 18:10:50 +0100 Subject: netfilter: nft_meta: add cgroup support This allows you to filter traffic by process control group (cgroup). Signed-off-by: Ana Rey Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_meta.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 16f62a5cf04d..832bc46db78b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -579,6 +579,7 @@ enum nft_exthdr_attributes { * @NFT_META_CPU: cpu id through smp_processor_id() * @NFT_META_IIFGROUP: packet input interface group * @NFT_META_OIFGROUP: packet output interface group + * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid) */ enum nft_meta_keys { NFT_META_LEN, @@ -604,6 +605,7 @@ enum nft_meta_keys { NFT_META_CPU, NFT_META_IIFGROUP, NFT_META_OIFGROUP, + NFT_META_CGROUP, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 1e7c076ca63a..e99911eda915 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -165,6 +165,12 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; dest->data[0] = out->group; break; + case NFT_META_CGROUP: + if (skb->sk == NULL) + break; + + dest->data[0] = skb->sk->sk_classid; + break; default: WARN_ON(1); goto err; @@ -240,6 +246,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_CPU: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: + case NFT_META_CGROUP: break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From 8f0aad6f35f7e8b3118b7b8a65e8e76b135cc4cb Mon Sep 17 00:00:00 2001 From: Wenyu Zhang Date: Thu, 6 Nov 2014 06:51:24 -0800 Subject: openvswitch: Extend packet attribute for egress tunnel info OVS vswitch has extended IPFIX exporter to export tunnel headers to improve network visibility. To export this information userspace needs to know egress tunnel for given packet. By extending packet attributes datapath can export egress tunnel info for given packet. So that userspace can ask for egress tunnel info in userspace action. This information is used to build IPFIX data for given flow. Signed-off-by: Wenyu Zhang Acked-by: Romain Lenglet Acked-by: Ben Pfaff Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 13 +++++++++ net/openvswitch/actions.c | 19 ++++++++++++ net/openvswitch/datapath.c | 21 +++++++++++--- net/openvswitch/datapath.h | 2 ++ net/openvswitch/flow.h | 62 +++++++++++++++++++++++++++++++--------- net/openvswitch/flow_netlink.c | 54 +++++++++++++++++++++++++++------- net/openvswitch/flow_netlink.h | 3 ++ net/openvswitch/vport-geneve.c | 21 +++++++++++++- net/openvswitch/vport-gre.c | 12 +++++++- net/openvswitch/vport-vxlan.c | 24 +++++++++++++++- net/openvswitch/vport.c | 61 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport.h | 14 +++++++++ 12 files changed, 275 insertions(+), 31 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 26c36c4cf7e2..cf8185661e52 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -157,6 +157,11 @@ enum ovs_packet_cmd { * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an * %OVS_USERSPACE_ATTR_USERDATA attribute, with the same length and content * specified there. + * @OVS_PACKET_ATTR_EGRESS_TUN_KEY: Present for an %OVS_PACKET_CMD_ACTION + * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an + * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the + * output port is actually a tunnel port. Contains the output tunnel key + * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -167,6 +172,8 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */ OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ OVS_PACKET_ATTR_USERDATA, /* OVS_ACTION_ATTR_USERSPACE arg. */ + OVS_PACKET_ATTR_EGRESS_TUN_KEY, /* Nested OVS_TUNNEL_KEY_ATTR_* + attributes. */ __OVS_PACKET_ATTR_MAX }; @@ -315,6 +322,8 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */ OVS_TUNNEL_KEY_ATTR_OAM, /* No argument. OAM frame. */ OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, /* Array of Geneve options. */ + OVS_TUNNEL_KEY_ATTR_TP_SRC, /* be16 src Transport Port. */ + OVS_TUNNEL_KEY_ATTR_TP_DST, /* be16 dst Transport Port. */ __OVS_TUNNEL_KEY_ATTR_MAX }; @@ -480,11 +489,15 @@ enum ovs_sample_attr { * message should be sent. Required. * @OVS_USERSPACE_ATTR_USERDATA: If present, its variable-length argument is * copied to the %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA. + * @OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: If present, u32 output port to get + * tunnel info. */ enum ovs_userspace_attr { OVS_USERSPACE_ATTR_UNSPEC, OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */ OVS_USERSPACE_ATTR_USERDATA, /* Optional user-specified cookie. */ + OVS_USERSPACE_ATTR_EGRESS_TUN_PORT, /* Optional, u32 output port + * to get tunnel info. */ __OVS_USERSPACE_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index f7e589159e4a..ceb618cf1292 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -564,6 +564,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr) { + struct ovs_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; @@ -572,6 +573,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, upcall.key = key; upcall.userdata = NULL; upcall.portid = 0; + upcall.egress_tun_info = NULL; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -583,7 +585,24 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, case OVS_USERSPACE_ATTR_PID: upcall.portid = nla_get_u32(a); break; + + case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { + /* Get out tunnel info. */ + struct vport *vport; + + vport = ovs_vport_rcu(dp, nla_get_u32(a)); + if (vport) { + int err; + + err = ovs_vport_get_egress_tun_info(vport, skb, + &info); + if (!err) + upcall.egress_tun_info = &info; + } + break; } + + } /* End of switch. */ } return ovs_dp_upcall(dp, skb, &upcall); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6cfb44f3a7f0..c2ac340e19fb 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -274,6 +274,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) upcall.key = key; upcall.userdata = NULL; upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.egress_tun_info = NULL; error = ovs_dp_upcall(dp, skb, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -375,7 +376,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, return err; } -static size_t upcall_msg_size(const struct nlattr *userdata, +static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, unsigned int hdrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) @@ -383,8 +384,12 @@ static size_t upcall_msg_size(const struct nlattr *userdata, + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ /* OVS_PACKET_ATTR_USERDATA */ - if (userdata) - size += NLA_ALIGN(userdata->nla_len); + if (upcall_info->userdata) + size += NLA_ALIGN(upcall_info->userdata->nla_len); + + /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ + if (upcall_info->egress_tun_info) + size += nla_total_size(ovs_tun_key_attr_size()); return size; } @@ -440,7 +445,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, else hlen = skb->len; - len = upcall_msg_size(upcall_info->userdata, hlen); + len = upcall_msg_size(upcall_info, hlen); user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; @@ -461,6 +466,14 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); + if (upcall_info->egress_tun_info) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); + err = ovs_nla_put_egress_tunnel_key(user_skb, + upcall_info->egress_tun_info); + BUG_ON(err); + nla_nest_end(user_skb, nla); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 1c56a80d6677..2bc577bf9b31 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -114,12 +114,14 @@ struct ovs_skb_cb { * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no * packet is sent and the packet is accounted in the datapath's @n_lost * counter. + * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. */ struct dp_upcall_info { u8 cmd; const struct sw_flow_key *key; const struct nlattr *userdata; u32 portid; + const struct ovs_tunnel_info *egress_tun_info; }; /** diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 4962bee81a11..543b358ee57f 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -37,8 +37,8 @@ struct sk_buff; /* Used to memset ovs_key_ipv4_tunnel padding. */ #define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl)) + (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ + FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) struct ovs_key_ipv4_tunnel { __be64 tun_id; @@ -47,6 +47,8 @@ struct ovs_key_ipv4_tunnel { __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; } __packed __aligned(4); /* Minimize padding. */ struct ovs_tunnel_info { @@ -64,27 +66,59 @@ struct ovs_tunnel_info { FIELD_SIZEOF(struct sw_flow_key, tun_opts) - \ opt_len)) -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be64 tun_id, __be16 tun_flags, - struct geneve_opt *opts, - u8 opts_len) +static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, + __be16 tp_dst, + __be64 tun_id, + __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) { tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = iph->saddr; - tun_info->tunnel.ipv4_dst = iph->daddr; - tun_info->tunnel.ipv4_tos = iph->tos; - tun_info->tunnel.ipv4_ttl = iph->ttl; + tun_info->tunnel.ipv4_src = saddr; + tun_info->tunnel.ipv4_dst = daddr; + tun_info->tunnel.ipv4_tos = tos; + tun_info->tunnel.ipv4_ttl = ttl; tun_info->tunnel.tun_flags = tun_flags; - /* clear struct padding. */ - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->tunnel.tp_src = tp_src; + tun_info->tunnel.tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); tun_info->options = opts; tun_info->options_len = opts_len; } +static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, + __be16 tp_dst, + __be64 tun_id, + __be16 tun_flags, + struct geneve_opt *opts, + u8 opts_len) +{ + __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, + tp_src, tp_dst, + tun_id, tun_flags, + opts, opts_len); +} + +#define OVS_SW_FLOW_KEY_METADATA_SIZE \ + (offsetof(struct sw_flow_key, recirc_id) + \ + FIELD_SIZEOF(struct sw_flow_key, recirc_id)) + struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ed3109761827..98a3e96b7d93 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -245,6 +245,24 @@ static bool match_validate(const struct sw_flow_match *match, return true; } +size_t ovs_tun_key_attr_size(void) +{ + /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ + + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ +} + size_t ovs_key_attr_size(void) { /* Whenever adding new OVS_KEY_ FIELDS, we should consider @@ -254,15 +272,7 @@ size_t ovs_key_attr_size(void) return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ - + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ - + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + + ovs_tun_key_attr_size() + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ @@ -393,6 +403,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, [OVS_TUNNEL_KEY_ATTR_TTL] = 1, [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = sizeof(u16), + [OVS_TUNNEL_KEY_ATTR_TP_DST] = sizeof(u16), [OVS_TUNNEL_KEY_ATTR_OAM] = 0, [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = -1, }; @@ -440,6 +452,14 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, case OVS_TUNNEL_KEY_ATTR_CSUM: tun_flags |= TUNNEL_CSUM; break; + case OVS_TUNNEL_KEY_ATTR_TP_SRC: + SW_FLOW_KEY_PUT(match, tun_key.tp_src, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TP_DST: + SW_FLOW_KEY_PUT(match, tun_key.tp_dst, + nla_get_be16(a), is_mask); + break; case OVS_TUNNEL_KEY_ATTR_OAM: tun_flags |= TUNNEL_OAM; break; @@ -548,6 +568,12 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if ((output->tun_flags & TUNNEL_CSUM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) return -EMSGSIZE; + if (output->tp_src && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src)) + return -EMSGSIZE; + if (output->tp_dst && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) + return -EMSGSIZE; if ((output->tun_flags & TUNNEL_OAM) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) return -EMSGSIZE; @@ -559,7 +585,6 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } - static int ipv4_tun_to_nlattr(struct sk_buff *skb, const struct ovs_key_ipv4_tunnel *output, const struct geneve_opt *tun_opts, @@ -580,6 +605,14 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, return 0; } +int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, + const struct ovs_tunnel_info *egress_tun_info) +{ + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + egress_tun_info->options, + egress_tun_info->options_len); +} + static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, const struct nlattr **a, bool is_mask) { @@ -1653,6 +1686,7 @@ static int validate_userspace(const struct nlattr *attr) static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 }, }; struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index eb0b177300ad..90bbe3785504 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -37,6 +37,7 @@ #include "flow.h" +size_t ovs_tun_key_attr_size(void); size_t ovs_key_attr_size(void); void ovs_match_init(struct sw_flow_match *match, @@ -49,6 +50,8 @@ int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *); int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *, const struct nlattr *); +int ovs_nla_put_egress_tunnel_key(struct sk_buff *, + const struct ovs_tunnel_info *); int ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 70c9765011f4..e31f19c922e2 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -97,7 +97,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) key = vni_to_tunnel_id(geneveh->vni); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, flags, + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, geneveh->options, opts_len); ovs_vport_receive(vport, skb, &tun_info); @@ -228,6 +230,22 @@ static const char *geneve_get_name(const struct vport *vport) return geneve_port->name; } +static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + + /* Get tp_src and tp_dst, refert to geneve_build_header(). + */ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, sport, dport); +} + static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, .create = geneve_tnl_create, @@ -236,6 +254,7 @@ static struct vport_ops ovs_geneve_vport_ops = { .get_options = geneve_get_options, .send = geneve_tnl_send, .owner = THIS_MODULE, + .get_egress_tun_info = geneve_get_egress_tun_info, }; static int __init ovs_geneve_tnl_init(void) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 00270b608844..8e61a5c6ae7c 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -108,7 +108,7 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), key, + ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); @@ -284,12 +284,22 @@ static void gre_tnl_destroy(struct vport *vport) gre_exit(); } +static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_GRE, skb->mark, 0, 0); +} + static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, .destroy = gre_tnl_destroy, .get_name = gre_get_name, .send = gre_tnl_send, + .get_egress_tun_info = gre_get_egress_tun_info, .owner = THIS_MODULE, }; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 965e7500c5a6..38f95a52241b 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -69,7 +69,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni) /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(vx_vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, key, TUNNEL_KEY, NULL, 0); + ovs_flow_tun_info_init(&tun_info, iph, + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, TUNNEL_KEY, NULL, 0); ovs_vport_receive(vport, skb, &tun_info); } @@ -189,6 +191,25 @@ error: return err; } +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *egress_tun_info) +{ + struct net *net = ovs_dp_get_net(vport->dp); + struct vxlan_port *vxlan_port = vxlan_vport(vport); + __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + static const char *vxlan_get_name(const struct vport *vport) { struct vxlan_port *vxlan_port = vxlan_vport(vport); @@ -202,6 +223,7 @@ static struct vport_ops ovs_vxlan_vport_ops = { .get_name = vxlan_get_name, .get_options = vxlan_get_options, .send = vxlan_tnl_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, .owner = THIS_MODULE, }; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 4b5dd18953a6..630e81984b65 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -573,3 +573,64 @@ void ovs_vport_deferred_free(struct vport *vport) call_rcu(&vport->rcu, free_vport_rcu); } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); + +int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, + struct net *net, + const struct ovs_tunnel_info *tun_info, + u8 ipproto, + u32 skb_mark, + __be16 tp_src, + __be16 tp_dst) +{ + const struct ovs_key_ipv4_tunnel *tun_key; + struct rtable *rt; + struct flowi4 fl; + + if (unlikely(!tun_info)) + return -EINVAL; + + tun_key = &tun_info->tunnel; + + /* Route lookup to get srouce IP address. + * The process may need to be changed if the corresponding process + * in vports ops changed. + */ + memset(&fl, 0, sizeof(fl)); + fl.daddr = tun_key->ipv4_dst; + fl.saddr = tun_key->ipv4_src; + fl.flowi4_tos = RT_TOS(tun_key->ipv4_tos); + fl.flowi4_mark = skb_mark; + fl.flowi4_proto = IPPROTO_GRE; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + + /* Generate egress_tun_info based on tun_info, + * saddr, tp_src and tp_dst + */ + __ovs_flow_tun_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); + + return 0; +} +EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); + +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *info) +{ + /* get_egress_tun_info() is only implemented on tunnel ports. */ + if (unlikely(!vport->ops->get_egress_tun_info)) + return -EINVAL; + + return vport->ops->get_egress_tun_info(vport, skb, info); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e41c3facf799..0635d1d761e9 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -58,6 +58,16 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); +int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, + struct net *net, + const struct ovs_tunnel_info *tun_info, + u8 ipproto, + u32 skb_mark, + __be16 tp_src, + __be16 tp_dst); +int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ovs_tunnel_info *info); + /* The following definitions are for implementers of vport devices: */ struct vport_err_stats { @@ -146,6 +156,8 @@ struct vport_parms { * @get_name: Get the device's name. * @send: Send a packet on the device. Returns the length of the packet sent, * zero for dropped packets or negative for error. + * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for + * a packet. */ struct vport_ops { enum ovs_vport_type type; @@ -161,6 +173,8 @@ struct vport_ops { const char *(*get_name)(const struct vport *); int (*send)(struct vport *, struct sk_buff *); + int (*get_egress_tun_info)(struct vport *, struct sk_buff *, + struct ovs_tunnel_info *); struct module *owner; struct list_head list; -- cgit v1.2.3 From 05da5898a96c05e32aa9850c9cd89eef29471b13 Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Thu, 6 Nov 2014 07:03:05 -0800 Subject: openvswitch: Add support for OVS_FLOW_ATTR_PROBE. This new flag is useful for suppressing error logging while probing for datapath features using flow commands. For backwards compatibility reasons the commands are executed normally, but error logging is suppressed. Signed-off-by: Jarno Rajahalme Signed-off-by: Pravin B Shelar --- include/uapi/linux/openvswitch.h | 2 + net/openvswitch/datapath.c | 49 ++++--- net/openvswitch/datapath.h | 6 +- net/openvswitch/flow.c | 4 +- net/openvswitch/flow.h | 2 +- net/openvswitch/flow_netlink.c | 303 +++++++++++++++++++++------------------ net/openvswitch/flow_netlink.h | 10 +- 7 files changed, 208 insertions(+), 168 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index cf8185661e52..3a6dcaa359b7 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -457,6 +457,8 @@ enum ovs_flow_attr { OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ OVS_FLOW_ATTR_MASK, /* Sequence of OVS_KEY_ATTR_* attributes. */ + OVS_FLOW_ATTR_PROBE, /* Flow operation is a feature probe, error + * logging should be suppressed. */ __OVS_FLOW_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 65561ebb489e..ab141d49bb9d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -526,6 +526,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct vport *input_vport; int len; int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || @@ -559,12 +560,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_kfree_skb; err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key); + &flow->key, log); if (err) goto err_flow_free; err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, &acts); + &flow->key, &acts, log); if (err) goto err_flow_free; @@ -855,15 +856,16 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *acts; struct sw_flow_match match; int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Must have key and actions. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR("Flow key attribute not present in new flow.\n"); + OVS_NLERR(log, "Flow key attr not present in new flow."); goto error; } if (!a[OVS_FLOW_ATTR_ACTIONS]) { - OVS_NLERR("Flow actions attribute not present in new flow.\n"); + OVS_NLERR(log, "Flow actions attr not present in new flow."); goto error; } @@ -878,8 +880,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ ovs_match_init(&match, &new_flow->unmasked_key, &mask); - error = ovs_nla_get_match(&match, - a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; @@ -887,9 +889,9 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Validate actions. */ error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - &acts); + &acts, log); if (error) { - OVS_NLERR("Flow actions may not be safe on all matching packets.\n"); + OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_flow; } @@ -942,6 +944,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* The unmasked key has to be the same for flow updates. */ if (unlikely(!ovs_flow_cmp_unmasked_key(flow, &match))) { + /* Look for any overlapping flow. */ flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { error = -ENOENT; @@ -984,16 +987,18 @@ error: /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, const struct sw_flow_key *key, - const struct sw_flow_mask *mask) + const struct sw_flow_mask *mask, + bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; int error; ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, &acts); + error = ovs_nla_copy_actions(a, &masked_key, &acts, log); if (error) { - OVS_NLERR("Actions may not be safe on all matching packets.\n"); + OVS_NLERR(log, + "Actions may not be safe on all matching packets"); return ERR_PTR(error); } @@ -1012,23 +1017,25 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sw_flow_actions *old_acts = NULL, *acts = NULL; struct sw_flow_match match; int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; /* Extract key. */ error = -EINVAL; if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR("Flow key attribute not present in set flow.\n"); + OVS_NLERR(log, "Flow key attribute not present in set flow."); goto error; } ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, - a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK]); + error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask); + acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, + log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; @@ -1109,14 +1116,16 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct sw_flow_match match; int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR("Flow get message rejected, Key attribute missing.\n"); + OVS_NLERR(log, + "Flow get message rejected, Key attribute missing."); return -EINVAL; } ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, log); if (err) return err; @@ -1157,10 +1166,12 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct sw_flow_match match; int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; if (likely(a[OVS_FLOW_ATTR_KEY])) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL); + err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + log); if (unlikely(err)) return err; } @@ -1250,8 +1261,10 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, }; static const struct genl_ops dp_flow_genl_ops[] = { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 8389c1d68e57..3ece94563079 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -199,9 +199,9 @@ void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); -#define OVS_NLERR(fmt, ...) \ +#define OVS_NLERR(logging_allowed, fmt, ...) \ do { \ - if (net_ratelimit()) \ - pr_info("netlink: " fmt, ##__VA_ARGS__); \ + if (logging_allowed && net_ratelimit()) \ + pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ } while (0) #endif /* datapath.h */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 25e9abcd51f1..70bef2ab7f2b 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -712,12 +712,12 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, - struct sw_flow_key *key) + struct sw_flow_key *key, bool log) { int err; /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(attr, key); + err = ovs_nla_get_flow_metadata(attr, key, log); if (err) return err; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 9e0a787c8627..a8b30f334388 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -257,6 +257,6 @@ int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, /* Extract key from packet coming from userspace. */ int ovs_flow_key_extract_userspace(const struct nlattr *attr, struct sk_buff *skb, - struct sw_flow_key *key); + struct sw_flow_key *key, bool log); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 98a3e96b7d93..c0d066def228 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -112,7 +112,7 @@ static void update_range(struct sw_flow_match *match, } while (0) static bool match_validate(const struct sw_flow_match *match, - u64 key_attrs, u64 mask_attrs) + u64 key_attrs, u64 mask_attrs, bool log) { u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; u64 mask_allowed = key_attrs; /* At most allow all key attributes */ @@ -230,15 +230,17 @@ static bool match_validate(const struct sw_flow_match *match, if ((key_attrs & key_expected) != key_expected) { /* Key attributes check failed. */ - OVS_NLERR("Missing expected key attributes (key_attrs=%llx, expected=%llx).\n", - (unsigned long long)key_attrs, (unsigned long long)key_expected); + OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", + (unsigned long long)key_attrs, + (unsigned long long)key_expected); return false; } if ((mask_attrs & mask_allowed) != mask_attrs) { /* Mask attributes check failed. */ - OVS_NLERR("Contain more than allowed mask fields (mask_attrs=%llx, mask_allowed=%llx).\n", - (unsigned long long)mask_attrs, (unsigned long long)mask_allowed); + OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)", + (unsigned long long)mask_attrs, + (unsigned long long)mask_allowed); return false; } @@ -328,7 +330,7 @@ static bool is_all_zero(const u8 *fp, size_t size) static int __parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], - u64 *attrsp, bool nz) + u64 *attrsp, bool log, bool nz) { const struct nlattr *nla; u64 attrs; @@ -340,21 +342,20 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, int expected_len; if (type > OVS_KEY_ATTR_MAX) { - OVS_NLERR("Unknown key attribute (type=%d, max=%d).\n", + OVS_NLERR(log, "Key type %d is out of range max %d", type, OVS_KEY_ATTR_MAX); return -EINVAL; } if (attrs & (1 << type)) { - OVS_NLERR("Duplicate key attribute (type %d).\n", type); + OVS_NLERR(log, "Duplicate key (type %d).", type); return -EINVAL; } expected_len = ovs_key_lens[type]; if (nla_len(nla) != expected_len && expected_len != -1) { - OVS_NLERR("Key attribute has unexpected length (type=%d" - ", length=%d, expected=%d).\n", type, - nla_len(nla), expected_len); + OVS_NLERR(log, "Key %d has unexpected len %d expected %d", + type, nla_len(nla), expected_len); return -EINVAL; } @@ -364,7 +365,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } } if (rem) { - OVS_NLERR("Message has %d unknown bytes.\n", rem); + OVS_NLERR(log, "Message has %d unknown bytes.", rem); return -EINVAL; } @@ -373,28 +374,84 @@ static int __parse_flow_nlattrs(const struct nlattr *attr, } static int parse_flow_mask_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) + const struct nlattr *a[], u64 *attrsp, + bool log) { - return __parse_flow_nlattrs(attr, a, attrsp, true); + return __parse_flow_nlattrs(attr, a, attrsp, log, true); } static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp) + const struct nlattr *a[], u64 *attrsp, + bool log) { - return __parse_flow_nlattrs(attr, a, attrsp, false); + return __parse_flow_nlattrs(attr, a, attrsp, log, false); +} + +static int genev_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "Geneve option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR(log, "Geneve option len %d != mask len %d", + match->key->tun_opts_len, nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + } + + opt_key_offset = (unsigned long)GENEVE_OPTS((struct sw_flow_key *)0, + nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; } static int ipv4_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask) + struct sw_flow_match *match, bool is_mask, + bool log) { struct nlattr *a; int rem; bool ttl = false; __be16 tun_flags = 0; - unsigned long opt_key_offset; nla_for_each_nested(a, attr, rem) { int type = nla_type(a); + int err; + static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), @@ -410,15 +467,14 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, }; if (type > OVS_TUNNEL_KEY_ATTR_MAX) { - OVS_NLERR("Unknown IPv4 tunnel attribute (type=%d, max=%d).\n", - type, OVS_TUNNEL_KEY_ATTR_MAX); + OVS_NLERR(log, "Tunnel attr %d out of range max %d", + type, OVS_TUNNEL_KEY_ATTR_MAX); return -EINVAL; } if (ovs_tunnel_key_lens[type] != nla_len(a) && ovs_tunnel_key_lens[type] != -1) { - OVS_NLERR("IPv4 tunnel attribute type has unexpected " - " length (type=%d, length=%d, expected=%d).\n", + OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", type, nla_len(a), ovs_tunnel_key_lens[type]); return -EINVAL; } @@ -464,58 +520,14 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_OAM; break; case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: - tun_flags |= TUNNEL_OPTIONS_PRESENT; - if (nla_len(a) > sizeof(match->key->tun_opts)) { - OVS_NLERR("Geneve option length exceeds maximum size (len %d, max %zu).\n", - nla_len(a), - sizeof(match->key->tun_opts)); - return -EINVAL; - } - - if (nla_len(a) % 4 != 0) { - OVS_NLERR("Geneve option length is not a multiple of 4 (len %d).\n", - nla_len(a)); - return -EINVAL; - } - - /* We need to record the length of the options passed - * down, otherwise packets with the same format but - * additional options will be silently matched. - */ - if (!is_mask) { - SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), - false); - } else { - /* This is somewhat unusual because it looks at - * both the key and mask while parsing the - * attributes (and by extension assumes the key - * is parsed first). Normally, we would verify - * that each is the correct length and that the - * attributes line up in the validate function. - * However, that is difficult because this is - * variable length and we won't have the - * information later. - */ - if (match->key->tun_opts_len != nla_len(a)) { - OVS_NLERR("Geneve option key length (%d) is different from mask length (%d).", - match->key->tun_opts_len, - nla_len(a)); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, - true); - } + err = genev_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; - opt_key_offset = (unsigned long)GENEVE_OPTS( - (struct sw_flow_key *)0, - nla_len(a)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, - nla_data(a), nla_len(a), - is_mask); + tun_flags |= TUNNEL_OPTIONS_PRESENT; break; default: - OVS_NLERR("Unknown IPv4 tunnel attribute (%d).\n", + OVS_NLERR(log, "Unknown IPv4 tunnel attribute %d", type); return -EINVAL; } @@ -524,18 +536,19 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); if (rem > 0) { - OVS_NLERR("IPv4 tunnel attribute has %d unknown bytes.\n", rem); + OVS_NLERR(log, "IPv4 tunnel attribute has %d unknown bytes.", + rem); return -EINVAL; } if (!is_mask) { if (!match->key->tun_key.ipv4_dst) { - OVS_NLERR("IPv4 tunnel destination address is zero.\n"); + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } if (!ttl) { - OVS_NLERR("IPv4 tunnel TTL not specified.\n"); + OVS_NLERR(log, "IPv4 tunnel TTL not specified."); return -EINVAL; } } @@ -614,7 +627,8 @@ int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, } static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask) + const struct nlattr **a, bool is_mask, + bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -642,7 +656,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, if (is_mask) { in_port = 0xffffffff; /* Always exact match in_port. */ } else if (in_port >= DP_MAX_PORTS) { - OVS_NLERR("Port (%d) exceeds maximum allowable (%d).\n", + OVS_NLERR(log, "Port %d exceeds max allowable %d", in_port, DP_MAX_PORTS); return -EINVAL; } @@ -661,7 +675,7 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { if (ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask)) + is_mask, log)) return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } @@ -669,11 +683,12 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask) + const struct nlattr **a, bool is_mask, + bool log) { int err; - err = metadata_from_nlattrs(match, &attrs, a, is_mask); + err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); if (err) return err; @@ -694,9 +709,9 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); if (!(tci & htons(VLAN_TAG_PRESENT))) { if (is_mask) - OVS_NLERR("VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit.\n"); + OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); else - OVS_NLERR("VLAN TCI does not have VLAN_TAG_PRESENT bit set.\n"); + OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); return -EINVAL; } @@ -713,8 +728,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, /* Always exact match EtherType. */ eth_type = htons(0xffff); } else if (ntohs(eth_type) < ETH_P_802_3_MIN) { - OVS_NLERR("EtherType is less than minimum (type=%x, min=%x).\n", - ntohs(eth_type), ETH_P_802_3_MIN); + OVS_NLERR(log, "EtherType %x is less than min %x", + ntohs(eth_type), ETH_P_802_3_MIN); return -EINVAL; } @@ -729,8 +744,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv4 fragment type (value=%d, max=%d).\n", - ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + OVS_NLERR(log, "IPv4 frag type %d is out of range max %d", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; } SW_FLOW_KEY_PUT(match, ip.proto, @@ -753,8 +768,8 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR("Unknown IPv6 fragment type (value=%d, max=%d).\n", - ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + OVS_NLERR(log, "IPv6 frag type %d is out of range max %d", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); return -EINVAL; } SW_FLOW_KEY_PUT(match, ipv6.label, @@ -784,7 +799,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); if (!is_mask && (arp_key->arp_op & htons(0xff00))) { - OVS_NLERR("Unknown ARP opcode (opcode=%d).\n", + OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).", arp_key->arp_op); return -EINVAL; } @@ -885,7 +900,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, } if (attrs != 0) { - OVS_NLERR("Unknown key attributes (%llx).\n", + OVS_NLERR(log, "Unknown key attributes %llx", (unsigned long long)attrs); return -EINVAL; } @@ -926,10 +941,14 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * of this flow. * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink * attribute specifies the mask field of the wildcarded flow. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. */ int ovs_nla_get_match(struct sw_flow_match *match, const struct nlattr *nla_key, - const struct nlattr *nla_mask) + const struct nlattr *nla_mask, + bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; const struct nlattr *encap; @@ -939,7 +958,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, bool encap_valid = false; int err; - err = parse_flow_nlattrs(nla_key, a, &key_attrs); + err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); if (err) return err; @@ -950,7 +969,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR("Invalid Vlan frame.\n"); + OVS_NLERR(log, "Invalid Vlan frame."); return -EINVAL; } @@ -961,22 +980,22 @@ int ovs_nla_get_match(struct sw_flow_match *match, encap_valid = true; if (tci & htons(VLAN_TAG_PRESENT)) { - err = parse_flow_nlattrs(encap, a, &key_attrs); + err = parse_flow_nlattrs(encap, a, &key_attrs, log); if (err) return err; } else if (!tci) { /* Corner case for truncated 802.1Q header. */ if (nla_len(encap)) { - OVS_NLERR("Truncated 802.1Q header has non-zero encap attribute.\n"); + OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); return -EINVAL; } } else { - OVS_NLERR("Encap attribute is set for a non-VLAN frame.\n"); + OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); return -EINVAL; } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false); + err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); if (err) return err; @@ -1010,7 +1029,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, nla_mask = newmask; } - err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs); + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log); if (err) goto free_newmask; @@ -1022,7 +1041,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, __be16 tci = 0; if (!encap_valid) { - OVS_NLERR("Encap mask attribute is set for non-VLAN frame.\n"); + OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); err = -EINVAL; goto free_newmask; } @@ -1034,12 +1053,13 @@ int ovs_nla_get_match(struct sw_flow_match *match, if (eth_type == htons(0xffff)) { mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); encap = a[OVS_KEY_ATTR_ENCAP]; - err = parse_flow_mask_nlattrs(encap, a, &mask_attrs); + err = parse_flow_mask_nlattrs(encap, a, + &mask_attrs, log); if (err) goto free_newmask; } else { - OVS_NLERR("VLAN frames must have an exact match on the TPID (mask=%x).\n", - ntohs(eth_type)); + OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", + ntohs(eth_type)); err = -EINVAL; goto free_newmask; } @@ -1048,18 +1068,19 @@ int ovs_nla_get_match(struct sw_flow_match *match, tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); if (!(tci & htons(VLAN_TAG_PRESENT))) { - OVS_NLERR("VLAN tag present bit must have an exact match (tci_mask=%x).\n", ntohs(tci)); + OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", + ntohs(tci)); err = -EINVAL; goto free_newmask; } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true); + err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); if (err) goto free_newmask; } - if (!match_validate(match, key_attrs, mask_attrs)) + if (!match_validate(match, key_attrs, mask_attrs, log)) err = -EINVAL; free_newmask: @@ -1072,6 +1093,9 @@ free_newmask: * @key: Receives extracted in_port, priority, tun_key and skb_mark. * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. * * This parses a series of Netlink attributes that form a flow key, which must * take the same form accepted by flow_from_nlattrs(), but only enough of it to @@ -1080,14 +1104,15 @@ free_newmask: */ int ovs_nla_get_flow_metadata(const struct nlattr *attr, - struct sw_flow_key *key) + struct sw_flow_key *key, + bool log) { const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; struct sw_flow_match match; u64 attrs = 0; int err; - err = parse_flow_nlattrs(attr, a, &attrs); + err = parse_flow_nlattrs(attr, a, &attrs, log); if (err) return -EINVAL; @@ -1096,7 +1121,7 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false); + return metadata_from_nlattrs(&match, &attrs, a, false, log); } int ovs_nla_put_flow(const struct sw_flow_key *swkey, @@ -1316,12 +1341,12 @@ nla_put_failure: #define MAX_ACTIONS_BUFSIZE (32 * 1024) -static struct sw_flow_actions *nla_alloc_flow_actions(int size) +static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) { struct sw_flow_actions *sfa; if (size > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR("Flow action size (%u bytes) exceeds maximum", size); + OVS_NLERR(log, "Flow action size %u bytes exceeds max", size); return ERR_PTR(-EINVAL); } @@ -1341,7 +1366,7 @@ void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, - int attr_len) + int attr_len, bool log) { struct sw_flow_actions *acts; @@ -1361,7 +1386,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = nla_alloc_flow_actions(new_acts_size); + acts = nla_alloc_flow_actions(new_acts_size, log); if (IS_ERR(acts)) return (void *)acts; @@ -1376,11 +1401,11 @@ out: } static struct nlattr *__add_action(struct sw_flow_actions **sfa, - int attrtype, void *data, int len) + int attrtype, void *data, int len, bool log) { struct nlattr *a; - a = reserve_sfa_size(sfa, nla_attr_size(len)); + a = reserve_sfa_size(sfa, nla_attr_size(len), log); if (IS_ERR(a)) return a; @@ -1395,11 +1420,11 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, } static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len) + void *data, int len, bool log) { struct nlattr *a; - a = __add_action(sfa, attrtype, data, len); + a = __add_action(sfa, attrtype, data, len, log); if (IS_ERR(a)) return PTR_ERR(a); @@ -1407,12 +1432,12 @@ static int add_action(struct sw_flow_actions **sfa, int attrtype, } static inline int add_nested_action_start(struct sw_flow_actions **sfa, - int attrtype) + int attrtype, bool log) { int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0); + err = add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1431,12 +1456,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci); + __be16 eth_type, __be16 vlan_tci, bool log); static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci) + __be16 eth_type, __be16 vlan_tci, bool log) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -1462,19 +1487,19 @@ static int validate_and_copy_sample(const struct nlattr *attr, return -EINVAL; /* validation done, copy sample action. */ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32)); + nla_data(probability), sizeof(u32), log); if (err) return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, - eth_type, vlan_tci); + eth_type, vlan_tci, log); if (err) return err; @@ -1511,7 +1536,7 @@ void ovs_match_init(struct sw_flow_match *match, } static int validate_and_copy_set_tun(const struct nlattr *attr, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, bool log) { struct sw_flow_match match; struct sw_flow_key key; @@ -1520,7 +1545,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, int err, start; ovs_match_init(&match, &key, NULL); - err = ipv4_tun_from_nlattr(nla_data(attr), &match, false); + err = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log); if (err) return err; @@ -1549,12 +1574,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, key.tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; }; - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); if (start < 0) return start; a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len); + sizeof(*tun_info) + key.tun_opts_len, log); if (IS_ERR(a)) return PTR_ERR(a); @@ -1582,7 +1607,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun, __be16 eth_type) + bool *set_tun, __be16 eth_type, bool log) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -1611,7 +1636,7 @@ static int validate_set(const struct nlattr *a, return -EINVAL; *set_tun = true; - err = validate_and_copy_set_tun(a, sfa); + err = validate_and_copy_set_tun(a, sfa, log); if (err) return err; break; @@ -1704,12 +1729,12 @@ static int validate_userspace(const struct nlattr *attr) } static int copy_action(const struct nlattr *from, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, bool log) { int totlen = NLA_ALIGN(from->nla_len); struct nlattr *to; - to = reserve_sfa_size(sfa, from->nla_len); + to = reserve_sfa_size(sfa, from->nla_len, log); if (IS_ERR(to)) return PTR_ERR(to); @@ -1720,7 +1745,7 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci) + __be16 eth_type, __be16 vlan_tci, bool log) { const struct nlattr *a; bool out_tnl_port = false; @@ -1843,7 +1868,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_SET: err = validate_set(a, key, sfa, - &out_tnl_port, eth_type); + &out_tnl_port, eth_type, log); if (err) return err; @@ -1852,18 +1877,18 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, case OVS_ACTION_ATTR_SAMPLE: err = validate_and_copy_sample(a, key, depth, sfa, - eth_type, vlan_tci); + eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; default: - OVS_NLERR("Unknown tunnel attribute (%d).\n", type); + OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; } if (!skip_copy) { - err = copy_action(a, sfa); + err = copy_action(a, sfa, log); if (err) return err; } @@ -1877,16 +1902,16 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, int ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, bool log) { int err; - *sfa = nla_alloc_flow_actions(nla_len(attr)); + *sfa = nla_alloc_flow_actions(nla_len(attr), log); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, - key->eth.tci); + key->eth.tci, log); if (err) kfree(*sfa); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 90bbe3785504..577f12be3459 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -45,17 +45,17 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_flow(const struct sw_flow_key *, const struct sw_flow_key *, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *); +int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, + bool log); -int ovs_nla_get_match(struct sw_flow_match *match, - const struct nlattr *, - const struct nlattr *); +int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, + const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, const struct ovs_tunnel_info *); int ovs_nla_copy_actions(const struct nlattr *attr, const struct sw_flow_key *key, - struct sw_flow_actions **sfa); + struct sw_flow_actions **sfa, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); -- cgit v1.2.3 From f8d7552e945d38bd8d2e9c23aebf98042ce12302 Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Fri, 7 Nov 2014 14:31:35 +0200 Subject: cfg80211: add channel switch started notification Add a new NL80211_CH_SWITCH_STARTED_NOTIFY message that can be sent to the userspace when a channel switch process has started. This allows userspace to take action, for instance, by requesting other interfaces to switch channel as necessary. This patch introduces a function that allows the drivers to send this notification. It should be used when the driver starts processing a channel switch initiated by a remote device (eg. when a STA receives a CSA from the AP) and when it successfully starts a userspace-triggered channel switch (eg. when hostapd triggers a channel swith in the AP). Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ include/uapi/linux/nl80211.h | 11 +++++++++++ net/wireless/nl80211.c | 28 +++++++++++++++++++++++++--- net/wireless/trace.h | 16 ++++++++++++++++ 4 files changed, 66 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5c3acd07acd9..220d5f5f1aca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4719,6 +4719,20 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy, void cfg80211_ch_switch_notify(struct net_device *dev, struct cfg80211_chan_def *chandef); +/* + * cfg80211_ch_switch_started_notify - notify channel switch start + * @dev: the device on which the channel switch started + * @chandef: the future channel definition + * @count: the number of TBTTs until the channel switch happens + * + * Inform the userspace about the channel switch that has just + * started, so that it can take appropriate actions (eg. starting + * channel switch on other vifs), if necessary. + */ +void cfg80211_ch_switch_started_notify(struct net_device *dev, + struct cfg80211_chan_def *chandef, + u8 count); + /** * ieee80211_operating_class_to_band - convert operating class to band * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9b3025e4377a..354163433352 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -645,6 +645,15 @@ * %NL80211_ATTR_IFINDEX is now on %NL80211_ATTR_WIPHY_FREQ and the * attributes determining channel width. * + * @NL80211_CMD_CH_SWITCH_STARTED_NOTIFY: Notify that a channel switch + * has been started on an interface, regardless of the initiator + * (ie. whether it was requested from a remote device or + * initiated on our own). It indicates that + * %NL80211_ATTR_IFINDEX will be on %NL80211_ATTR_WIPHY_FREQ + * after %NL80211_ATTR_CH_SWITCH_COUNT TBTT's. The userspace may + * decide to react to this indication by requesting other + * interfaces to change channel as well. + * * @NL80211_CMD_START_P2P_DEVICE: Start the given P2P Device, identified by * its %NL80211_ATTR_WDEV identifier. It must have been created with * %NL80211_CMD_NEW_INTERFACE previously. After it has been started, the @@ -930,6 +939,8 @@ enum nl80211_commands { NL80211_CMD_JOIN_OCB, NL80211_CMD_LEAVE_OCB, + NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 24549cbe0b54..24fd2925b281 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11653,7 +11653,9 @@ EXPORT_SYMBOL(cfg80211_pmksa_candidate_notify); static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_chan_def *chandef, - gfp_t gfp) + gfp_t gfp, + enum nl80211_commands notif, + u8 count) { struct sk_buff *msg; void *hdr; @@ -11662,7 +11664,7 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CH_SWITCH_NOTIFY); + hdr = nl80211hdr_put(msg, 0, 0, 0, notif); if (!hdr) { nlmsg_free(msg); return; @@ -11674,6 +11676,10 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev, if (nl80211_send_chandef(msg, chandef)) goto nla_put_failure; + if ((notif == NL80211_CMD_CH_SWITCH_STARTED_NOTIFY) && + (nla_put_u32(msg, NL80211_ATTR_CH_SWITCH_COUNT, count))) + goto nla_put_failure; + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, @@ -11704,10 +11710,26 @@ void cfg80211_ch_switch_notify(struct net_device *dev, wdev->chandef = *chandef; wdev->preset_chandef = *chandef; - nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL); + nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, + NL80211_CMD_CH_SWITCH_NOTIFY, 0); } EXPORT_SYMBOL(cfg80211_ch_switch_notify); +void cfg80211_ch_switch_started_notify(struct net_device *dev, + struct cfg80211_chan_def *chandef, + u8 count) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + + trace_cfg80211_ch_switch_started_notify(dev, chandef); + + nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, + NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, count); +} +EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); + void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer, u32 num_packets, u32 rate, u32 intvl, gfp_t gfp) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 277a85df910e..6e25370d3ce7 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2355,6 +2355,22 @@ TRACE_EVENT(cfg80211_ch_switch_notify, NETDEV_PR_ARG, CHAN_DEF_PR_ARG) ); +TRACE_EVENT(cfg80211_ch_switch_started_notify, + TP_PROTO(struct net_device *netdev, + struct cfg80211_chan_def *chandef), + TP_ARGS(netdev, chandef), + TP_STRUCT__entry( + NETDEV_ENTRY + CHAN_DEF_ENTRY + ), + TP_fast_assign( + NETDEV_ASSIGN; + CHAN_DEF_ASSIGN(chandef); + ), + TP_printk(NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT, + NETDEV_PR_ARG, CHAN_DEF_PR_ARG) +); + TRACE_EVENT(cfg80211_radar_event, TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), TP_ARGS(wiphy, chandef), -- cgit v1.2.3 From d04b5ac9e70b2056a8a12f768f4b46773576025e Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Fri, 7 Nov 2014 14:31:37 +0200 Subject: cfg80211/mac80211: allow any interface to send channel switch notifications For multi-vif channel switches, we want to send NL80211_CMD_CH_SWITCH_NOTIFY to the userspace to let it decide whether other interfaces need to be moved as well. This is needed when we want a P2P GO interface to follow the channel of a station, for example. Modify the code so that all interfaces can send CSA notifications. Additionally, send notifications for STA CSA as well. Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 +++- net/mac80211/mlme.c | 2 ++ net/wireless/nl80211.c | 6 ------ 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 354163433352..a552736c3e59 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -643,7 +643,9 @@ * @NL80211_CMD_CH_SWITCH_NOTIFY: An AP or GO may decide to switch channels * independently of the userspace SME, send this event indicating * %NL80211_ATTR_IFINDEX is now on %NL80211_ATTR_WIPHY_FREQ and the - * attributes determining channel width. + * attributes determining channel width. This indication may also be + * sent when a remotely-initiated switch (e.g., when a STA receives a CSA + * from the remote AP) is completed; * * @NL80211_CMD_CH_SWITCH_STARTED_NOTIFY: Notify that a channel switch * has been started on an interface, regardless of the initiator diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 947250077615..243539878991 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1049,6 +1049,8 @@ static void ieee80211_chswitch_post_beacon(struct ieee80211_sub_if_data *sdata) sdata->csa_block_tx = false; } + cfg80211_ch_switch_notify(sdata->dev, &sdata->reserved_chandef); + sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 24fd2925b281..d0a8361b3395 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11702,12 +11702,6 @@ void cfg80211_ch_switch_notify(struct net_device *dev, trace_cfg80211_ch_switch_notify(dev, chandef); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && - wdev->iftype != NL80211_IFTYPE_P2P_GO && - wdev->iftype != NL80211_IFTYPE_ADHOC && - wdev->iftype != NL80211_IFTYPE_MESH_POINT)) - return; - wdev->chandef = *chandef; wdev->preset_chandef = *chandef; nl80211_ch_switch_notify(rdev, dev, chandef, GFP_KERNEL, -- cgit v1.2.3 From a6d4a534e15f0e1b13b518c31219f9fb7166412a Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Thu, 23 Oct 2014 09:37:33 +0300 Subject: cfg80211: introduce regulatory flags controlling bw Allow setting bandwidth related regulatory flags. These flags are mapped to the corresponding channel flags in the specified range. Make sure the new flags are consulted when calculating the maximum bandwidth allowed by a regulatory-rule. Also allow propagating the GO_CONCURRENT modifier from a reg-rule to a channel. Signed-off-by: Arik Nemtsov Reviewed-by: Luis R. Rodriguez Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 12 ++++++++++++ net/wireless/reg.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a552736c3e59..442369f69b4f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2665,6 +2665,11 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated * base on contiguous rules and wider channels will be allowed to cross * multiple contiguous/overlapping frequency ranges. + * @NL80211_RRF_GO_CONCURRENT: See &NL80211_FREQUENCY_ATTR_GO_CONCURRENT + * @NL80211_RRF_NO_HT40MINUS: channels can't be used in HT40- operation + * @NL80211_RRF_NO_HT40PLUS: channels can't be used in HT40+ operation + * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed + * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -2677,11 +2682,18 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_IR = 1<<7, __NL80211_RRF_NO_IBSS = 1<<8, NL80211_RRF_AUTO_BW = 1<<11, + NL80211_RRF_GO_CONCURRENT = 1<<12, + NL80211_RRF_NO_HT40MINUS = 1<<13, + NL80211_RRF_NO_HT40PLUS = 1<<14, + NL80211_RRF_NO_80MHZ = 1<<15, + NL80211_RRF_NO_160MHZ = 1<<16, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR #define NL80211_RRF_NO_IBSS NL80211_RRF_NO_IR #define NL80211_RRF_NO_IR NL80211_RRF_NO_IR +#define NL80211_RRF_NO_HT40 (NL80211_RRF_NO_HT40MINUS |\ + NL80211_RRF_NO_HT40PLUS) /* For backport compatibility with older userspace */ #define NL80211_RRF_NO_IR_ALL (NL80211_RRF_NO_IR | __NL80211_RRF_NO_IBSS) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b725a31a4751..7449a8c0f9fd 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -573,8 +573,9 @@ static const struct ieee80211_regdomain *reg_get_regdomain(struct wiphy *wiphy) return get_cfg80211_regdom(); } -unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, - const struct ieee80211_reg_rule *rule) +static unsigned int +reg_get_max_bandwidth_from_range(const struct ieee80211_regdomain *rd, + const struct ieee80211_reg_rule *rule) { const struct ieee80211_freq_range *freq_range = &rule->freq_range; const struct ieee80211_freq_range *freq_range_tmp; @@ -622,6 +623,27 @@ unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, return end_freq - start_freq; } +unsigned int reg_get_max_bandwidth(const struct ieee80211_regdomain *rd, + const struct ieee80211_reg_rule *rule) +{ + unsigned int bw = reg_get_max_bandwidth_from_range(rd, rule); + + if (rule->flags & NL80211_RRF_NO_160MHZ) + bw = min_t(unsigned int, bw, MHZ_TO_KHZ(80)); + if (rule->flags & NL80211_RRF_NO_80MHZ) + bw = min_t(unsigned int, bw, MHZ_TO_KHZ(40)); + + /* + * HT40+/HT40- limits are handled per-channel. Only limit BW if both + * are not allowed. + */ + if (rule->flags & NL80211_RRF_NO_HT40MINUS && + rule->flags & NL80211_RRF_NO_HT40PLUS) + bw = min_t(unsigned int, bw, MHZ_TO_KHZ(20)); + + return bw; +} + /* Sanity check on a regulatory rule */ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule) { @@ -946,6 +968,16 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_NO_OFDM; if (rd_flags & NL80211_RRF_NO_OUTDOOR) channel_flags |= IEEE80211_CHAN_INDOOR_ONLY; + if (rd_flags & NL80211_RRF_GO_CONCURRENT) + channel_flags |= IEEE80211_CHAN_GO_CONCURRENT; + if (rd_flags & NL80211_RRF_NO_HT40MINUS) + channel_flags |= IEEE80211_CHAN_NO_HT40MINUS; + if (rd_flags & NL80211_RRF_NO_HT40PLUS) + channel_flags |= IEEE80211_CHAN_NO_HT40PLUS; + if (rd_flags & NL80211_RRF_NO_80MHZ) + channel_flags |= IEEE80211_CHAN_NO_80MHZ; + if (rd_flags & NL80211_RRF_NO_160MHZ) + channel_flags |= IEEE80211_CHAN_NO_160MHZ; return channel_flags; } -- cgit v1.2.3 From be955b2984822e2a66176bccb3e0dbaf9cd569b6 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Thu, 6 Nov 2014 08:10:14 -0800 Subject: rtnetlink: add babel protocol recognition Babel uses rt_proto 42. Add to userspace visible header file. Signed-off-by: Dave Taht Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index eb0f1a554d7b..9c9b8b4480cd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -235,6 +235,7 @@ enum { #define RTPROT_NTK 15 /* Netsukuku */ #define RTPROT_DHCP 16 /* DHCP client */ #define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ /* rtm_scope -- cgit v1.2.3 From 5a10b7dbf904bfe01bb9fcc6298f7df09eed77d5 Mon Sep 17 00:00:00 2001 From: Raushaniya Maksudova Date: Mon, 10 Nov 2014 09:36:29 +1030 Subject: virtio_balloon: free some memory from balloon on OOM Excessive virtio_balloon inflation can cause invocation of OOM-killer, when Linux is under severe memory pressure. Various mechanisms are responsible for correct virtio_balloon memory management. Nevertheless it is often the case that these control tools does not have enough time to react on fast changing memory load. As a result OS runs out of memory and invokes OOM-killer. The balancing of memory by use of the virtio balloon should not cause the termination of processes while there are pages in the balloon. Now there is no way for virtio balloon driver to free some memory at the last moment before some process will be get killed by OOM-killer. This does not provide a security breach as balloon itself is running inside guest OS and is working in the cooperation with the host. Thus some improvements from guest side should be considered as normal. To solve the problem, introduce a virtio_balloon callback which is expected to be called from the oom notifier call chain in out_of_memory() function. If virtio balloon could release some memory, it will make the system to return and retry the allocation that forced the out of memory killer to run. Allocate virtio feature bit for this: it is not set by default, the the guest will not deflate virtio balloon on OOM without explicit permission from host. Signed-off-by: Raushaniya Maksudova Signed-off-by: Denis V. Lunev Acked-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 52 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_balloon.h | 1 + 2 files changed, 53 insertions(+) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index ff71fdcf3345..50c5f42d7a9f 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * Balloon device works in 4K page units. So each page is pointed to by @@ -36,6 +37,12 @@ */ #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 +#define OOM_VBALLOON_DEFAULT_PAGES 256 +#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80 + +static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; +module_param(oom_pages, int, S_IRUSR | S_IWUSR); +MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); struct virtio_balloon { @@ -71,6 +78,9 @@ struct virtio_balloon /* Memory statistics */ int need_stats_update; struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; + + /* To register callback in oom notifier call chain */ + struct notifier_block nb; }; static struct virtio_device_id id_table[] = { @@ -290,6 +300,38 @@ static void update_balloon_size(struct virtio_balloon *vb) &actual); } +/* + * virtballoon_oom_notify - release pages when system is under severe + * memory pressure (called from out_of_memory()) + * @self : notifier block struct + * @dummy: not used + * @parm : returned - number of freed pages + * + * The balancing of memory by use of the virtio balloon should not cause + * the termination of processes while there are pages in the balloon. + * If virtio balloon manages to release some memory, it will make the + * system return and retry the allocation that forced the OOM killer + * to run. + */ +static int virtballoon_oom_notify(struct notifier_block *self, + unsigned long dummy, void *parm) +{ + struct virtio_balloon *vb; + unsigned long *freed; + unsigned num_freed_pages; + + vb = container_of(self, struct virtio_balloon, nb); + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) + return NOTIFY_OK; + + freed = parm; + num_freed_pages = leak_balloon(vb, oom_pages); + update_balloon_size(vb); + *freed += num_freed_pages; + + return NOTIFY_OK; +} + static int balloon(void *_vballoon) { struct virtio_balloon *vb = _vballoon; @@ -446,6 +488,12 @@ static int virtballoon_probe(struct virtio_device *vdev) if (err) goto out_free_vb; + vb->nb.notifier_call = virtballoon_oom_notify; + vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; + err = register_oom_notifier(&vb->nb); + if (err < 0) + goto out_oom_notify; + vb->thread = kthread_run(balloon, vb, "vballoon"); if (IS_ERR(vb->thread)) { err = PTR_ERR(vb->thread); @@ -455,6 +503,8 @@ static int virtballoon_probe(struct virtio_device *vdev) return 0; out_del_vqs: + unregister_oom_notifier(&vb->nb); +out_oom_notify: vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); @@ -479,6 +529,7 @@ static void virtballoon_remove(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; + unregister_oom_notifier(&vb->nb); kthread_stop(vb->thread); remove_common(vb); kfree(vb); @@ -518,6 +569,7 @@ static int virtballoon_restore(struct virtio_device *vdev) static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, + VIRTIO_BALLOON_F_DEFLATE_ON_OOM, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 5e26f61b5df5..be40f7059e93 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -31,6 +31,7 @@ /* The feature bitmap for virtio balloon */ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ +#define VIRTIO_BALLOON_F_DEFLATE_ON_OOM 2 /* Deflate balloon on OOM */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 -- cgit v1.2.3 From ddcecf6b6ae7b91c8735e52f50cd403ee9cbe298 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 10 Nov 2014 17:24:26 +0100 Subject: ALSA: Fix invalid kerneldoc markers They are no real kerneldoc comments, so drop such markers. Signed-off-by: Takashi Iwai --- include/uapi/sound/hdspm.h | 12 +++++------ sound/pci/emu10k1/emu10k1x.c | 2 +- sound/pci/lx6464es/lx_defs.h | 2 +- sound/pci/rme9652/hdspm.c | 48 ++++++++++++++++++++++---------------------- 4 files changed, 32 insertions(+), 32 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/sound/hdspm.h b/include/uapi/sound/hdspm.h index d956c3593f65..b357f1a5e29c 100644 --- a/include/uapi/sound/hdspm.h +++ b/include/uapi/sound/hdspm.h @@ -74,14 +74,14 @@ struct hdspm_config { #define SNDRV_HDSPM_IOCTL_GET_CONFIG \ _IOR('H', 0x41, struct hdspm_config) -/** +/* * If there's a TCO (TimeCode Option) board installed, * there are further options and status data available. * The hdspm_ltc structure contains the current SMPTE * timecode and some status information and can be * obtained via SNDRV_HDSPM_IOCTL_GET_LTC or in the * hdspm_status struct. - **/ + */ enum hdspm_ltc_format { format_invalid, @@ -113,11 +113,11 @@ struct hdspm_ltc { #define SNDRV_HDSPM_IOCTL_GET_LTC _IOR('H', 0x46, struct hdspm_ltc) -/** +/* * The status data reflects the device's current state * as determined by the card's configuration and * connection status. - **/ + */ enum hdspm_sync { hdspm_sync_no_lock = 0, @@ -171,9 +171,9 @@ struct hdspm_status { #define SNDRV_HDSPM_IOCTL_GET_STATUS \ _IOR('H', 0x47, struct hdspm_status) -/** +/* * Get information about the card and its add-ons. - **/ + */ #define HDSPM_ADDON_TCO 1 diff --git a/sound/pci/emu10k1/emu10k1x.c b/sound/pci/emu10k1/emu10k1x.c index e223de1408c0..15933f92f63a 100644 --- a/sound/pci/emu10k1/emu10k1x.c +++ b/sound/pci/emu10k1/emu10k1x.c @@ -180,7 +180,7 @@ MODULE_PARM_DESC(enable, "Enable the EMU10K1X soundcard."); /* From 0x50 - 0x5f, last samples captured */ -/** +/* * The hardware has 3 channels for playback and 1 for capture. * - channel 0 is the front channel * - channel 1 is the rear channel diff --git a/sound/pci/lx6464es/lx_defs.h b/sound/pci/lx6464es/lx_defs.h index 49d36bdd512c..469bcc685edf 100644 --- a/sound/pci/lx6464es/lx_defs.h +++ b/sound/pci/lx6464es/lx_defs.h @@ -175,7 +175,7 @@ enum buffer_flags { BF_ZERO = 0x00, /* no flags (init).*/ }; -/** +/* * Stream Flags definitions */ enum stream_flags { diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c index e09348c156d8..3342705a5715 100644 --- a/sound/pci/rme9652/hdspm.c +++ b/sound/pci/rme9652/hdspm.c @@ -2201,10 +2201,10 @@ static inline int hdspm_get_pll_freq(struct hdspm *hdspm) return rate; } -/** +/* * Calculate the real sample rate from the * current DDS value. - **/ + */ static int hdspm_get_system_sample_rate(struct hdspm *hdspm) { unsigned int rate; @@ -2270,9 +2270,9 @@ static int snd_hdspm_put_system_sample_rate(struct snd_kcontrol *kcontrol, } -/** +/* * Returns the WordClock sample rate class for the given card. - **/ + */ static int hdspm_get_wc_sample_rate(struct hdspm *hdspm) { int status; @@ -2295,9 +2295,9 @@ static int hdspm_get_wc_sample_rate(struct hdspm *hdspm) } -/** +/* * Returns the TCO sample rate class for the given card. - **/ + */ static int hdspm_get_tco_sample_rate(struct hdspm *hdspm) { int status; @@ -2321,9 +2321,9 @@ static int hdspm_get_tco_sample_rate(struct hdspm *hdspm) } -/** +/* * Returns the SYNC_IN sample rate class for the given card. - **/ + */ static int hdspm_get_sync_in_sample_rate(struct hdspm *hdspm) { int status; @@ -2343,9 +2343,9 @@ static int hdspm_get_sync_in_sample_rate(struct hdspm *hdspm) return 0; } -/** +/* * Returns the AES sample rate class for the given card. - **/ + */ static int hdspm_get_aes_sample_rate(struct hdspm *hdspm, int index) { int timecode; @@ -2361,10 +2361,10 @@ static int hdspm_get_aes_sample_rate(struct hdspm *hdspm, int index) return 0; } -/** +/* * Returns the sample rate class for input source for * 'new style' cards like the AIO and RayDAT. - **/ + */ static int hdspm_get_s1_sample_rate(struct hdspm *hdspm, unsigned int idx) { int status = hdspm_read(hdspm, HDSPM_RD_STATUS_2); @@ -2512,10 +2512,10 @@ static int snd_hdspm_get_autosync_sample_rate(struct snd_kcontrol *kcontrol, } -/** +/* * Returns the system clock mode for the given card. * @returns 0 - master, 1 - slave - **/ + */ static int hdspm_system_clock_mode(struct hdspm *hdspm) { switch (hdspm->io_type) { @@ -2534,10 +2534,10 @@ static int hdspm_system_clock_mode(struct hdspm *hdspm) } -/** +/* * Sets the system clock mode. * @param mode 0 - master, 1 - slave - **/ + */ static void hdspm_set_system_clock_mode(struct hdspm *hdspm, int mode) { hdspm_set_toggle_setting(hdspm, @@ -2692,11 +2692,11 @@ static int snd_hdspm_put_clock_source(struct snd_kcontrol *kcontrol, } -/** +/* * Returns the current preferred sync reference setting. * The semantics of the return value are depending on the * card, please see the comments for clarification. - **/ + */ static int hdspm_pref_sync_ref(struct hdspm * hdspm) { switch (hdspm->io_type) { @@ -2795,11 +2795,11 @@ static int hdspm_pref_sync_ref(struct hdspm * hdspm) } -/** +/* * Set the preferred sync reference to . The semantics * of are depending on the card type, see the comments * for clarification. - **/ + */ static int hdspm_set_pref_sync_ref(struct hdspm * hdspm, int pref) { int p = 0; @@ -4101,9 +4101,9 @@ static int snd_hdspm_get_sync_check(struct snd_kcontrol *kcontrol, -/** +/* * TCO controls - **/ + */ static void hdspm_tco_write(struct hdspm *hdspm) { unsigned int tc[4] = { 0, 0, 0, 0}; @@ -5403,7 +5403,7 @@ static irqreturn_t snd_hdspm_interrupt(int irq, void *dev_id) HDSPM_midi2IRQPending | HDSPM_midi3IRQPending); /* now = get_cycles(); */ - /** + /* * LAT_2..LAT_0 period counter (win) counter (mac) * 6 4096 ~256053425 ~514672358 * 5 2048 ~128024983 ~257373821 @@ -5412,7 +5412,7 @@ static irqreturn_t snd_hdspm_interrupt(int irq, void *dev_id) * 2 256 ~16003039 ~32260176 * 1 128 ~7998738 ~16194507 * 0 64 ~3998231 ~8191558 - **/ + */ /* dev_info(hdspm->card->dev, "snd_hdspm_interrupt %llu @ %llx\n", now-hdspm->last_interrupt, status & 0xFFC0); -- cgit v1.2.3 From 2c8c56e15df3d4c2af3d656e44feb18789f75837 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Nov 2014 05:54:28 -0800 Subject: net: introduce SO_INCOMING_CPU Alternative to RPS/RFS is to use hardware support for multiple queues. Then split a set of million of sockets into worker threads, each one using epoll() to manage events on its own socket pool. Ideally, we want one thread per RX/TX queue/cpu, but we have no way to know after accept() or connect() on which queue/cpu a socket is managed. We normally use one cpu per RX queue (IRQ smp_affinity being properly set), so remembering on socket structure which cpu delivered last packet is enough to solve the problem. After accept(), connect(), or even file descriptor passing around processes, applications can use : int cpu; socklen_t len = sizeof(cpu); getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len); And use this information to put the socket into the right silo for optimal performance, as all networking stack should run on the appropriate cpu, without need to send IPI (RPS/RFS). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 2 ++ arch/avr32/include/uapi/asm/socket.h | 2 ++ arch/cris/include/uapi/asm/socket.h | 2 ++ arch/frv/include/uapi/asm/socket.h | 2 ++ arch/ia64/include/uapi/asm/socket.h | 2 ++ arch/m32r/include/uapi/asm/socket.h | 2 ++ arch/mips/include/uapi/asm/socket.h | 2 ++ arch/mn10300/include/uapi/asm/socket.h | 2 ++ arch/parisc/include/uapi/asm/socket.h | 2 ++ arch/powerpc/include/uapi/asm/socket.h | 2 ++ arch/s390/include/uapi/asm/socket.h | 2 ++ arch/sparc/include/uapi/asm/socket.h | 2 ++ arch/xtensa/include/uapi/asm/socket.h | 2 ++ include/net/sock.h | 12 ++++++++++++ include/uapi/asm-generic/socket.h | 2 ++ net/core/sock.c | 5 +++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/ipv6/udp.c | 1 + net/sctp/ulpqueue.c | 5 +++-- 21 files changed, 52 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 3de1394bcab8..e2fe0700b3b4 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -87,4 +87,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 6e6cd159924b..92121b0f5b98 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index ed94e5ed0a23..60f60f5b9b35 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -82,6 +82,8 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index ca2c6e6f31c6..2c6890209ea6 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -80,5 +80,7 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index a1b49bac7951..09a93fb566f6 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -89,4 +89,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 6c9a24b3aefa..e8589819c274 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index a14baa218c76..2e9ee8c55a10 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -98,4 +98,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 6aa3ce1854aa..f3492e8c9f70 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -80,4 +80,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index fe35ceacf0e7..7984a1cab3da 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -79,4 +79,6 @@ #define SO_BPF_EXTENSIONS 0x4029 +#define SO_INCOMING_CPU 0x402A + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index a9c3e2e18c05..3474e4ef166d 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -87,4 +87,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index e031332096d7..8457636c33e1 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -86,4 +86,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 54d9608681b6..4a8003a94163 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -76,6 +76,8 @@ #define SO_BPF_EXTENSIONS 0x0032 +#define SO_INCOMING_CPU 0x0033 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 39acec0cf0b1..c46f6a696849 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -91,4 +91,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 6767d75ecb17..7789b59c0c40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -273,6 +273,7 @@ struct cg_proto; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_rxhash: flow hash received from netif layer + * @sk_incoming_cpu: record cpu processing incoming packets * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_protinfo: private area, net family specific, when not using slab @@ -350,6 +351,12 @@ struct sock { #ifdef CONFIG_RPS __u32 sk_rxhash; #endif + u16 sk_incoming_cpu; + /* 16bit hole + * Warned : sk_incoming_cpu can be set from softirq, + * Do not use this hole without fully understanding possible issues. + */ + __u32 sk_txhash; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_napi_id; @@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return sk->sk_backlog_rcv(sk, skb); } +static inline void sk_incoming_cpu_update(struct sock *sk) +{ + sk->sk_incoming_cpu = raw_smp_processor_id(); +} + static inline void sock_rps_record_flow_hash(__u32 hash) { #ifdef CONFIG_RPS diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index ea0796bdcf88..f541ccefd4ac 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -82,4 +82,6 @@ #define SO_BPF_EXTENSIONS 48 +#define SO_INCOMING_CPU 49 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ac56dd06c306..0725cf0cb685 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_max_pacing_rate; break; + case SO_INCOMING_CPU: + v.val = sk->sk_incoming_cpu; + break; + default: return -ENOPROTOOPT; } @@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + newsk->sk_incoming_cpu = raw_smp_processor_id(); /* * Before updating sk_refcnt, we must commit prior changes to memory * (Documentation/RCU/rculist_nulls.txt for details) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8893598a4124..2c6a955fd5c3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1663,6 +1663,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_incoming_cpu_update(sk); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5d0fdca8e965..d13751685f44 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); } rc = sock_queue_rcv_skb(sk, skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fd8e50b380e7..1985b4933a6b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1456,6 +1456,7 @@ process: if (sk_filter(sk, skb)) goto discard_and_relse; + sk_incoming_cpu_update(sk); skb->dev = NULL; bh_lock_sock_nested(sk); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b756355e9739..d1fe36274906 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (!ipv6_addr_any(&sk->sk_v6_daddr)) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); + sk_incoming_cpu_update(sk); } rc = sock_queue_rcv_skb(sk, skb); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index d49dc2ed30ad..ce469d648ffb 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) goto out_free; - if (!sctp_ulpevent_is_notification(event)) + if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); - + sk_incoming_cpu_update(sk); + } /* Check if the user wishes to receive this event. */ if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) goto out_free; -- cgit v1.2.3 From 6a2c4232ece145d8b5a8f95f767bd6d0d2d2f2bb Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 4 Nov 2014 04:51:40 -0800 Subject: drm/i915: Make the physical object coherent with GTT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently objects for which the hardware needs a contiguous physical address are allocated a shadow backing storage to satisfy the contraint. This shadow buffer is not wired into the normal obj->pages and so the physical object is incoherent with accesses via the GPU, GTT and CPU. By setting up the appropriate scatter-gather table, we can allow userspace to access the physical object via either a GTT mmaping of or by rendering into the GEM bo. However, keeping the CPU mmap of the shmemfs backing storage coherent with the contiguous shadow is not yet possible. Fortuituously, CPU mmaps of objects requiring physical addresses are not expected to be coherent anyway. This allows the physical constraint of the GEM object to be transparent to userspace and allow it to efficiently render into or update them via the GTT and GPU. v2: Fix leak of pci handle spotted by Ville v3: Remove the now duplicate call to detach_phys_object during free. v4: Wait for rendering before pwrite. As this patch makes it possible to render into the phys object, we should make it correct as well! Signed-off-by: Chris Wilson Cc: Ville Syrjälä Reviewed-by: Ville Syrjälä Signed-off-by: Rodrigo Vivi Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/i915_dma.c | 3 + drivers/gpu/drm/i915/i915_drv.h | 6 +- drivers/gpu/drm/i915/i915_gem.c | 207 +++++++++++++++++++++++++++------------- include/uapi/drm/i915_drm.h | 1 + 4 files changed, 150 insertions(+), 67 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index 9a7353302b3f..5dc37f0233b2 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1027,6 +1027,9 @@ static int i915_getparam(struct drm_device *dev, void *data, case I915_PARAM_CMD_PARSER_VERSION: value = i915_cmd_parser_get_version(); break; + case I915_PARAM_HAS_COHERENT_PHYS_GTT: + value = 1; + break; default: DRM_DEBUG("Unknown parameter %d\n", param->param); return -EINVAL; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f830596faa9e..3df9ef32d011 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1957,10 +1957,10 @@ struct drm_i915_gem_object { unsigned long user_pin_count; struct drm_file *pin_filp; - /** for phy allocated objects */ - struct drm_dma_handle *phys_handle; - union { + /** for phy allocated objects */ + struct drm_dma_handle *phys_handle; + struct i915_gem_userptr { uintptr_t ptr; unsigned read_only :1; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3e0cabe9b544..86cf428b6c4e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -208,40 +208,137 @@ i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data, return 0; } -static void i915_gem_object_detach_phys(struct drm_i915_gem_object *obj) +static int +i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj) { - drm_dma_handle_t *phys = obj->phys_handle; + struct address_space *mapping = file_inode(obj->base.filp)->i_mapping; + char *vaddr = obj->phys_handle->vaddr; + struct sg_table *st; + struct scatterlist *sg; + int i; - if (!phys) - return; + if (WARN_ON(i915_gem_object_needs_bit17_swizzle(obj))) + return -EINVAL; + + for (i = 0; i < obj->base.size / PAGE_SIZE; i++) { + struct page *page; + char *src; + + page = shmem_read_mapping_page(mapping, i); + if (IS_ERR(page)) + return PTR_ERR(page); + + src = kmap_atomic(page); + memcpy(vaddr, src, PAGE_SIZE); + drm_clflush_virt_range(vaddr, PAGE_SIZE); + kunmap_atomic(src); + + page_cache_release(page); + vaddr += PAGE_SIZE; + } + + i915_gem_chipset_flush(obj->base.dev); + + st = kmalloc(sizeof(*st), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + + if (sg_alloc_table(st, 1, GFP_KERNEL)) { + kfree(st); + return -ENOMEM; + } + + sg = st->sgl; + sg->offset = 0; + sg->length = obj->base.size; - if (obj->madv == I915_MADV_WILLNEED) { + sg_dma_address(sg) = obj->phys_handle->busaddr; + sg_dma_len(sg) = obj->base.size; + + obj->pages = st; + obj->has_dma_mapping = true; + return 0; +} + +static void +i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj) +{ + int ret; + + BUG_ON(obj->madv == __I915_MADV_PURGED); + + ret = i915_gem_object_set_to_cpu_domain(obj, true); + if (ret) { + /* In the event of a disaster, abandon all caches and + * hope for the best. + */ + WARN_ON(ret != -EIO); + obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU; + } + + if (obj->madv == I915_MADV_DONTNEED) + obj->dirty = 0; + + if (obj->dirty) { struct address_space *mapping = file_inode(obj->base.filp)->i_mapping; - char *vaddr = phys->vaddr; + char *vaddr = obj->phys_handle->vaddr; int i; for (i = 0; i < obj->base.size / PAGE_SIZE; i++) { - struct page *page = shmem_read_mapping_page(mapping, i); - if (!IS_ERR(page)) { - char *dst = kmap_atomic(page); - memcpy(dst, vaddr, PAGE_SIZE); - drm_clflush_virt_range(dst, PAGE_SIZE); - kunmap_atomic(dst); - - set_page_dirty(page); + struct page *page; + char *dst; + + page = shmem_read_mapping_page(mapping, i); + if (IS_ERR(page)) + continue; + + dst = kmap_atomic(page); + drm_clflush_virt_range(vaddr, PAGE_SIZE); + memcpy(dst, vaddr, PAGE_SIZE); + kunmap_atomic(dst); + + set_page_dirty(page); + if (obj->madv == I915_MADV_WILLNEED) mark_page_accessed(page); - page_cache_release(page); - } + page_cache_release(page); vaddr += PAGE_SIZE; } - i915_gem_chipset_flush(obj->base.dev); + obj->dirty = 0; } -#ifdef CONFIG_X86 - set_memory_wb((unsigned long)phys->vaddr, phys->size / PAGE_SIZE); -#endif - drm_pci_free(obj->base.dev, phys); - obj->phys_handle = NULL; + sg_free_table(obj->pages); + kfree(obj->pages); + + obj->has_dma_mapping = false; +} + +static void +i915_gem_object_release_phys(struct drm_i915_gem_object *obj) +{ + drm_pci_free(obj->base.dev, obj->phys_handle); +} + +static const struct drm_i915_gem_object_ops i915_gem_phys_ops = { + .get_pages = i915_gem_object_get_pages_phys, + .put_pages = i915_gem_object_put_pages_phys, + .release = i915_gem_object_release_phys, +}; + +static int +drop_pages(struct drm_i915_gem_object *obj) +{ + struct i915_vma *vma, *next; + int ret; + + drm_gem_object_reference(&obj->base); + list_for_each_entry_safe(vma, next, &obj->vma_list, vma_link) + if (i915_vma_unbind(vma)) + break; + + ret = i915_gem_object_put_pages(obj); + drm_gem_object_unreference(&obj->base); + + return ret; } int @@ -249,9 +346,7 @@ i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, int align) { drm_dma_handle_t *phys; - struct address_space *mapping; - char *vaddr; - int i; + int ret; if (obj->phys_handle) { if ((unsigned long)obj->phys_handle->vaddr & (align -1)) @@ -266,41 +361,19 @@ i915_gem_object_attach_phys(struct drm_i915_gem_object *obj, if (obj->base.filp == NULL) return -EINVAL; + ret = drop_pages(obj); + if (ret) + return ret; + /* create a new object */ phys = drm_pci_alloc(obj->base.dev, obj->base.size, align); if (!phys) return -ENOMEM; - vaddr = phys->vaddr; -#ifdef CONFIG_X86 - set_memory_wc((unsigned long)vaddr, phys->size / PAGE_SIZE); -#endif - mapping = file_inode(obj->base.filp)->i_mapping; - for (i = 0; i < obj->base.size / PAGE_SIZE; i++) { - struct page *page; - char *src; - - page = shmem_read_mapping_page(mapping, i); - if (IS_ERR(page)) { -#ifdef CONFIG_X86 - set_memory_wb((unsigned long)phys->vaddr, phys->size / PAGE_SIZE); -#endif - drm_pci_free(obj->base.dev, phys); - return PTR_ERR(page); - } - - src = kmap_atomic(page); - memcpy(vaddr, src, PAGE_SIZE); - kunmap_atomic(src); - - mark_page_accessed(page); - page_cache_release(page); - - vaddr += PAGE_SIZE; - } - obj->phys_handle = phys; - return 0; + obj->ops = &i915_gem_phys_ops; + + return i915_gem_object_get_pages(obj); } static int @@ -311,6 +384,14 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, struct drm_device *dev = obj->base.dev; void *vaddr = obj->phys_handle->vaddr + args->offset; char __user *user_data = to_user_ptr(args->data_ptr); + int ret; + + /* We manually control the domain here and pretend that it + * remains coherent i.e. in the GTT domain, like shmem_pwrite. + */ + ret = i915_gem_object_wait_rendering(obj, false); + if (ret) + return ret; if (__copy_from_user_inatomic_nocache(vaddr, user_data, args->size)) { unsigned long unwritten; @@ -326,6 +407,7 @@ i915_gem_phys_pwrite(struct drm_i915_gem_object *obj, return -EFAULT; } + drm_clflush_virt_range(vaddr, args->size); i915_gem_chipset_flush(dev); return 0; } @@ -1046,11 +1128,6 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, * pread/pwrite currently are reading and writing from the CPU * perspective, requiring manual detiling by the client. */ - if (obj->phys_handle) { - ret = i915_gem_phys_pwrite(obj, args, file); - goto out; - } - if (obj->tiling_mode == I915_TILING_NONE && obj->base.write_domain != I915_GEM_DOMAIN_CPU && cpu_write_needs_clflush(obj)) { @@ -1060,8 +1137,12 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, * textures). Fallback to the shmem path in that case. */ } - if (ret == -EFAULT || ret == -ENOSPC) - ret = i915_gem_shmem_pwrite(dev, obj, args, file); + if (ret == -EFAULT || ret == -ENOSPC) { + if (obj->phys_handle) + ret = i915_gem_phys_pwrite(obj, args, file); + else + ret = i915_gem_shmem_pwrite(dev, obj, args, file); + } out: drm_gem_object_unreference(&obj->base); @@ -3509,7 +3590,7 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj, * Stolen memory is always coherent with the GPU as it is explicitly * marked as wc by the system, or the system is cache-coherent. */ - if (obj->stolen) + if (obj->stolen || obj->phys_handle) return false; /* If the GPU is snooping the contents of the CPU cache, @@ -4471,8 +4552,6 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj) } } - i915_gem_object_detach_phys(obj); - /* Stolen objects don't hold a ref, but do hold pin count. Fix that up * before progressing. */ if (obj->stolen) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 2ec0efcaa719..250262265ee3 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -340,6 +340,7 @@ typedef struct drm_i915_irq_wait { #define I915_PARAM_HAS_EXEC_HANDLE_LUT 26 #define I915_PARAM_HAS_WT 27 #define I915_PARAM_CMD_PARSER_VERSION 28 +#define I915_PARAM_HAS_COHERENT_PHYS_GTT 29 typedef struct drm_i915_getparam { int param; -- cgit v1.2.3 From edcf58bc031e621f519c9dfce3c7e1ea6880c70a Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Mon, 10 Nov 2014 14:28:26 -0300 Subject: [media] Move mediabus format definition to a more standard place Define MEDIA_BUS_FMT macros (re-using the values defined in the v4l2_mbus_pixelcode enum) into a separate header file so that they can be used from the DRM/KMS subsystem without any reference to the V4L2 subsystem. Then set V4L2_MBUS_FMT definitions to the MEDIA_BUS_FMT values using the V4L2_MBUS_FROM_MEDIA_BUS_FMT macro. Signed-off-by: Boris Brezillon Acked-by: Guennadi Liakhovetski Acked-by: Hans Verkuil Acked-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/media-bus-format.h | 125 +++++++++++++++++++++++ include/uapi/linux/v4l2-mediabus.h | 184 +++++++++++++++------------------- 3 files changed, 206 insertions(+), 104 deletions(-) create mode 100644 include/uapi/linux/media-bus-format.h (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index b70237e8bc37..ed39ac84c0f4 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -241,6 +241,7 @@ header-y += map_to_7segment.h header-y += matroxfb.h header-y += mdio.h header-y += media.h +header-y += media-bus-format.h header-y += mei.h header-y += memfd.h header-y += mempolicy.h diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h new file mode 100644 index 000000000000..23b40908be30 --- /dev/null +++ b/include/uapi/linux/media-bus-format.h @@ -0,0 +1,125 @@ +/* + * Media Bus API header + * + * Copyright (C) 2009, Guennadi Liakhovetski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_MEDIA_BUS_FORMAT_H +#define __LINUX_MEDIA_BUS_FORMAT_H + +/* + * These bus formats uniquely identify data formats on the data bus. Format 0 + * is reserved, MEDIA_BUS_FMT_FIXED shall be used by host-client pairs, where + * the data format is fixed. Additionally, "2X8" means that one pixel is + * transferred in two 8-bit samples, "BE" or "LE" specify in which order those + * samples are transferred over the bus: "LE" means that the least significant + * bits are transferred first, "BE" means that the most significant bits are + * transferred first, and "PADHI" and "PADLO" define which bits - low or high, + * in the incomplete high byte, are filled with padding bits. + * + * The bus formats are grouped by type, bus_width, bits per component, samples + * per pixel and order of subsamples. Numerical values are sorted using generic + * numerical sort order (8 thus comes before 10). + * + * As their value can't change when a new bus format is inserted in the + * enumeration, the bus formats are explicitly given a numerical value. The next + * free values for each category are listed below, update them when inserting + * new pixel codes. + */ + +#define MEDIA_BUS_FMT_FIXED 0x0001 + +/* RGB - next is 0x100e */ +#define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 +#define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 +#define MEDIA_BUS_FMT_RGB555_2X8_PADHI_BE 0x1003 +#define MEDIA_BUS_FMT_RGB555_2X8_PADHI_LE 0x1004 +#define MEDIA_BUS_FMT_BGR565_2X8_BE 0x1005 +#define MEDIA_BUS_FMT_BGR565_2X8_LE 0x1006 +#define MEDIA_BUS_FMT_RGB565_2X8_BE 0x1007 +#define MEDIA_BUS_FMT_RGB565_2X8_LE 0x1008 +#define MEDIA_BUS_FMT_RGB666_1X18 0x1009 +#define MEDIA_BUS_FMT_RGB888_1X24 0x100a +#define MEDIA_BUS_FMT_RGB888_2X12_BE 0x100b +#define MEDIA_BUS_FMT_RGB888_2X12_LE 0x100c +#define MEDIA_BUS_FMT_ARGB8888_1X32 0x100d + +/* YUV (including grey) - next is 0x2024 */ +#define MEDIA_BUS_FMT_Y8_1X8 0x2001 +#define MEDIA_BUS_FMT_UV8_1X8 0x2015 +#define MEDIA_BUS_FMT_UYVY8_1_5X8 0x2002 +#define MEDIA_BUS_FMT_VYUY8_1_5X8 0x2003 +#define MEDIA_BUS_FMT_YUYV8_1_5X8 0x2004 +#define MEDIA_BUS_FMT_YVYU8_1_5X8 0x2005 +#define MEDIA_BUS_FMT_UYVY8_2X8 0x2006 +#define MEDIA_BUS_FMT_VYUY8_2X8 0x2007 +#define MEDIA_BUS_FMT_YUYV8_2X8 0x2008 +#define MEDIA_BUS_FMT_YVYU8_2X8 0x2009 +#define MEDIA_BUS_FMT_Y10_1X10 0x200a +#define MEDIA_BUS_FMT_UYVY10_2X10 0x2018 +#define MEDIA_BUS_FMT_VYUY10_2X10 0x2019 +#define MEDIA_BUS_FMT_YUYV10_2X10 0x200b +#define MEDIA_BUS_FMT_YVYU10_2X10 0x200c +#define MEDIA_BUS_FMT_Y12_1X12 0x2013 +#define MEDIA_BUS_FMT_UYVY8_1X16 0x200f +#define MEDIA_BUS_FMT_VYUY8_1X16 0x2010 +#define MEDIA_BUS_FMT_YUYV8_1X16 0x2011 +#define MEDIA_BUS_FMT_YVYU8_1X16 0x2012 +#define MEDIA_BUS_FMT_YDYUYDYV8_1X16 0x2014 +#define MEDIA_BUS_FMT_UYVY10_1X20 0x201a +#define MEDIA_BUS_FMT_VYUY10_1X20 0x201b +#define MEDIA_BUS_FMT_YUYV10_1X20 0x200d +#define MEDIA_BUS_FMT_YVYU10_1X20 0x200e +#define MEDIA_BUS_FMT_YUV10_1X30 0x2016 +#define MEDIA_BUS_FMT_AYUV8_1X32 0x2017 +#define MEDIA_BUS_FMT_UYVY12_2X12 0x201c +#define MEDIA_BUS_FMT_VYUY12_2X12 0x201d +#define MEDIA_BUS_FMT_YUYV12_2X12 0x201e +#define MEDIA_BUS_FMT_YVYU12_2X12 0x201f +#define MEDIA_BUS_FMT_UYVY12_1X24 0x2020 +#define MEDIA_BUS_FMT_VYUY12_1X24 0x2021 +#define MEDIA_BUS_FMT_YUYV12_1X24 0x2022 +#define MEDIA_BUS_FMT_YVYU12_1X24 0x2023 + +/* Bayer - next is 0x3019 */ +#define MEDIA_BUS_FMT_SBGGR8_1X8 0x3001 +#define MEDIA_BUS_FMT_SGBRG8_1X8 0x3013 +#define MEDIA_BUS_FMT_SGRBG8_1X8 0x3002 +#define MEDIA_BUS_FMT_SRGGB8_1X8 0x3014 +#define MEDIA_BUS_FMT_SBGGR10_ALAW8_1X8 0x3015 +#define MEDIA_BUS_FMT_SGBRG10_ALAW8_1X8 0x3016 +#define MEDIA_BUS_FMT_SGRBG10_ALAW8_1X8 0x3017 +#define MEDIA_BUS_FMT_SRGGB10_ALAW8_1X8 0x3018 +#define MEDIA_BUS_FMT_SBGGR10_DPCM8_1X8 0x300b +#define MEDIA_BUS_FMT_SGBRG10_DPCM8_1X8 0x300c +#define MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8 0x3009 +#define MEDIA_BUS_FMT_SRGGB10_DPCM8_1X8 0x300d +#define MEDIA_BUS_FMT_SBGGR10_2X8_PADHI_BE 0x3003 +#define MEDIA_BUS_FMT_SBGGR10_2X8_PADHI_LE 0x3004 +#define MEDIA_BUS_FMT_SBGGR10_2X8_PADLO_BE 0x3005 +#define MEDIA_BUS_FMT_SBGGR10_2X8_PADLO_LE 0x3006 +#define MEDIA_BUS_FMT_SBGGR10_1X10 0x3007 +#define MEDIA_BUS_FMT_SGBRG10_1X10 0x300e +#define MEDIA_BUS_FMT_SGRBG10_1X10 0x300a +#define MEDIA_BUS_FMT_SRGGB10_1X10 0x300f +#define MEDIA_BUS_FMT_SBGGR12_1X12 0x3008 +#define MEDIA_BUS_FMT_SGBRG12_1X12 0x3010 +#define MEDIA_BUS_FMT_SGRBG12_1X12 0x3011 +#define MEDIA_BUS_FMT_SRGGB12_1X12 0x3012 + +/* JPEG compressed formats - next is 0x4002 */ +#define MEDIA_BUS_FMT_JPEG_1X8 0x4001 + +/* Vendor specific formats - next is 0x5002 */ + +/* S5C73M3 sensor specific interleaved UYVY and JPEG */ +#define MEDIA_BUS_FMT_S5C_UYVY_JPEG_1X8 0x5001 + +/* HSV - next is 0x6002 */ +#define MEDIA_BUS_FMT_AHSV8888_1X32 0x6001 + +#endif /* __LINUX_MEDIA_BUS_FORMAT_H */ diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h index 1445e858854f..26180844c6b8 100644 --- a/include/uapi/linux/v4l2-mediabus.h +++ b/include/uapi/linux/v4l2-mediabus.h @@ -11,120 +11,96 @@ #ifndef __LINUX_V4L2_MEDIABUS_H #define __LINUX_V4L2_MEDIABUS_H +#include #include #include -/* - * These pixel codes uniquely identify data formats on the media bus. Mostly - * they correspond to similarly named V4L2_PIX_FMT_* formats, format 0 is - * reserved, V4L2_MBUS_FMT_FIXED shall be used by host-client pairs, where the - * data format is fixed. Additionally, "2X8" means that one pixel is transferred - * in two 8-bit samples, "BE" or "LE" specify in which order those samples are - * transferred over the bus: "LE" means that the least significant bits are - * transferred first, "BE" means that the most significant bits are transferred - * first, and "PADHI" and "PADLO" define which bits - low or high, in the - * incomplete high byte, are filled with padding bits. - * - * The pixel codes are grouped by type, bus_width, bits per component, samples - * per pixel and order of subsamples. Numerical values are sorted using generic - * numerical sort order (8 thus comes before 10). - * - * As their value can't change when a new pixel code is inserted in the - * enumeration, the pixel codes are explicitly given a numerical value. The next - * free values for each category are listed below, update them when inserting - * new pixel codes. - */ -enum v4l2_mbus_pixelcode { - V4L2_MBUS_FMT_FIXED = 0x0001, +#define V4L2_MBUS_FROM_MEDIA_BUS_FMT(name) \ + V4L2_MBUS_FMT_ ## name = MEDIA_BUS_FMT_ ## name - /* RGB - next is 0x100e */ - V4L2_MBUS_FMT_RGB444_2X8_PADHI_BE = 0x1001, - V4L2_MBUS_FMT_RGB444_2X8_PADHI_LE = 0x1002, - V4L2_MBUS_FMT_RGB555_2X8_PADHI_BE = 0x1003, - V4L2_MBUS_FMT_RGB555_2X8_PADHI_LE = 0x1004, - V4L2_MBUS_FMT_BGR565_2X8_BE = 0x1005, - V4L2_MBUS_FMT_BGR565_2X8_LE = 0x1006, - V4L2_MBUS_FMT_RGB565_2X8_BE = 0x1007, - V4L2_MBUS_FMT_RGB565_2X8_LE = 0x1008, - V4L2_MBUS_FMT_RGB666_1X18 = 0x1009, - V4L2_MBUS_FMT_RGB888_1X24 = 0x100a, - V4L2_MBUS_FMT_RGB888_2X12_BE = 0x100b, - V4L2_MBUS_FMT_RGB888_2X12_LE = 0x100c, - V4L2_MBUS_FMT_ARGB8888_1X32 = 0x100d, +enum v4l2_mbus_pixelcode { + V4L2_MBUS_FROM_MEDIA_BUS_FMT(FIXED), - /* YUV (including grey) - next is 0x2024 */ - V4L2_MBUS_FMT_Y8_1X8 = 0x2001, - V4L2_MBUS_FMT_UV8_1X8 = 0x2015, - V4L2_MBUS_FMT_UYVY8_1_5X8 = 0x2002, - V4L2_MBUS_FMT_VYUY8_1_5X8 = 0x2003, - V4L2_MBUS_FMT_YUYV8_1_5X8 = 0x2004, - V4L2_MBUS_FMT_YVYU8_1_5X8 = 0x2005, - V4L2_MBUS_FMT_UYVY8_2X8 = 0x2006, - V4L2_MBUS_FMT_VYUY8_2X8 = 0x2007, - V4L2_MBUS_FMT_YUYV8_2X8 = 0x2008, - V4L2_MBUS_FMT_YVYU8_2X8 = 0x2009, - V4L2_MBUS_FMT_Y10_1X10 = 0x200a, - V4L2_MBUS_FMT_UYVY10_2X10 = 0x2018, - V4L2_MBUS_FMT_VYUY10_2X10 = 0x2019, - V4L2_MBUS_FMT_YUYV10_2X10 = 0x200b, - V4L2_MBUS_FMT_YVYU10_2X10 = 0x200c, - V4L2_MBUS_FMT_Y12_1X12 = 0x2013, - V4L2_MBUS_FMT_UYVY8_1X16 = 0x200f, - V4L2_MBUS_FMT_VYUY8_1X16 = 0x2010, - V4L2_MBUS_FMT_YUYV8_1X16 = 0x2011, - V4L2_MBUS_FMT_YVYU8_1X16 = 0x2012, - V4L2_MBUS_FMT_YDYUYDYV8_1X16 = 0x2014, - V4L2_MBUS_FMT_UYVY10_1X20 = 0x201a, - V4L2_MBUS_FMT_VYUY10_1X20 = 0x201b, - V4L2_MBUS_FMT_YUYV10_1X20 = 0x200d, - V4L2_MBUS_FMT_YVYU10_1X20 = 0x200e, - V4L2_MBUS_FMT_YUV10_1X30 = 0x2016, - V4L2_MBUS_FMT_AYUV8_1X32 = 0x2017, - V4L2_MBUS_FMT_UYVY12_2X12 = 0x201c, - V4L2_MBUS_FMT_VYUY12_2X12 = 0x201d, - V4L2_MBUS_FMT_YUYV12_2X12 = 0x201e, - V4L2_MBUS_FMT_YVYU12_2X12 = 0x201f, - V4L2_MBUS_FMT_UYVY12_1X24 = 0x2020, - V4L2_MBUS_FMT_VYUY12_1X24 = 0x2021, - V4L2_MBUS_FMT_YUYV12_1X24 = 0x2022, - V4L2_MBUS_FMT_YVYU12_1X24 = 0x2023, + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB444_2X8_PADHI_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB444_2X8_PADHI_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB555_2X8_PADHI_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB555_2X8_PADHI_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(BGR565_2X8_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(BGR565_2X8_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB565_2X8_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB565_2X8_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB666_1X18), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB888_1X24), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB888_2X12_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(RGB888_2X12_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(ARGB8888_1X32), - /* Bayer - next is 0x3019 */ - V4L2_MBUS_FMT_SBGGR8_1X8 = 0x3001, - V4L2_MBUS_FMT_SGBRG8_1X8 = 0x3013, - V4L2_MBUS_FMT_SGRBG8_1X8 = 0x3002, - V4L2_MBUS_FMT_SRGGB8_1X8 = 0x3014, - V4L2_MBUS_FMT_SBGGR10_ALAW8_1X8 = 0x3015, - V4L2_MBUS_FMT_SGBRG10_ALAW8_1X8 = 0x3016, - V4L2_MBUS_FMT_SGRBG10_ALAW8_1X8 = 0x3017, - V4L2_MBUS_FMT_SRGGB10_ALAW8_1X8 = 0x3018, - V4L2_MBUS_FMT_SBGGR10_DPCM8_1X8 = 0x300b, - V4L2_MBUS_FMT_SGBRG10_DPCM8_1X8 = 0x300c, - V4L2_MBUS_FMT_SGRBG10_DPCM8_1X8 = 0x3009, - V4L2_MBUS_FMT_SRGGB10_DPCM8_1X8 = 0x300d, - V4L2_MBUS_FMT_SBGGR10_2X8_PADHI_BE = 0x3003, - V4L2_MBUS_FMT_SBGGR10_2X8_PADHI_LE = 0x3004, - V4L2_MBUS_FMT_SBGGR10_2X8_PADLO_BE = 0x3005, - V4L2_MBUS_FMT_SBGGR10_2X8_PADLO_LE = 0x3006, - V4L2_MBUS_FMT_SBGGR10_1X10 = 0x3007, - V4L2_MBUS_FMT_SGBRG10_1X10 = 0x300e, - V4L2_MBUS_FMT_SGRBG10_1X10 = 0x300a, - V4L2_MBUS_FMT_SRGGB10_1X10 = 0x300f, - V4L2_MBUS_FMT_SBGGR12_1X12 = 0x3008, - V4L2_MBUS_FMT_SGBRG12_1X12 = 0x3010, - V4L2_MBUS_FMT_SGRBG12_1X12 = 0x3011, - V4L2_MBUS_FMT_SRGGB12_1X12 = 0x3012, + V4L2_MBUS_FROM_MEDIA_BUS_FMT(Y8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UV8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY8_1_5X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY8_1_5X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV8_1_5X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU8_1_5X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY8_2X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY8_2X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV8_2X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU8_2X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(Y10_1X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY10_2X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY10_2X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV10_2X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU10_2X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(Y12_1X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY8_1X16), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY8_1X16), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV8_1X16), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU8_1X16), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YDYUYDYV8_1X16), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY10_1X20), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY10_1X20), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV10_1X20), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU10_1X20), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUV10_1X30), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(AYUV8_1X32), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY12_2X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY12_2X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV12_2X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU12_2X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(UYVY12_1X24), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(VYUY12_1X24), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YUYV12_1X24), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(YVYU12_1X24), - /* JPEG compressed formats - next is 0x4002 */ - V4L2_MBUS_FMT_JPEG_1X8 = 0x4001, + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGBRG8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGRBG8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SRGGB8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_ALAW8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGBRG10_ALAW8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGRBG10_ALAW8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SRGGB10_ALAW8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_DPCM8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGBRG10_DPCM8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGRBG10_DPCM8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SRGGB10_DPCM8_1X8), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_2X8_PADHI_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_2X8_PADHI_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_2X8_PADLO_BE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_2X8_PADLO_LE), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR10_1X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGBRG10_1X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGRBG10_1X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SRGGB10_1X10), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SBGGR12_1X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGBRG12_1X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SGRBG12_1X12), + V4L2_MBUS_FROM_MEDIA_BUS_FMT(SRGGB12_1X12), - /* Vendor specific formats - next is 0x5002 */ + V4L2_MBUS_FROM_MEDIA_BUS_FMT(JPEG_1X8), - /* S5C73M3 sensor specific interleaved UYVY and JPEG */ - V4L2_MBUS_FMT_S5C_UYVY_JPEG_1X8 = 0x5001, + V4L2_MBUS_FROM_MEDIA_BUS_FMT(S5C_UYVY_JPEG_1X8), - /* HSV - next is 0x6002 */ - V4L2_MBUS_FMT_AHSV8888_1X32 = 0x6001, + V4L2_MBUS_FROM_MEDIA_BUS_FMT(AHSV8888_1X32), }; /** -- cgit v1.2.3 From 32b32ce84a02d5e3538941e71d2792ebbef8f795 Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Mon, 10 Nov 2014 14:28:28 -0300 Subject: [media] Make use of the new media_bus_format definitions Replace references to the v4l2_mbus_pixelcode enum with the new media_bus_format enum in all common headers. Signed-off-by: Boris Brezillon Acked-by: Sakari Ailus Acked-by: Hans Verkuil Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-mediabus.h | 2 +- include/media/v4l2-subdev.h | 2 +- include/uapi/linux/v4l2-subdev.h | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/media/v4l2-mediabus.h b/include/media/v4l2-mediabus.h index 395c4a95a42a..59d7397fb0ea 100644 --- a/include/media/v4l2-mediabus.h +++ b/include/media/v4l2-mediabus.h @@ -98,7 +98,7 @@ static inline void v4l2_fill_pix_format(struct v4l2_pix_format *pix_fmt, static inline void v4l2_fill_mbus_format(struct v4l2_mbus_framefmt *mbus_fmt, const struct v4l2_pix_format *pix_fmt, - enum v4l2_mbus_pixelcode code) + u32 code) { mbus_fmt->width = pix_fmt->width; mbus_fmt->height = pix_fmt->height; diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index d7465725773d..5860292d42eb 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -341,7 +341,7 @@ struct v4l2_subdev_video_ops { int (*query_dv_timings)(struct v4l2_subdev *sd, struct v4l2_dv_timings *timings); int (*enum_mbus_fmt)(struct v4l2_subdev *sd, unsigned int index, - enum v4l2_mbus_pixelcode *code); + u32 *code); int (*enum_mbus_fsizes)(struct v4l2_subdev *sd, struct v4l2_frmsizeenum *fsize); int (*g_mbus_fmt)(struct v4l2_subdev *sd, diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h index a619cdd300ac..e0a7e3da498a 100644 --- a/include/uapi/linux/v4l2-subdev.h +++ b/include/uapi/linux/v4l2-subdev.h @@ -68,7 +68,7 @@ struct v4l2_subdev_crop { * struct v4l2_subdev_mbus_code_enum - Media bus format enumeration * @pad: pad number, as reported by the media API * @index: format index during enumeration - * @code: format code (from enum v4l2_mbus_pixelcode) + * @code: format code (MEDIA_BUS_FMT_ definitions) */ struct v4l2_subdev_mbus_code_enum { __u32 pad; @@ -81,7 +81,7 @@ struct v4l2_subdev_mbus_code_enum { * struct v4l2_subdev_frame_size_enum - Media bus format enumeration * @pad: pad number, as reported by the media API * @index: format index during enumeration - * @code: format code (from enum v4l2_mbus_pixelcode) + * @code: format code (MEDIA_BUS_FMT_ definitions) */ struct v4l2_subdev_frame_size_enum { __u32 index; @@ -109,7 +109,7 @@ struct v4l2_subdev_frame_interval { * struct v4l2_subdev_frame_interval_enum - Frame interval enumeration * @pad: pad number, as reported by the media API * @index: frame interval index during enumeration - * @code: format code (from enum v4l2_mbus_pixelcode) + * @code: format code (MEDIA_BUS_FMT_ definitions) * @width: frame width in pixels * @height: frame height in pixels * @interval: frame interval in seconds -- cgit v1.2.3 From aa1b4da6af67d5c6a4e933c2d30890c6f282f505 Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Mon, 10 Nov 2014 14:28:35 -0300 Subject: [media] v4l: Forbid usage of V4L2_MBUS_FMT definitions inside the kernel Place v4l2_mbus_pixelcode in a #ifndef __KERNEL__ section so that kernel users don't have access to these definitions. We have to keep this definition for user-space users even though they're encouraged to move to the new media_bus_format enum. Signed-off-by: Boris Brezillon Acked-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-mediabus.h | 45 ++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h index 26180844c6b8..b1934a3961e4 100644 --- a/include/uapi/linux/v4l2-mediabus.h +++ b/include/uapi/linux/v4l2-mediabus.h @@ -15,6 +15,33 @@ #include #include +/** + * struct v4l2_mbus_framefmt - frame format on the media bus + * @width: frame width + * @height: frame height + * @code: data format code (from enum v4l2_mbus_pixelcode) + * @field: used interlacing type (from enum v4l2_field) + * @colorspace: colorspace of the data (from enum v4l2_colorspace) + */ +struct v4l2_mbus_framefmt { + __u32 width; + __u32 height; + __u32 code; + __u32 field; + __u32 colorspace; + __u32 reserved[7]; +}; + +#ifndef __KERNEL__ +/* + * enum v4l2_mbus_pixelcode and its definitions are now deprecated, and + * MEDIA_BUS_FMT_ definitions (defined in media-bus-format.h) should be + * used instead. + * + * New defines should only be added to media-bus-format.h. The + * v4l2_mbus_pixelcode enum is frozen. + */ + #define V4L2_MBUS_FROM_MEDIA_BUS_FMT(name) \ V4L2_MBUS_FMT_ ## name = MEDIA_BUS_FMT_ ## name @@ -102,22 +129,6 @@ enum v4l2_mbus_pixelcode { V4L2_MBUS_FROM_MEDIA_BUS_FMT(AHSV8888_1X32), }; - -/** - * struct v4l2_mbus_framefmt - frame format on the media bus - * @width: frame width - * @height: frame height - * @code: data format code (from enum v4l2_mbus_pixelcode) - * @field: used interlacing type (from enum v4l2_field) - * @colorspace: colorspace of the data (from enum v4l2_colorspace) - */ -struct v4l2_mbus_framefmt { - __u32 width; - __u32 height; - __u32 code; - __u32 field; - __u32 colorspace; - __u32 reserved[7]; -}; +#endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From f622b429dadf83c3cc2d70f57f407ad85684eb36 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Tue, 4 Nov 2014 16:51:22 +0800 Subject: sched: Update comments about CLONE_NEWUTS and CLONE_NEWIPC Remove question mark: s/New utsname group?/New utsname namespace Unified style for IPC: s/New ipcs/New ipc namespace Signed-off-by: Chen Hanxiao Acked-by: Serge E. Hallyn Signed-off-by: Peter Zijlstra (Intel) Cc: Jiri Kosina Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1415091082-15093-1-git-send-email-chenhanxiao@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/uapi/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index b932be9f5c5b..cc89ddefa926 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -23,8 +23,8 @@ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) and is now available for re-use. */ -#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ -#define CLONE_NEWIPC 0x08000000 /* New ipcs */ +#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ #define CLONE_NEWPID 0x20000000 /* New pid namespace */ #define CLONE_NEWNET 0x40000000 /* New network namespace */ -- cgit v1.2.3 From 60e2364e60e86e81bc6377f49779779e6120977f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 24 Sep 2014 13:48:37 +0200 Subject: perf: Add ability to sample machine state on interrupt Enable capture of interrupted machine state for each sample. Registers to sample are passed per event in the sample_regs_intr bitmask. To sample interrupt machine state, the PERF_SAMPLE_INTR_REGS must be passed in sample_type. The list of available registers is arch dependent and provided by asm/perf_regs.h Registers are laid out as u64 in the order of the bit order of sample_intr_regs. This patch also adds a new ABI version PERF_ATTR_SIZE_VER4 because we extend the perf_event_attr struct with a new u64 field. Reviewed-by: Jiri Olsa Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: cebbert.lkml@gmail.com Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1411559322-16548-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 +++++-- include/uapi/linux/perf_event.h | 15 +++++++++++++- kernel/events/core.c | 46 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 63 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 893a0d07986f..68d46d536e24 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -79,7 +79,7 @@ struct perf_branch_stack { struct perf_branch_entry entries[0]; }; -struct perf_regs_user { +struct perf_regs { __u64 abi; struct pt_regs *regs; }; @@ -600,7 +600,8 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; - struct perf_regs_user regs_user; + struct perf_regs regs_user; + struct perf_regs regs_intr; u64 stack_user_size; u64 weight; /* @@ -630,6 +631,8 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->weight = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; + data->regs_intr.abi = PERF_SAMPLE_REGS_ABI_NONE; + data->regs_intr.regs = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9d845404d875..9b79abbd1ab8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -137,8 +137,9 @@ enum perf_event_sample_format { PERF_SAMPLE_DATA_SRC = 1U << 15, PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, + PERF_SAMPLE_REGS_INTR = 1U << 18, - PERF_SAMPLE_MAX = 1U << 18, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ }; /* @@ -238,6 +239,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ /* add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -334,6 +336,15 @@ struct perf_event_attr { /* Align to u64. */ __u32 __reserved_2; + /* + * Defines set of regs to dump for each sample + * state captured on: + * - precise = 0: PMU interrupt + * - precise > 0: sampled instruction + * + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_intr; }; #define perf_flags(attr) (*(&(attr)->read_format + 1)) @@ -686,6 +697,8 @@ enum perf_event_type { * { u64 weight; } && PERF_SAMPLE_WEIGHT * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cd5eef1fcdd..c2be1597ece7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4460,7 +4460,7 @@ perf_output_sample_regs(struct perf_output_handle *handle, } } -static void perf_sample_regs_user(struct perf_regs_user *regs_user, +static void perf_sample_regs_user(struct perf_regs *regs_user, struct pt_regs *regs) { if (!user_mode(regs)) { @@ -4476,6 +4476,14 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user, } } +static void perf_sample_regs_intr(struct perf_regs *regs_intr, + struct pt_regs *regs) +{ + regs_intr->regs = regs; + regs_intr->abi = perf_reg_abi(current); +} + + /* * Get remaining task size from user stack pointer. * @@ -4857,6 +4865,23 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_TRANSACTION) perf_output_put(handle, data->txn); + if (sample_type & PERF_SAMPLE_REGS_INTR) { + u64 abi = data->regs_intr.abi; + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_intr; + + perf_output_sample_regs(handle, + data->regs_intr.regs, + mask); + } + } + if (!event->attr.watermark) { int wakeup_events = event->attr.wakeup_events; @@ -4943,7 +4968,7 @@ void perf_prepare_sample(struct perf_event_header *header, * in case new sample type is added, because we could eat * up the rest of the sample size. */ - struct perf_regs_user *uregs = &data->regs_user; + struct perf_regs *uregs = &data->regs_user; u16 stack_size = event->attr.sample_stack_user; u16 size = sizeof(u64); @@ -4964,6 +4989,21 @@ void perf_prepare_sample(struct perf_event_header *header, data->stack_user_size = stack_size; header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_intr(&data->regs_intr, regs); + + if (data->regs_intr.regs) { + u64 mask = event->attr.sample_regs_intr; + + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -7151,6 +7191,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, ret = -EINVAL; } + if (attr->sample_type & PERF_SAMPLE_REGS_INTR) + ret = perf_reg_validate(attr->sample_regs_intr); out: return ret; -- cgit v1.2.3 From 0288d7183c41c0192d2963d44590f346f4aee917 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 17 Nov 2014 15:51:01 -0500 Subject: audit: convert status version to a feature bitmap The version field defined in the audit status structure was found to have limitations in terms of its expressibility of features supported. This is distict from the get/set features call to be able to command those features that are present. Converting this field from a version number to a feature bitmap will allow distributions to selectively backport and support certain features and will allow upstream to be able to deprecate features in the future. It will allow userspace clients to first query the kernel for which features are actually present and supported. Currently, EINVAL is returned rather than EOPNOTSUP, which isn't helpful in determining if there was an error in the command, or if it simply isn't supported yet. Past features are not represented by this bitmap, but their use may be converted to EOPNOTSUP if needed in the future. Since "version" is too generic to convert with a #define, use a union in the struct status, introducing the member "feature_bitmap" unionized with "version". Convert existing AUDIT_VERSION_* macros over to AUDIT_FEATURE_BITMAP* counterparts, leaving the former for backwards compatibility. Signed-off-by: Richard Guy Briggs [PM: minor whitespace tweaks] Signed-off-by: Paul Moore --- include/uapi/linux/audit.h | 17 +++++++++++++---- kernel/audit.c | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 4d100c841c80..2ccf19e06e15 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -322,9 +322,15 @@ enum { #define AUDIT_STATUS_BACKLOG_LIMIT 0x0010 #define AUDIT_STATUS_BACKLOG_WAIT_TIME 0x0020 -#define AUDIT_VERSION_BACKLOG_LIMIT 1 -#define AUDIT_VERSION_BACKLOG_WAIT_TIME 2 -#define AUDIT_VERSION_LATEST AUDIT_VERSION_BACKLOG_WAIT_TIME +#define AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT 0x00000001 +#define AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME 0x00000002 +#define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \ + AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME) + +/* deprecated: AUDIT_VERSION_* */ +#define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL +#define AUDIT_VERSION_BACKLOG_LIMIT AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT +#define AUDIT_VERSION_BACKLOG_WAIT_TIME AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME /* Failure-to-log actions */ #define AUDIT_FAIL_SILENT 0 @@ -403,7 +409,10 @@ struct audit_status { __u32 backlog_limit; /* waiting messages limit */ __u32 lost; /* messages lost */ __u32 backlog; /* messages waiting in queue */ - __u32 version; /* audit api version number */ + union { + __u32 version; /* deprecated: audit api version num */ + __u32 feature_bitmap; /* bitmap of kernel audit features */ + }; __u32 backlog_wait_time;/* message queue wait timeout */ }; diff --git a/kernel/audit.c b/kernel/audit.c index 53bb39bf79e2..7b83c557aee6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -842,7 +842,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); s.backlog = skb_queue_len(&audit_skb_queue); - s.version = AUDIT_VERSION_LATEST; + s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL; s.backlog_wait_time = audit_backlog_wait_time; audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; -- cgit v1.2.3 From ee1b58d36aa1b5a79eaba11f5c3633c88231da83 Mon Sep 17 00:00:00 2001 From: Qiaowei Ren Date: Fri, 14 Nov 2014 07:18:19 -0800 Subject: mpx: Extend siginfo structure to include bound violation information This patch adds new fields about bound violation into siginfo structure. si_lower and si_upper are respectively lower bound and upper bound when bound violation is caused. Signed-off-by: Qiaowei Ren Signed-off-by: Dave Hansen Cc: linux-mm@kvack.org Cc: linux-mips@linux-mips.org Cc: Dave Hansen Link: http://lkml.kernel.org/r/20141114151819.1908C900@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- include/uapi/asm-generic/siginfo.h | 9 ++++++++- kernel/signal.c | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index ba5be7fdbdfe..1e3552037a5a 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -91,6 +91,10 @@ typedef struct siginfo { int _trapno; /* TRAP # which caused the signal */ #endif short _addr_lsb; /* LSB of the reported address */ + struct { + void __user *_lower; + void __user *_upper; + } _addr_bnd; } _sigfault; /* SIGPOLL */ @@ -131,6 +135,8 @@ typedef struct siginfo { #define si_trapno _sifields._sigfault._trapno #endif #define si_addr_lsb _sifields._sigfault._addr_lsb +#define si_lower _sifields._sigfault._addr_bnd._lower +#define si_upper _sifields._sigfault._addr_bnd._upper #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #ifdef __ARCH_SIGSYS @@ -199,7 +205,8 @@ typedef struct siginfo { */ #define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ #define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ -#define NSIGSEGV 2 +#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ +#define NSIGSEGV 3 /* * SIGBUS si_codes diff --git a/kernel/signal.c b/kernel/signal.c index 8f0876f9f6dd..2c403a4c8bee 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2747,6 +2747,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) */ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif +#ifdef SEGV_BNDERR + err |= __put_user(from->si_lower, &to->si_lower); + err |= __put_user(from->si_upper, &to->si_upper); #endif break; case __SI_CHLD: -- cgit v1.2.3 From fe3d197f84319d3bce379a9c0dc17b1f48ad358c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 14 Nov 2014 07:18:29 -0800 Subject: x86, mpx: On-demand kernel allocation of bounds tables This is really the meat of the MPX patch set. If there is one patch to review in the entire series, this is the one. There is a new ABI here and this kernel code also interacts with userspace memory in a relatively unusual manner. (small FAQ below). Long Description: This patch adds two prctl() commands to provide enable or disable the management of bounds tables in kernel, including on-demand kernel allocation (See the patch "on-demand kernel allocation of bounds tables") and cleanup (See the patch "cleanup unused bound tables"). Applications do not strictly need the kernel to manage bounds tables and we expect some applications to use MPX without taking advantage of this kernel support. This means the kernel can not simply infer whether an application needs bounds table management from the MPX registers. The prctl() is an explicit signal from userspace. PR_MPX_ENABLE_MANAGEMENT is meant to be a signal from userspace to require kernel's help in managing bounds tables. PR_MPX_DISABLE_MANAGEMENT is the opposite, meaning that userspace don't want kernel's help any more. With PR_MPX_DISABLE_MANAGEMENT, the kernel won't allocate and free bounds tables even if the CPU supports MPX. PR_MPX_ENABLE_MANAGEMENT will fetch the base address of the bounds directory out of a userspace register (bndcfgu) and then cache it into a new field (->bd_addr) in the 'mm_struct'. PR_MPX_DISABLE_MANAGEMENT will set "bd_addr" to an invalid address. Using this scheme, we can use "bd_addr" to determine whether the management of bounds tables in kernel is enabled. Also, the only way to access that bndcfgu register is via an xsaves, which can be expensive. Caching "bd_addr" like this also helps reduce the cost of those xsaves when doing table cleanup at munmap() time. Unfortunately, we can not apply this optimization to #BR fault time because we need an xsave to get the value of BNDSTATUS. ==== Why does the hardware even have these Bounds Tables? ==== MPX only has 4 hardware registers for storing bounds information. If MPX-enabled code needs more than these 4 registers, it needs to spill them somewhere. It has two special instructions for this which allow the bounds to be moved between the bounds registers and some new "bounds tables". They are similar conceptually to a page fault and will be raised by the MPX hardware during both bounds violations or when the tables are not present. This patch handles those #BR exceptions for not-present tables by carving the space out of the normal processes address space (essentially calling the new mmap() interface indroduced earlier in this patch set.) and then pointing the bounds-directory over to it. The tables *need* to be accessed and controlled by userspace because the instructions for moving bounds in and out of them are extremely frequent. They potentially happen every time a register pointing to memory is dereferenced. Any direct kernel involvement (like a syscall) to access the tables would obviously destroy performance. ==== Why not do this in userspace? ==== This patch is obviously doing this allocation in the kernel. However, MPX does not strictly *require* anything in the kernel. It can theoretically be done completely from userspace. Here are a few ways this *could* be done. I don't think any of them are practical in the real-world, but here they are. Q: Can virtual space simply be reserved for the bounds tables so that we never have to allocate them? A: As noted earlier, these tables are *HUGE*. An X-GB virtual area needs 4*X GB of virtual space, plus 2GB for the bounds directory. If we were to preallocate them for the 128TB of user virtual address space, we would need to reserve 512TB+2GB, which is larger than the entire virtual address space today. This means they can not be reserved ahead of time. Also, a single process's pre-popualated bounds directory consumes 2GB of virtual *AND* physical memory. IOW, it's completely infeasible to prepopulate bounds directories. Q: Can we preallocate bounds table space at the same time memory is allocated which might contain pointers that might eventually need bounds tables? A: This would work if we could hook the site of each and every memory allocation syscall. This can be done for small, constrained applications. But, it isn't practical at a larger scale since a given app has no way of controlling how all the parts of the app might allocate memory (think libraries). The kernel is really the only place to intercept these calls. Q: Could a bounds fault be handed to userspace and the tables allocated there in a signal handler instead of in the kernel? A: (thanks to tglx) mmap() is not on the list of safe async handler functions and even if mmap() would work it still requires locking or nasty tricks to keep track of the allocation state there. Having ruled out all of the userspace-only approaches for managing bounds tables that we could think of, we create them on demand in the kernel. Based-on-patch-by: Qiaowei Ren Signed-off-by: Dave Hansen Cc: linux-mm@kvack.org Cc: linux-mips@linux-mips.org Cc: Dave Hansen Link: http://lkml.kernel.org/r/20141114151829.AD4310DE@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mmu_context.h | 7 ++ arch/x86/include/asm/mpx.h | 41 +++++++ arch/x86/include/asm/processor.h | 18 +++ arch/x86/kernel/setup.c | 2 + arch/x86/kernel/traps.c | 85 +++++++++++++- arch/x86/mm/mpx.c | 223 ++++++++++++++++++++++++++++++++++++- fs/exec.c | 2 + include/asm-generic/mmu_context.h | 5 + include/linux/mm_types.h | 4 + include/uapi/linux/prctl.h | 6 + kernel/sys.c | 12 ++ 11 files changed, 399 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 166af2a8e865..0b0ba91ff1ef 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -10,6 +10,7 @@ #include #include #include +#include #ifndef CONFIG_PARAVIRT #include @@ -102,4 +103,10 @@ do { \ } while (0) #endif +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + mpx_mm_init(mm); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 35bcb1cddf40..05eecbf8a484 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -5,6 +5,14 @@ #include #include +/* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ +#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1) +#define MPX_BNDCFG_ENABLE_FLAG 0x1 +#define MPX_BD_ENTRY_VALID_FLAG 0x1 + #ifdef CONFIG_X86_64 /* upper 28 bits [47:20] of the virtual address in 64-bit used to @@ -18,6 +26,7 @@ #define MPX_BT_ENTRY_OFFSET 17 #define MPX_BT_ENTRY_SHIFT 5 #define MPX_IGN_BITS 3 +#define MPX_BD_ENTRY_TAIL 3 #else @@ -26,23 +35,55 @@ #define MPX_BT_ENTRY_OFFSET 10 #define MPX_BT_ENTRY_SHIFT 4 #define MPX_IGN_BITS 2 +#define MPX_BD_ENTRY_TAIL 2 #endif #define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT)) #define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT)) +#define MPX_BNDSTA_TAIL 2 +#define MPX_BNDCFG_TAIL 12 +#define MPX_BNDSTA_ADDR_MASK (~((1UL<bd_addr != MPX_INVALID_BOUNDS_DIR); +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ + /* + * NULL is theoretically a valid place to put the bounds + * directory, so point this at an invalid address. + */ + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; +} #else static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, struct xsave_struct *xsave_buf) { return NULL; } +static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + return -EINVAL; +} +static inline int kernel_managing_mpx_tables(struct mm_struct *mm) +{ + return 0; +} +static inline void mpx_mm_init(struct mm_struct *mm) +{ +} #endif /* CONFIG_X86_INTEL_MPX */ #endif /* _ASM_X86_MPX_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6571aaabacb9..9617a1716813 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -954,6 +954,24 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +/* Register/unregister a process' MPX related resource */ +#define MPX_ENABLE_MANAGEMENT(tsk) mpx_enable_management((tsk)) +#define MPX_DISABLE_MANAGEMENT(tsk) mpx_disable_management((tsk)) + +#ifdef CONFIG_X86_INTEL_MPX +extern int mpx_enable_management(struct task_struct *tsk); +extern int mpx_disable_management(struct task_struct *tsk); +#else +static inline int mpx_enable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +static inline int mpx_disable_management(struct task_struct *tsk) +{ + return -EINVAL; +} +#endif /* CONFIG_X86_INTEL_MPX */ + extern u16 amd_get_nb_id(int cpu); static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab08aa2276fb..214245d6b996 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -960,6 +960,8 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; + mpx_mm_init(&init_mm); + code_resource.start = __pa_symbol(_text); code_resource.end = __pa_symbol(_etext)-1; data_resource.start = __pa_symbol(_etext); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 0d0e922fafc1..651d5d4f7558 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -60,6 +60,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -228,7 +229,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) @@ -278,6 +278,89 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif +dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + struct xsave_struct *xsave_buf; + enum ctx_state prev_state; + struct bndcsr *bndcsr; + siginfo_t *info; + + prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "bounds", regs, error_code, + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) + goto exit; + conditional_sti(regs); + + if (!user_mode(regs)) + die("bounds", regs, error_code); + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) { + /* The exception is not from Intel MPX */ + goto exit_trap; + } + + /* + * We need to look at BNDSTATUS to resolve this exception. + * It is not directly accessible, though, so we need to + * do an xsave and then pull it out of the xsave buffer. + */ + fpu_save_init(&tsk->thread.fpu); + xsave_buf = &(tsk->thread.fpu.state->xsave); + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + goto exit_trap; + + /* + * The error code field of the BNDSTATUS register communicates status + * information of a bound range exception #BR or operation involving + * bound directory. + */ + switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) { + case 2: /* Bound directory has invalid entry. */ + if (mpx_handle_bd_fault(xsave_buf)) + goto exit_trap; + break; /* Success, it was handled */ + case 1: /* Bound violation. */ + info = mpx_generate_siginfo(regs, xsave_buf); + if (PTR_ERR(info)) { + /* + * We failed to decode the MPX instruction. Act as if + * the exception was not caused by MPX. + */ + goto exit_trap; + } + /* + * Success, we decoded the instruction and retrieved + * an 'info' containing the address being accessed + * which caused the exception. This information + * allows and application to possibly handle the + * #BR exception itself. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, info); + kfree(info); + break; + case 0: /* No exception caused by Intel MPX operations. */ + goto exit_trap; + default: + die("bounds", regs, error_code); + } + +exit: + exception_exit(prev_state); + return; +exit_trap: + /* + * This path out is for all the cases where we could not + * handle the exception in some way (like allocating a + * table or telling userspace about it. We will also end + * up here if the kernel has MPX turned off at compile + * time.. + */ + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); + exception_exit(prev_state); +} + dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 9009e094d686..96266375441e 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -10,8 +10,12 @@ #include #include +#include +#include #include #include +#include +#include static const char *mpx_mapping_name(struct vm_area_struct *vma) { @@ -266,10 +270,11 @@ bad_opcode: siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, struct xsave_struct *xsave_buf) { + struct bndreg *bndregs, *bndreg; + siginfo_t *info = NULL; struct insn insn; uint8_t bndregno; int err; - siginfo_t *info; err = mpx_insn_decode(&insn, regs); if (err) @@ -285,6 +290,15 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, err = -EINVAL; goto err_out; } + /* get the bndregs _area_ of the xsave structure */ + bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS); + if (!bndregs) { + err = -EINVAL; + goto err_out; + } + /* now go select the individual register in the set of 4 */ + bndreg = &bndregs[bndregno]; + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { err = -ENOMEM; @@ -300,10 +314,8 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, * complains when casting from integers to different-size * pointers. */ - info->si_lower = (void __user *)(unsigned long) - (xsave_buf->bndreg[bndregno].lower_bound); - info->si_upper = (void __user *)(unsigned long) - (~xsave_buf->bndreg[bndregno].upper_bound); + info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound; + info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound; info->si_addr_lsb = 0; info->si_signo = SIGSEGV; info->si_errno = 0; @@ -319,5 +331,206 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs, } return info; err_out: + /* info might be NULL, but kfree() handles that */ + kfree(info); return ERR_PTR(err); } + +static __user void *task_get_bounds_dir(struct task_struct *tsk) +{ + struct bndcsr *bndcsr; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * The bounds directory pointer is stored in a register + * only accessible if we first do an xsave. + */ + fpu_save_init(&tsk->thread.fpu); + bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR); + if (!bndcsr) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Make sure the register looks valid by checking the + * enable bit. + */ + if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG)) + return MPX_INVALID_BOUNDS_DIR; + + /* + * Lastly, mask off the low bits used for configuration + * flags, and return the address of the bounds table. + */ + return (void __user *)(unsigned long) + (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK); +} + +int mpx_enable_management(struct task_struct *tsk) +{ + void __user *bd_base = MPX_INVALID_BOUNDS_DIR; + struct mm_struct *mm = tsk->mm; + int ret = 0; + + /* + * runtime in the userspace will be responsible for allocation of + * the bounds directory. Then, it will save the base of the bounds + * directory into XSAVE/XRSTOR Save Area and enable MPX through + * XRSTOR instruction. + * + * fpu_xsave() is expected to be very expensive. Storing the bounds + * directory here means that we do not have to do xsave in the unmap + * path; we can just use mm->bd_addr instead. + */ + bd_base = task_get_bounds_dir(tsk); + down_write(&mm->mmap_sem); + mm->bd_addr = bd_base; + if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + ret = -ENXIO; + + up_write(&mm->mmap_sem); + return ret; +} + +int mpx_disable_management(struct task_struct *tsk) +{ + struct mm_struct *mm = current->mm; + + if (!cpu_feature_enabled(X86_FEATURE_MPX)) + return -ENXIO; + + down_write(&mm->mmap_sem); + mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + up_write(&mm->mmap_sem); + return 0; +} + +/* + * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each + * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB, + * and the size of each bounds table is 4MB. + */ +static int allocate_bt(long __user *bd_entry) +{ + unsigned long expected_old_val = 0; + unsigned long actual_old_val = 0; + unsigned long bt_addr; + int ret = 0; + + /* + * Carve the virtual space out of userspace for the new + * bounds table: + */ + bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES); + if (IS_ERR((void *)bt_addr)) + return PTR_ERR((void *)bt_addr); + /* + * Set the valid flag (kinda like _PAGE_PRESENT in a pte) + */ + bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG; + + /* + * Go poke the address of the new bounds table in to the + * bounds directory entry out in userspace memory. Note: + * we may race with another CPU instantiating the same table. + * In that case the cmpxchg will see an unexpected + * 'actual_old_val'. + * + * This can fault, but that's OK because we do not hold + * mmap_sem at this point, unlike some of the other part + * of the MPX code that have to pagefault_disable(). + */ + ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry, + expected_old_val, bt_addr); + if (ret) + goto out_unmap; + + /* + * The user_atomic_cmpxchg_inatomic() will only return nonzero + * for faults, *not* if the cmpxchg itself fails. Now we must + * verify that the cmpxchg itself completed successfully. + */ + /* + * We expected an empty 'expected_old_val', but instead found + * an apparently valid entry. Assume we raced with another + * thread to instantiate this table and desclare succecss. + */ + if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) { + ret = 0; + goto out_unmap; + } + /* + * We found a non-empty bd_entry but it did not have the + * VALID_FLAG set. Return an error which will result in + * a SEGV since this probably means that somebody scribbled + * some invalid data in to a bounds table. + */ + if (expected_old_val != actual_old_val) { + ret = -EINVAL; + goto out_unmap; + } + return 0; +out_unmap: + vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES); + return ret; +} + +/* + * When a BNDSTX instruction attempts to save bounds to a bounds + * table, it will first attempt to look up the table in the + * first-level bounds directory. If it does not find a table in + * the directory, a #BR is generated and we get here in order to + * allocate a new table. + * + * With 32-bit mode, the size of BD is 4MB, and the size of each + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB, + * and the size of each bound table is 4MB. + */ +static int do_mpx_bt_fault(struct xsave_struct *xsave_buf) +{ + unsigned long bd_entry, bd_base; + struct bndcsr *bndcsr; + + bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR); + if (!bndcsr) + return -EINVAL; + /* + * Mask off the preserve and enable bits + */ + bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK; + /* + * The hardware provides the address of the missing or invalid + * entry via BNDSTATUS, so we don't have to go look it up. + */ + bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK; + /* + * Make sure the directory entry is within where we think + * the directory is. + */ + if ((bd_entry < bd_base) || + (bd_entry >= bd_base + MPX_BD_SIZE_BYTES)) + return -EINVAL; + + return allocate_bt((long __user *)bd_entry); +} + +int mpx_handle_bd_fault(struct xsave_struct *xsave_buf) +{ + /* + * Userspace never asked us to manage the bounds tables, + * so refuse to help. + */ + if (!kernel_managing_mpx_tables(current->mm)) + return -EINVAL; + + if (do_mpx_bt_fault(xsave_buf)) { + force_sig(SIGSEGV, current); + /* + * The force_sig() is essentially "handling" this + * exception, so we do not pass up the error + * from do_mpx_bt_fault(). + */ + } + return 0; +} diff --git a/fs/exec.c b/fs/exec.c index 7302b75a9820..65d4f5c70ef4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include "internal.h" @@ -277,6 +278,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) goto err; mm->stack_vm = mm->total_vm = 1; + arch_bprm_mm_init(mm, vma); up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); return 0; diff --git a/include/asm-generic/mmu_context.h b/include/asm-generic/mmu_context.h index a7eec910ba6c..1f2a8f9c9264 100644 --- a/include/asm-generic/mmu_context.h +++ b/include/asm-generic/mmu_context.h @@ -42,4 +42,9 @@ static inline void activate_mm(struct mm_struct *prev_mm, { } +static inline void arch_bprm_mm_init(struct mm_struct *mm, + struct vm_area_struct *vma) +{ +} + #endif /* __ASM_GENERIC_MMU_CONTEXT_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e0b286649f1..004e9d17b47e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -454,6 +454,10 @@ struct mm_struct { bool tlb_flush_pending; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_X86_INTEL_MPX + /* address of the bounds directory */ + void __user *bd_addr; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 513df75d0fc9..89f63503f903 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -179,4 +179,10 @@ struct prctl_mm_map { #define PR_SET_THP_DISABLE 41 #define PR_GET_THP_DISABLE 42 +/* + * Tell the kernel to start/stop helping userspace manage bounds tables. + */ +#define PR_MPX_ENABLE_MANAGEMENT 43 +#define PR_MPX_DISABLE_MANAGEMENT 44 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 1eaa2f0b0246..a8c9f5a7dda6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -91,6 +91,12 @@ #ifndef SET_TSC_CTL # define SET_TSC_CTL(a) (-EINVAL) #endif +#ifndef MPX_ENABLE_MANAGEMENT +# define MPX_ENABLE_MANAGEMENT(a) (-EINVAL) +#endif +#ifndef MPX_DISABLE_MANAGEMENT +# define MPX_DISABLE_MANAGEMENT(a) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2203,6 +2209,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, me->mm->def_flags &= ~VM_NOHUGEPAGE; up_write(&me->mm->mmap_sem); break; + case PR_MPX_ENABLE_MANAGEMENT: + error = MPX_ENABLE_MANAGEMENT(me); + break; + case PR_MPX_DISABLE_MANAGEMENT: + error = MPX_DISABLE_MANAGEMENT(me); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 3274f52073d88b62f3c5ace82ae9d48546232e72 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:44 -0800 Subject: bpf: add 'flags' attribute to BPF_MAP_UPDATE_ELEM command the current meaning of BPF_MAP_UPDATE_ELEM syscall command is: either update existing map element or create a new one. Initially the plan was to add a new command to handle the case of 'create new element if it didn't exist', but 'flags' style looks cleaner and overall diff is much smaller (more code reused), so add 'flags' attribute to BPF_MAP_UPDATE_ELEM command with the following meaning: #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ bpf_update_elem(fd, key, value, BPF_NOEXIST) call can fail with EEXIST if element already exists. bpf_update_elem(fd, key, value, BPF_EXIST) can fail with ENOENT if element doesn't exist. Userspace will call it as: int bpf_update_elem(int fd, void *key, void *value, __u64 flags) { union bpf_attr attr = { .map_fd = fd, .key = ptr_to_u64(key), .value = ptr_to_u64(value), .flags = flags; }; return bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } First two bits of 'flags' are used to encode style of bpf_update_elem() command. Bits 2-63 are reserved for future use. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/syscall.c | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf91754a957..51e9242e4803 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -22,7 +22,7 @@ struct bpf_map_ops { /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); - int (*map_update_elem)(struct bpf_map *map, void *key, void *value); + int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d18316f9e9c4..3e9e1b77f29d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -82,7 +82,7 @@ enum bpf_cmd { /* create or update key/value pair in a given map * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size) - * Using attr->map_fd, attr->key, attr->value + * Using attr->map_fd, attr->key, attr->value, attr->flags * returns zero or negative error */ BPF_MAP_UPDATE_ELEM, @@ -117,6 +117,11 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, }; +/* flags for BPF_MAP_UPDATE_ELEM command */ +#define BPF_ANY 0 /* create new element or update existing */ +#define BPF_NOEXIST 1 /* create new element if it didn't exist */ +#define BPF_EXIST 2 /* update existing element */ + union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ @@ -132,6 +137,7 @@ union bpf_attr { __aligned_u64 value; __aligned_u64 next_key; }; + __u64 flags; }; struct { /* anonymous struct used by BPF_PROG_LOAD command */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ba61c8c16032..c0d03bf317a2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -190,7 +190,7 @@ err_put: return err; } -#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags static int map_update_elem(union bpf_attr *attr) { @@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr) * therefore all map accessors rely on this fact, so do the same here */ rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value); + err = map->ops->map_update_elem(map, key, value, attr->flags); rcu_read_unlock(); free_value: -- cgit v1.2.3 From 0f8e4bd8a1fc8c4185f1630061d0a1f2d197a475 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:45 -0800 Subject: bpf: add hashtable type of eBPF maps add new map type BPF_MAP_TYPE_HASH and its implementation - maps are created/destroyed by userspace. Both userspace and eBPF programs can lookup/update/delete elements from the map - eBPF programs can be called in_irq(), so use spin_lock_irqsave() mechanism for concurrent updates - key/value are opaque range of bytes (aligned to 8 bytes) - user space provides 3 configuration attributes via BPF syscall: key_size, value_size, max_entries - map takes care of allocating/freeing key/value pairs - map_update_elem() must fail to insert new element when max_entries limit is reached to make sure that eBPF programs cannot exhaust memory - map_update_elem() replaces elements in an atomic way - optimized for speed of lookup() which can be called multiple times from eBPF program which itself is triggered by high volume of events . in the future JIT compiler may recognize lookup() call and optimize it further, since key_size is constant for life of eBPF program Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/hashtab.c | 362 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/hashtab.c (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3e9e1b77f29d..03a01fd609aa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -111,6 +111,7 @@ enum bpf_cmd { enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0daf7f6ae7df..2c0ec7f9da78 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 000000000000..d234a012f046 --- /dev/null +++ b/kernel/bpf/hashtab.c @@ -0,0 +1,362 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include + +struct bpf_htab { + struct bpf_map map; + struct hlist_head *buckets; + spinlock_t lock; + u32 count; /* number of elements in this hashtable */ + u32 n_buckets; /* number of hash buckets */ + u32 elem_size; /* size of each element in bytes */ +}; + +/* each htab element is struct htab_elem + key + value */ +struct htab_elem { + struct hlist_node hash_node; + struct rcu_head rcu; + u32 hash; + char key[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int err, i; + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + err = -EINVAL; + if (htab->map.max_entries == 0 || htab->map.key_size == 0 || + htab->map.value_size == 0) + goto free_htab; + + /* hash table size must be power of 2 */ + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + + err = -E2BIG; + if (htab->map.key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + goto free_htab; + + err = -ENOMEM; + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + GFP_USER | __GFP_NOWARN); + + if (!htab->buckets) { + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + if (!htab->buckets) + goto free_htab; + } + + for (i = 0; i < htab->n_buckets; i++) + INIT_HLIST_HEAD(&htab->buckets[i]); + + spin_lock_init(&htab->lock); + htab->count = 0; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + return &htab->map; + +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, + void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* Called from syscall or from eBPF program */ +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 hash, key_size; + + /* Must be called with rcu_read_lock. */ + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) + return l->key + round_up(map->key_size, 8); + + return NULL; +} + +/* Called from syscall */ +static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l, *next_l; + u32 hash, key_size; + int i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + /* lookup the key */ + l = lookup_elem_raw(head, hash, key, key_size); + + if (!l) { + i = 0; + goto find_first_elem; + } + + /* key was found, get next key in the same bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + + if (next_l) { + /* if next elem in this hash list is non-zero, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* itereated over all buckets and all elements */ + return -ENOENT; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old; + struct hlist_head *head; + unsigned long flags; + u32 key_size; + int ret; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* allocate new element outside of lock */ + l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + if (!l_new) + return -ENOMEM; + + key_size = map->key_size; + + memcpy(l_new->key, key, key_size); + memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + + l_new->hash = htab_map_hash(l_new->key, key_size); + + /* bpf_map_update_elem() can be called in_irq() */ + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, l_new->hash); + + l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + + if (!l_old && unlikely(htab->count >= map->max_entries)) { + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + ret = -E2BIG; + goto err; + } + + if (l_old && map_flags == BPF_NOEXIST) { + /* elem already exists */ + ret = -EEXIST; + goto err; + } + + if (!l_old && map_flags == BPF_EXIST) { + /* elem doesn't exist, cannot update it */ + ret = -ENOENT; + goto err; + } + + /* add new element to the head of the list, so that concurrent + * search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + hlist_del_rcu(&l_old->hash_node); + kfree_rcu(l_old, rcu); + } else { + htab->count++; + } + spin_unlock_irqrestore(&htab->lock, flags); + + return 0; +err: + spin_unlock_irqrestore(&htab->lock, flags); + kfree(l_new); + return ret; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree_rcu(l, rcu); + ret = 0; + } + + spin_unlock_irqrestore(&htab->lock, flags); + return ret; +} + +static void delete_all_elements(struct bpf_htab *htab) +{ + int i; + + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree(l); + } + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + /* some of kfree_rcu() callbacks for elements of this map may not have + * executed. It's ok. Proceed to free residual elements and map itself + */ + delete_all_elements(htab); + kvfree(htab->buckets); + kfree(htab); +} + +static struct bpf_map_ops htab_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_map_lookup_elem, + .map_update_elem = htab_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &htab_ops, + .type = BPF_MAP_TYPE_HASH, +}; + +static int __init register_htab_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_htab_map); -- cgit v1.2.3 From 28fbcfa08d8ed7c5a50d41a0433aad222835e8e3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:46 -0800 Subject: bpf: add array type of eBPF maps add new map type BPF_MAP_TYPE_ARRAY and its implementation - optimized for fastest possible lookup() . in the future verifier/JIT may recognize lookup() with constant key and optimize it into constant pointer. Can optimize non-constant key into direct pointer arithmetic as well, since pointers and value_size are constant for the life of the eBPF program. In other words array_map_lookup_elem() may be 'inlined' by verifier/JIT while preserving concurrent access to this map from user space - two main use cases for array type: . 'global' eBPF variables: array of 1 element with key=0 and value is a collection of 'global' variables which programs can use to keep the state between events . aggregation of tracing events into fixed set of buckets - all array elements pre-allocated and zero initialized at init time - key as an index in array and can only be 4 byte - map_delete_elem() returns EINVAL, since elements cannot be deleted - map_update_elem() replaces elements in an non-atomic way (for atomic updates hashtable type should be used instead) Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 2 +- kernel/bpf/arraymap.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/arraymap.c (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 03a01fd609aa..0d662fe75df5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -112,6 +112,7 @@ enum bpf_cmd { enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 2c0ec7f9da78..72ec98ba2d42 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c new file mode 100644 index 000000000000..58b80c137afd --- /dev/null +++ b/kernel/bpf/arraymap.c @@ -0,0 +1,151 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include + +struct bpf_array { + struct bpf_map map; + u32 elem_size; + char value[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array *array; + u32 elem_size; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + elem_size = round_up(attr->value_size, 8); + + /* allocate all map elements and zero-initialize them */ + array = kzalloc(sizeof(*array) + attr->max_entries * elem_size, + GFP_USER | __GFP_NOWARN); + if (!array) { + array = vzalloc(array->map.max_entries * array->elem_size); + if (!array) + return ERR_PTR(-ENOMEM); + } + + /* copy mandatory map attributes */ + array->map.key_size = attr->key_size; + array->map.value_size = attr->value_size; + array->map.max_entries = attr->max_entries; + + array->elem_size = elem_size; + + return &array->map; + +} + +/* Called from syscall or from eBPF program */ +static void *array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return NULL; + + return array->value + array->elem_size * index; +} + +/* Called from syscall */ +static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + if (index >= array->map.max_entries) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (map_flags == BPF_NOEXIST) + /* all elemenets already exist */ + return -EEXIST; + + memcpy(array->value + array->elem_size * index, value, array->elem_size); + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding programs to complete + * and free the array + */ + synchronize_rcu(); + + kvfree(array); +} + +static struct bpf_map_ops array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list tl = { + .ops = &array_ops, + .type = BPF_MAP_TYPE_ARRAY, +}; + +static int __init register_array_map(void) +{ + bpf_register_map_type(&tl); + return 0; +} +late_initcall(register_array_map); -- cgit v1.2.3 From d0003ec01c667b731c139e23de3306a8b328ccf5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Nov 2014 17:36:49 -0800 Subject: bpf: allow eBPF programs to use maps expose bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() map accessors to eBPF programs Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 3 ++ kernel/bpf/Makefile | 2 +- kernel/bpf/helpers.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/helpers.c (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 51e9242e4803..75e94eaa228b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -133,4 +133,9 @@ struct bpf_prog *bpf_prog_get(u32 ufd); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +/* verifier prototypes for helper functions called from eBPF programs */ +extern struct bpf_func_proto bpf_map_lookup_elem_proto; +extern struct bpf_func_proto bpf_map_update_elem_proto; +extern struct bpf_func_proto bpf_map_delete_elem_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0d662fe75df5..4a3d0f84f178 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -158,6 +158,9 @@ union bpf_attr { */ enum bpf_func_id { BPF_FUNC_unspec, + BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ + BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ + BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 72ec98ba2d42..a5ae60f0b0a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,5 @@ obj-y := core.o -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o ifdef CONFIG_TEST_BPF obj-$(CONFIG_BPF_SYSCALL) += test_stub.o endif diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 000000000000..9e3414d85459 --- /dev/null +++ b/kernel/bpf/helpers.c @@ -0,0 +1,89 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include + +/* If kernel subsystem is allowing eBPF programs to call this function, + * inside its own verifier_ops->get_func_proto() callback it should return + * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock + * if program is allowed to access maps, so check rcu_read_lock_held in + * all three functions. + */ +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* verifier checked that R1 contains a valid pointer to bpf_map + * and R2 points to a program stack and map->key_size bytes were + * initialized + */ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + value = map->ops->map_lookup_elem(map, key); + + /* lookup() returns either pointer to element value or NULL + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type + */ + return (unsigned long) value; +} + +struct bpf_func_proto bpf_map_lookup_elem_proto = { + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value = (void *) (unsigned long) r3; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_update_elem(map, key, value, r4); +} + +struct bpf_func_proto bpf_map_update_elem_proto = { + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, +}; + +static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_delete_elem(map, key); +} + +struct bpf_func_proto bpf_map_delete_elem_proto = { + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; -- cgit v1.2.3 From d67ee213fa5700c7da526fe5bcccd485cfa63d8b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 28 Oct 2014 20:13:31 -0400 Subject: dm: add presuspend_undo hook to target_type The DM thin-pool target now must undo the changes performed during pool_presuspend() so introduce presuspend_undo hook in target_type. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-table.c | 36 +++++++++++++++++++++++++++++------- drivers/md/dm.c | 10 ++++++++-- drivers/md/dm.h | 1 + include/linux/device-mapper.h | 2 ++ include/uapi/linux/dm-ioctl.h | 4 ++-- 5 files changed, 42 insertions(+), 11 deletions(-) (limited to 'include/uapi') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index b2bd1ebf4562..3afae9e062f8 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1521,18 +1521,32 @@ fmode_t dm_table_get_mode(struct dm_table *t) } EXPORT_SYMBOL(dm_table_get_mode); -static void suspend_targets(struct dm_table *t, unsigned postsuspend) +enum suspend_mode { + PRESUSPEND, + PRESUSPEND_UNDO, + POSTSUSPEND, +}; + +static void suspend_targets(struct dm_table *t, enum suspend_mode mode) { int i = t->num_targets; struct dm_target *ti = t->targets; while (i--) { - if (postsuspend) { + switch (mode) { + case PRESUSPEND: + if (ti->type->presuspend) + ti->type->presuspend(ti); + break; + case PRESUSPEND_UNDO: + if (ti->type->presuspend_undo) + ti->type->presuspend_undo(ti); + break; + case POSTSUSPEND: if (ti->type->postsuspend) ti->type->postsuspend(ti); - } else if (ti->type->presuspend) - ti->type->presuspend(ti); - + break; + } ti++; } } @@ -1542,7 +1556,15 @@ void dm_table_presuspend_targets(struct dm_table *t) if (!t) return; - suspend_targets(t, 0); + suspend_targets(t, PRESUSPEND); +} + +void dm_table_presuspend_undo_targets(struct dm_table *t) +{ + if (!t) + return; + + suspend_targets(t, PRESUSPEND_UNDO); } void dm_table_postsuspend_targets(struct dm_table *t) @@ -1550,7 +1572,7 @@ void dm_table_postsuspend_targets(struct dm_table *t) if (!t) return; - suspend_targets(t, 1); + suspend_targets(t, POSTSUSPEND); } int dm_table_resume_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f8cdd97c28a7..f84de3215982 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2756,7 +2756,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - /* This does not get reverted if there's an error later. */ + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ dm_table_presuspend_targets(map); /* @@ -2767,8 +2770,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) */ if (!noflush && do_lockfs) { r = lock_fs(md); - if (r) + if (r) { + dm_table_presuspend_undo_targets(map); goto out_unlock; + } } /* @@ -2816,6 +2821,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) start_queue(md->queue); unlock_fs(md); + dm_table_presuspend_undo_targets(map); goto out_unlock; /* pushback list is already flushed, so skip flush */ } diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 988c7fb7b145..781994093bf5 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -65,6 +65,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits); struct list_head *dm_table_get_devices(struct dm_table *t); void dm_table_presuspend_targets(struct dm_table *t); +void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e1707de043ae..ca6d2acc5eb7 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -64,6 +64,7 @@ typedef int (*dm_request_endio_fn) (struct dm_target *ti, union map_info *map_context); typedef void (*dm_presuspend_fn) (struct dm_target *ti); +typedef void (*dm_presuspend_undo_fn) (struct dm_target *ti); typedef void (*dm_postsuspend_fn) (struct dm_target *ti); typedef int (*dm_preresume_fn) (struct dm_target *ti); typedef void (*dm_resume_fn) (struct dm_target *ti); @@ -145,6 +146,7 @@ struct target_type { dm_endio_fn end_io; dm_request_endio_fn rq_end_io; dm_presuspend_fn presuspend; + dm_presuspend_undo_fn presuspend_undo; dm_postsuspend_fn postsuspend; dm_preresume_fn preresume; dm_resume_fn resume; diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 3315ab21f728..2be66f4be2f9 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 28 +#define DM_VERSION_MINOR 29 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2014-09-17)" +#define DM_VERSION_EXTRA "-ioctl (2014-10-28)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From ffcc39364160663cda1a3c358f4537302a92459b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 28 Oct 2014 18:34:52 -0400 Subject: dm: enhance internal suspend and resume interface Rename dm_internal_{suspend,resume} to dm_internal_{suspend,resume}_fast -- dm-stats will continue using these methods to avoid all the extra suspend/resume logic that is not needed in order to quickly flush IO. Introduce dm_internal_suspend_noflush() variant that actually calls the mapped_device's target callbacks -- otherwise target-specific hooks are avoided (e.g. dm-thin's thin_presuspend and thin_postsuspend). Common code between dm_internal_{suspend_noflush,resume} and dm_{suspend,resume} was factored out as __dm_{suspend,resume}. Update dm_internal_{suspend_noflush,resume} to always take and release the mapped_device's suspend_lock. Also update dm_{suspend,resume} to be aware of potential for DM_INTERNAL_SUSPEND_FLAG to be set and respond accordingly by interruptibly waiting for the DM_INTERNAL_SUSPEND_FLAG to be cleared. Add lockdep annotation to dm_suspend() and dm_resume(). The existing DM_SUSPEND_FLAG remains unchanged. DM_INTERNAL_SUSPEND_FLAG is set by dm_internal_suspend_noflush() and cleared by dm_internal_resume(). Both DM_SUSPEND_FLAG and DM_INTERNAL_SUSPEND_FLAG may be set if a device was already suspended when dm_internal_suspend_noflush() was called -- this can be thought of as a "nested suspend". A "nested suspend" can occur with legacy userspace dm-thin code that might suspend all active thin volumes before suspending the pool for resize. But otherwise, in the normal dm-thin-pool suspend case moving forward: the thin-pool will have DM_SUSPEND_FLAG set and all active thins from that thin-pool will have DM_INTERNAL_SUSPEND_FLAG set. Also add DM_INTERNAL_SUSPEND_FLAG to status report. This new DM_INTERNAL_SUSPEND_FLAG state is being reported to assist with debugging (e.g. 'dmsetup info' will report an internally suspended device accordingly). Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-ioctl.c | 5 +- drivers/md/dm-stats.c | 2 +- drivers/md/dm.c | 229 +++++++++++++++++++++++++++++++----------- drivers/md/dm.h | 9 ++ include/uapi/linux/dm-ioctl.h | 5 + 5 files changed, 192 insertions(+), 58 deletions(-) (limited to 'include/uapi') diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 0be9381365d7..73f791bb9ea4 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -684,11 +684,14 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) int srcu_idx; param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | - DM_ACTIVE_PRESENT_FLAG); + DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG); if (dm_suspended_md(md)) param->flags |= DM_SUSPEND_FLAG; + if (dm_suspended_internally_md(md)) + param->flags |= DM_INTERNAL_SUSPEND_FLAG; + if (dm_test_deferred_remove_flag(md)) param->flags |= DM_DEFERRED_REMOVE; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 28a90122a5a8..42a057aa3811 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -824,7 +824,7 @@ static int message_stats_create(struct mapped_device *md, return 1; id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, - dm_internal_suspend, dm_internal_resume, md); + dm_internal_suspend_fast, dm_internal_resume_fast, md); if (id < 0) return id; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f84de3215982..a0ece87ad426 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_MERGE_IS_OPTIONAL 6 #define DMF_DEFERRED_REMOVE 7 +#define DMF_SUSPENDED_INTERNALLY 8 /* * A dummy definition to make RCU happy. @@ -2718,36 +2720,18 @@ static void unlock_fs(struct mapped_device *md) } /* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. - * - * 1. Flush all I/Os by lock_fs() if needed. - * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. Wait for all in-flight I/Os to be completed or requeued. + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. * - * To abort suspend, start the request_queue. + * Caller must hold md->suspend_lock */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned suspend_flags, int interruptible) { - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = rcu_dereference(md->map); + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2772,7 +2756,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) r = lock_fs(md); if (r) { dm_table_presuspend_undo_targets(map); - goto out_unlock; + return r; } } @@ -2806,7 +2790,7 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + r = dm_wait_for_completion(md, interruptible); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); @@ -2822,14 +2806,55 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags) unlock_fs(md); dm_table_presuspend_undo_targets(map); - goto out_unlock; /* pushback list is already flushed, so skip flush */ + /* pushback list is already flushed, so skip flush */ } - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ + return r; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. + */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; + +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference(md->map); + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); + if (r) + goto out_unlock; set_bit(DMF_SUSPENDED, &md->flags); @@ -2840,35 +2865,57 @@ out_unlock: return r; } +static int __dm_resume(struct mapped_device *md, struct dm_table *map) +{ + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } + + dm_queue_flush(md); + + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. + */ + if (dm_request_based(md)) + start_queue(md->queue); + + unlock_fs(md); + + return 0; +} + int dm_resume(struct mapped_device *md) { int r = -EINVAL; struct dm_table *map = NULL; - mutex_lock(&md->suspend_lock); +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + if (!dm_suspended_md(md)) goto out; + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + map = rcu_dereference(md->map); if (!map || !dm_table_get_size(map)) goto out; - r = dm_table_resume_targets(map); + r = __dm_resume(md, map); if (r) goto out; - dm_queue_flush(md); - - /* - * Flushing deferred I/Os must be done after targets are resumed - * so that mapping of targets can work correctly. - * Request-based dm is queueing the deferred I/Os in its request_queue. - */ - if (dm_request_based(md)) - start_queue(md->queue); - - unlock_fs(md); - clear_bit(DMF_SUSPENDED, &md->flags); r = 0; @@ -2882,15 +2929,80 @@ out: * Internal suspend/resume works like userspace-driven suspend. It waits * until all bios finish and prevents issuing new bios to the target drivers. * It may be used only from the kernel. - * - * Internal suspend holds md->suspend_lock, which prevents interaction with - * userspace-driven suspend. */ -void dm_internal_suspend(struct mapped_device *md) +static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) { - mutex_lock(&md->suspend_lock); + struct dm_table *map = NULL; + + if (dm_suspended_internally_md(md)) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference(md->map); + + /* + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. + */ + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); + + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + + dm_table_postsuspend_targets(map); +} + +static void __dm_internal_resume(struct mapped_device *md) +{ + if (!dm_suspended_internally_md(md)) + return; /* resume from nested internal suspend */ + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); + +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. + */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) return; set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); @@ -2899,9 +3011,9 @@ void dm_internal_suspend(struct mapped_device *md) dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); } -void dm_internal_resume(struct mapped_device *md) +void dm_internal_resume_fast(struct mapped_device *md) { - if (dm_suspended_md(md)) + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) goto done; dm_queue_flush(md); @@ -2987,6 +3099,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + int dm_test_deferred_remove_flag(struct mapped_device *md) { return test_bit(DMF_DEFERRED_REMOVE, &md->flags); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 781994093bf5..84b0f9e4ba6c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -129,6 +129,15 @@ int dm_deleting_md(struct mapped_device *md); */ int dm_suspended_md(struct mapped_device *md); +/* + * Internal suspend and resume methods. + */ +int dm_suspended_internally_md(struct mapped_device *md); +void dm_internal_suspend_fast(struct mapped_device *md); +void dm_internal_resume_fast(struct mapped_device *md); +void dm_internal_suspend_noflush(struct mapped_device *md); +void dm_internal_resume(struct mapped_device *md); + /* * Test if the device is scheduled for deferred remove. */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 2be66f4be2f9..a570d7b5796c 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -352,4 +352,9 @@ enum { */ #define DM_DEFERRED_REMOVE (1 << 17) /* In/Out */ +/* + * If set, the device is suspended internally. + */ +#define DM_INTERNAL_SUSPEND_FLAG (1 << 18) /* Out */ + #endif /* _LINUX_DM_IOCTL_H */ -- cgit v1.2.3 From 78632a17eaa7a5abdc22aac8ca5932d6cad59984 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Sun, 9 Nov 2014 18:50:14 +0200 Subject: cfg/mac80211: define TDLS channel switch feature bit Define some related TDLS protocol constants and advertise channel switch support in the extended-capabilities IE when the feature bit is defined. Actually supporting TDLS channel-switching also requires support for some new nl80211 commands, to be introduced by future patches. Signed-off-by: Arik Nemtsov Signed-off-by: Arik Nemtsov Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 3 +++ net/mac80211/tdls.c | 9 ++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index adac1be67387..fbb02d240658 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2019,6 +2019,11 @@ enum ieee80211_tdls_actioncode { */ #define WLAN_EXT_CAPA1_EXT_CHANNEL_SWITCHING BIT(2) +/* TDLS capabilities in the the 4th byte of @WLAN_EID_EXT_CAPABILITY */ +#define WLAN_EXT_CAPA4_TDLS_BUFFER_STA BIT(4) +#define WLAN_EXT_CAPA4_TDLS_PEER_PSM BIT(5) +#define WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH BIT(6) + /* Interworking capabilities are set in 7th bit of 4th byte of the * @WLAN_EID_EXT_CAPABILITY information element */ @@ -2030,6 +2035,7 @@ enum ieee80211_tdls_actioncode { */ #define WLAN_EXT_CAPA5_TDLS_ENABLED BIT(5) #define WLAN_EXT_CAPA5_TDLS_PROHIBITED BIT(6) +#define WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED BIT(7) #define WLAN_EXT_CAPA8_OPMODE_NOTIF BIT(6) #define WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED BIT(7) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 442369f69b4f..ccdeef28d672 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4095,6 +4095,8 @@ enum nl80211_ap_sme_features { * @NL80211_FEATURE_MAC_ON_CREATE: Device supports configuring * the vif's MAC address upon creation. * See 'macaddr' field in the vif_params (cfg80211.h). + * @NL80211_FEATURE_TDLS_CHANNEL_SWITCH: Driver supports channel switching when + * operating as a TDLS peer. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4125,6 +4127,7 @@ enum nl80211_feature_flags { NL80211_FEATURE_DYNAMIC_SMPS = 1 << 25, NL80211_FEATURE_SUPPORTS_WMM_ADMISSION = 1 << 26, NL80211_FEATURE_MAC_ON_CREATE = 1 << 27, + NL80211_FEATURE_TDLS_CHANNEL_SWITCH = 1 << 28, }; /** diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 30a4c1004010..4554bdc72c91 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -35,16 +35,19 @@ void ieee80211_tdls_peer_del_work(struct work_struct *wk) mutex_unlock(&local->mtx); } -static void ieee80211_tdls_add_ext_capab(struct sk_buff *skb) +static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local, + struct sk_buff *skb) { u8 *pos = (void *)skb_put(skb, 7); + bool chan_switch = local->hw.wiphy->features & + NL80211_FEATURE_TDLS_CHANNEL_SWITCH; *pos++ = WLAN_EID_EXT_CAPABILITY; *pos++ = 5; /* len */ *pos++ = 0x0; *pos++ = 0x0; *pos++ = 0x0; - *pos++ = 0x0; + *pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0; *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; } @@ -289,7 +292,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, offset = noffset; } - ieee80211_tdls_add_ext_capab(skb); + ieee80211_tdls_add_ext_capab(local, skb); /* add the QoS element if we support it */ if (local->hw.queues >= IEEE80211_NUM_ACS && -- cgit v1.2.3 From 1057d35ede5dbf7ed7842357564fb42c9b54ba50 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 19 Nov 2014 12:54:26 +0200 Subject: cfg80211: introduce TDLS channel switch commands Introduce commands to initiate and cancel TDLS channel-switching. Once TDLS channel-switching is started, the lower level driver is responsible for continually initiating channel-switch operations and returning to the base (AP) channel to listen for beacons from time to time. Upon cancellation of the channel-switch all communication between the relevant TDLS peers will continue on the base channel. Signed-off-by: Arik Nemtsov Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++ include/uapi/linux/nl80211.h | 19 ++++++++ net/wireless/core.c | 4 ++ net/wireless/nl80211.c | 108 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/rdev-ops.h | 24 ++++++++++ net/wireless/trace.h | 42 +++++++++++++++++ 6 files changed, 211 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 220d5f5f1aca..8d04dfef32bf 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2367,6 +2367,12 @@ struct cfg80211_qos_map { * (invoked with the wireless_dev mutex held) * @leave_ocb: leave the current OCB network * (invoked with the wireless_dev mutex held) + * + * @tdls_channel_switch: Start channel-switching with a TDLS peer. The driver + * is responsible for continually initiating channel-switching operations + * and returning to the base channel for communication with the AP. + * @tdls_cancel_channel_switch: Stop channel-switching with a TDLS peer. Both + * peers must be on the base channel when the call completes. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -2622,6 +2628,14 @@ struct cfg80211_ops { u16 admitted_time); int (*del_tx_ts)(struct wiphy *wiphy, struct net_device *dev, u8 tsid, const u8 *peer); + + int (*tdls_channel_switch)(struct wiphy *wiphy, + struct net_device *dev, + const u8 *addr, u8 oper_class, + struct cfg80211_chan_def *chandef); + void (*tdls_cancel_channel_switch)(struct wiphy *wiphy, + struct net_device *dev, + const u8 *addr); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ccdeef28d672..365db67ca71d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -762,6 +762,18 @@ * @NL80211_CMD_LEAVE_OCB: Leave the OCB network -- no special arguments, the * network is determined by the network interface. * + * @NL80211_CMD_TDLS_CHANNEL_SWITCH: Start channel-switching with a TDLS peer, + * identified by the %NL80211_ATTR_MAC parameter. A target channel is + * provided via %NL80211_ATTR_WIPHY_FREQ and other attributes determining + * channel width/type. The target operating class is given via + * %NL80211_ATTR_OPER_CLASS. + * The driver is responsible for continually initiating channel-switching + * operations and returning to the base channel for communication with the + * AP. + * @NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH: Stop channel-switching with a TDLS + * peer given by %NL80211_ATTR_MAC. Both peers must be on the base channel + * when this command completes. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -943,6 +955,9 @@ enum nl80211_commands { NL80211_CMD_CH_SWITCH_STARTED_NOTIFY, + NL80211_CMD_TDLS_CHANNEL_SWITCH, + NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -1669,6 +1684,8 @@ enum nl80211_commands { * @NL80211_ATTR_SMPS_MODE: SMPS mode to use (ap mode). see * &enum nl80211_smps_mode. * + * @NL80211_ATTR_OPER_CLASS: operating class + * * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -2021,6 +2038,8 @@ enum nl80211_attrs { NL80211_ATTR_SMPS_MODE, + NL80211_ATTR_OPER_CLASS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index a4d27927aba2..4c2e501203d1 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -541,6 +541,10 @@ int wiphy_register(struct wiphy *wiphy) !wiphy->wowlan->tcp)) return -EINVAL; #endif + if (WARN_ON((wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH) && + (!rdev->ops->tdls_channel_switch || + !rdev->ops->tdls_cancel_channel_switch))) + return -EINVAL; if (WARN_ON(wiphy->coalesce && (!wiphy->coalesce->n_rules || diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d0a8361b3395..27666f5e5050 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9658,6 +9658,98 @@ static int nl80211_del_tx_ts(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl80211_tdls_channel_switch(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_chan_def chandef = {}; + const u8 *addr; + u8 oper_class; + int err; + + if (!rdev->ops->tdls_channel_switch || + !(rdev->wiphy.features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH)) + return -EOPNOTSUPP; + + switch (dev->ieee80211_ptr->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + break; + default: + return -EOPNOTSUPP; + } + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_OPER_CLASS]) + return -EINVAL; + + err = nl80211_parse_chandef(rdev, info, &chandef); + if (err) + return err; + + /* + * Don't allow wide channels on the 2.4Ghz band, as per IEEE802.11-2012 + * section 10.22.6.2.1. Disallow 5/10Mhz channels as well for now, the + * specification is not defined for them. + */ + if (chandef.chan->band == IEEE80211_BAND_2GHZ && + chandef.width != NL80211_CHAN_WIDTH_20_NOHT && + chandef.width != NL80211_CHAN_WIDTH_20) + return -EINVAL; + + /* we will be active on the TDLS link */ + if (!cfg80211_reg_can_beacon(&rdev->wiphy, &chandef, wdev->iftype)) + return -EINVAL; + + /* don't allow switching to DFS channels */ + if (cfg80211_chandef_dfs_required(wdev->wiphy, &chandef, wdev->iftype)) + return -EINVAL; + + addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + oper_class = nla_get_u8(info->attrs[NL80211_ATTR_OPER_CLASS]); + + wdev_lock(wdev); + err = rdev_tdls_channel_switch(rdev, dev, addr, oper_class, &chandef); + wdev_unlock(wdev); + + return err; +} + +static int nl80211_tdls_cancel_channel_switch(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + const u8 *addr; + + if (!rdev->ops->tdls_channel_switch || + !rdev->ops->tdls_cancel_channel_switch || + !(rdev->wiphy.features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH)) + return -EOPNOTSUPP; + + switch (dev->ieee80211_ptr->iftype) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + break; + default: + return -EOPNOTSUPP; + } + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + + wdev_lock(wdev); + rdev_tdls_cancel_channel_switch(rdev, dev, addr); + wdev_unlock(wdev); + + return 0; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -10456,6 +10548,22 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_TDLS_CHANNEL_SWITCH, + .doit = nl80211_tdls_channel_switch, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH, + .doit = nl80211_tdls_cancel_channel_switch, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, }; /* notification functions */ diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 1b3864cd50ca..35cfb7134bdb 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -993,4 +993,28 @@ rdev_del_tx_ts(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *addr, + u8 oper_class, struct cfg80211_chan_def *chandef) +{ + int ret; + + trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, + chandef); + ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr, + oper_class, chandef); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + +static inline void +rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, + struct net_device *dev, const u8 *addr) +{ + trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); + rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); + trace_rdev_return_void(&rdev->wiphy); +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 6e25370d3ce7..ad38910f7036 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2032,6 +2032,48 @@ TRACE_EVENT(rdev_del_tx_ts, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer), __entry->tsid) ); +TRACE_EVENT(rdev_tdls_channel_switch, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *addr, u8 oper_class, + struct cfg80211_chan_def *chandef), + TP_ARGS(wiphy, netdev, addr, oper_class, chandef), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(addr) + __field(u8, oper_class) + CHAN_DEF_ENTRY + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(addr, addr); + CHAN_DEF_ASSIGN(chandef); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT + " oper class %d, " CHAN_DEF_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr), + __entry->oper_class, CHAN_DEF_PR_ARG) +); + +TRACE_EVENT(rdev_tdls_cancel_channel_switch, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + const u8 *addr), + TP_ARGS(wiphy, netdev, addr), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + MAC_ENTRY(addr) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + MAC_ASSIGN(addr, addr); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(addr)) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ -- cgit v1.2.3 From 8cd4d4563ef0a518002c4a8f47dd950afe386ea8 Mon Sep 17 00:00:00 2001 From: Luciano Coelho Date: Wed, 17 Sep 2014 11:55:28 +0300 Subject: cfg80211: add wowlan net-detect support Add a new WoWLAN API to enable net-detect as a wake up trigger. Net-detect allows the device to scan in the background while the host is asleep to wake up the host system when a matching network is found. Reuse the scheduled scan attributes to specify how the scan is performed while suspended and the matches that will trigger a wake event. Signed-off-by: Luciano Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 41 ++++++++++++++++ include/uapi/linux/nl80211.h | 23 +++++++++ net/wireless/core.h | 1 + net/wireless/nl80211.c | 111 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 175 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 8d04dfef32bf..05aae22e92a5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1940,6 +1940,7 @@ struct cfg80211_wowlan_tcp { * @rfkill_release: wake up when rfkill is released * @tcp: TCP connection establishment/wakeup parameters, see nl80211.h. * NULL if not configured. + * @nd_config: configuration for the scan to be used for net detect wake. */ struct cfg80211_wowlan { bool any, disconnect, magic_pkt, gtk_rekey_failure, @@ -1948,6 +1949,7 @@ struct cfg80211_wowlan { struct cfg80211_pkt_pattern *patterns; struct cfg80211_wowlan_tcp *tcp; int n_patterns; + struct cfg80211_sched_scan_request *nd_config; }; /** @@ -1979,6 +1981,35 @@ struct cfg80211_coalesce { int n_rules; }; +/** + * struct cfg80211_wowlan_nd_match - information about the match + * + * @ssid: SSID of the match that triggered the wake up + * @n_channels: Number of channels where the match occurred. This + * value may be zero if the driver can't report the channels. + * @channels: center frequencies of the channels where a match + * occurred (in MHz) + */ +struct cfg80211_wowlan_nd_match { + struct cfg80211_ssid ssid; + int n_channels; + u32 channels[]; +}; + +/** + * struct cfg80211_wowlan_nd_info - net detect wake up information + * + * @n_matches: Number of match information instances provided in + * @matches. This value may be zero if the driver can't provide + * match information. + * @matches: Array of pointers to matches containing information about + * the matches that triggered the wake up. + */ +struct cfg80211_wowlan_nd_info { + int n_matches; + struct cfg80211_wowlan_nd_match *matches[]; +}; + /** * struct cfg80211_wowlan_wakeup - wakeup report * @disconnect: woke up by getting disconnected @@ -1998,6 +2029,7 @@ struct cfg80211_coalesce { * @tcp_match: TCP wakeup packet received * @tcp_connlost: TCP connection lost or failed to establish * @tcp_nomoretokens: TCP data ran out of tokens + * @net_detect: if not %NULL, woke up because of net detect */ struct cfg80211_wowlan_wakeup { bool disconnect, magic_pkt, gtk_rekey_failure, @@ -2007,6 +2039,7 @@ struct cfg80211_wowlan_wakeup { s32 pattern_idx; u32 packet_present_len, packet_len; const void *packet; + struct cfg80211_wowlan_nd_info *net_detect; }; /** @@ -2810,6 +2843,7 @@ struct ieee80211_txrx_stypes { * @WIPHY_WOWLAN_EAP_IDENTITY_REQ: supports wakeup on EAP identity request * @WIPHY_WOWLAN_4WAY_HANDSHAKE: supports wakeup on 4-way handshake failure * @WIPHY_WOWLAN_RFKILL_RELEASE: supports wakeup on RF-kill release + * @WIPHY_WOWLAN_NET_DETECT: supports wakeup on network detection */ enum wiphy_wowlan_support_flags { WIPHY_WOWLAN_ANY = BIT(0), @@ -2820,6 +2854,7 @@ enum wiphy_wowlan_support_flags { WIPHY_WOWLAN_EAP_IDENTITY_REQ = BIT(5), WIPHY_WOWLAN_4WAY_HANDSHAKE = BIT(6), WIPHY_WOWLAN_RFKILL_RELEASE = BIT(7), + WIPHY_WOWLAN_NET_DETECT = BIT(8), }; struct wiphy_wowlan_tcp_support { @@ -2838,6 +2873,11 @@ struct wiphy_wowlan_tcp_support { * @pattern_max_len: maximum length of each pattern * @pattern_min_len: minimum length of each pattern * @max_pkt_offset: maximum Rx packet offset + * @max_nd_match_sets: maximum number of matchsets for net-detect, + * similar, but not necessarily identical, to max_match_sets for + * scheduled scans. + * See &struct cfg80211_sched_scan_request.@match_sets for more + * details. * @tcp: TCP wakeup support information */ struct wiphy_wowlan_support { @@ -2846,6 +2886,7 @@ struct wiphy_wowlan_support { int pattern_max_len; int pattern_min_len; int max_pkt_offset; + int max_nd_match_sets; const struct wiphy_wowlan_tcp_support *tcp; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 365db67ca71d..d23208194e3c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1686,6 +1686,7 @@ enum nl80211_commands { * * @NL80211_ATTR_OPER_CLASS: operating class * + * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use */ @@ -2043,6 +2044,7 @@ enum nl80211_attrs { /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, + NUM_NL80211_ATTR = __NL80211_ATTR_AFTER_LAST, NL80211_ATTR_MAX = __NL80211_ATTR_AFTER_LAST - 1 }; @@ -3610,6 +3612,25 @@ struct nl80211_pattern_support { * @NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS: For wakeup reporting only, * the TCP connection ran out of tokens to use for data to send to the * service + * @NL80211_WOWLAN_TRIG_NET_DETECT: wake up when a configured network + * is detected. This is a nested attribute that contains the + * same attributes used with @NL80211_CMD_START_SCHED_SCAN. It + * specifies how the scan is performed (e.g. the interval and the + * channels to scan) as well as the scan results that will + * trigger a wake (i.e. the matchsets). + * @NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS: nested attribute + * containing an array with information about what triggered the + * wake up. If no elements are present in the array, it means + * that the information is not available. If more than one + * element is present, it means that more than one match + * occurred. + * Each element in the array is a nested attribute that contains + * one optional %NL80211_ATTR_SSID attribute and one optional + * %NL80211_ATTR_SCAN_FREQUENCIES attribute. At least one of + * these attributes must be present. If + * %NL80211_ATTR_SCAN_FREQUENCIES contains more than one + * frequency, it means that the match occurred in more than one + * channel. * @NUM_NL80211_WOWLAN_TRIG: number of wake on wireless triggers * @MAX_NL80211_WOWLAN_TRIG: highest wowlan trigger attribute number * @@ -3635,6 +3656,8 @@ enum nl80211_wowlan_triggers { NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH, NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST, NL80211_WOWLAN_TRIG_WAKEUP_TCP_NOMORETOKENS, + NL80211_WOWLAN_TRIG_NET_DETECT, + NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS, /* keep last */ NUM_NL80211_WOWLAN_TRIG, diff --git a/net/wireless/core.h b/net/wireless/core.h index 61ee664cf2bd..faa5b1609aae 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -111,6 +111,7 @@ cfg80211_rdev_free_wowlan(struct cfg80211_registered_device *rdev) rdev->wiphy.wowlan_config->tcp->sock) sock_release(rdev->wiphy.wowlan_config->tcp->sock); kfree(rdev->wiphy.wowlan_config->tcp); + kfree(rdev->wiphy.wowlan_config->nd_config); kfree(rdev->wiphy.wowlan_config); #endif } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03a302b884fd..3ec7dc557960 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -209,7 +209,7 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info) } /* policy for the attributes */ -static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, .len = 20-1 }, @@ -428,6 +428,7 @@ nl80211_wowlan_policy[NUM_NL80211_WOWLAN_TRIG] = { [NL80211_WOWLAN_TRIG_4WAY_HANDSHAKE] = { .type = NLA_FLAG }, [NL80211_WOWLAN_TRIG_RFKILL_RELEASE] = { .type = NLA_FLAG }, [NL80211_WOWLAN_TRIG_TCP_CONNECTION] = { .type = NLA_NESTED }, + [NL80211_WOWLAN_TRIG_NET_DETECT] = { .type = NLA_NESTED }, }; static const struct nla_policy @@ -1088,6 +1089,8 @@ static int nl80211_send_wowlan(struct sk_buff *msg, if (large && nl80211_send_wowlan_tcp_caps(rdev, msg)) return -ENOBUFS; + /* TODO: send wowlan net detect */ + nla_nest_end(msg, nl_wowlan); return 0; @@ -8695,6 +8698,39 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, return 0; } +static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, + const struct wiphy_wowlan_support *wowlan, + struct nlattr *attr, + struct cfg80211_wowlan *trig) +{ + struct nlattr **tb; + int err; + + tb = kzalloc(NUM_NL80211_ATTR * sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + + if (!(wowlan->flags & WIPHY_WOWLAN_NET_DETECT)) { + err = -EOPNOTSUPP; + goto out; + } + + err = nla_parse(tb, NL80211_ATTR_MAX, + nla_data(attr), nla_len(attr), + nl80211_policy); + if (err) + goto out; + + trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, tb); + err = PTR_ERR_OR_ZERO(trig->nd_config); + if (err) + trig->nd_config = NULL; + +out: + kfree(tb); + return err; +} + static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -8840,6 +8876,14 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) goto error; } + if (tb[NL80211_WOWLAN_TRIG_NET_DETECT]) { + err = nl80211_parse_wowlan_nd( + rdev, wowlan, tb[NL80211_WOWLAN_TRIG_NET_DETECT], + &new_triggers); + if (err) + goto error; + } + ntrig = kmemdup(&new_triggers, sizeof(new_triggers), GFP_KERNEL); if (!ntrig) { err = -ENOMEM; @@ -12082,6 +12126,67 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, EXPORT_SYMBOL(cfg80211_report_obss_beacon); #ifdef CONFIG_PM +static int cfg80211_net_detect_results(struct sk_buff *msg, + struct cfg80211_wowlan_wakeup *wakeup) +{ + struct cfg80211_wowlan_nd_info *nd = wakeup->net_detect; + struct nlattr *nl_results, *nl_match, *nl_freqs; + int i, j; + + nl_results = nla_nest_start( + msg, NL80211_WOWLAN_TRIG_NET_DETECT_RESULTS); + if (!nl_results) + return -EMSGSIZE; + + for (i = 0; i < nd->n_matches; i++) { + struct cfg80211_wowlan_nd_match *match = nd->matches[i]; + + nl_match = nla_nest_start(msg, i); + if (!nl_match) + break; + + /* The SSID attribute is optional in nl80211, but for + * simplicity reasons it's always present in the + * cfg80211 structure. If a driver can't pass the + * SSID, that needs to be changed. A zero length SSID + * is still a valid SSID (wildcard), so it cannot be + * used for this purpose. + */ + if (nla_put(msg, NL80211_ATTR_SSID, match->ssid.ssid_len, + match->ssid.ssid)) { + nla_nest_cancel(msg, nl_match); + goto out; + } + + if (match->n_channels) { + nl_freqs = nla_nest_start( + msg, NL80211_ATTR_SCAN_FREQUENCIES); + if (!nl_freqs) { + nla_nest_cancel(msg, nl_match); + goto out; + } + + for (j = 0; j < match->n_channels; j++) { + if (nla_put_u32(msg, + NL80211_ATTR_WIPHY_FREQ, + match->channels[j])) { + nla_nest_cancel(msg, nl_freqs); + nla_nest_cancel(msg, nl_match); + goto out; + } + } + + nla_nest_end(msg, nl_freqs); + } + + nla_nest_end(msg, nl_match); + } + +out: + nla_nest_end(msg, nl_results); + return 0; +} + void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, struct cfg80211_wowlan_wakeup *wakeup, gfp_t gfp) @@ -12176,6 +12281,10 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, goto free_msg; } + if (wakeup->net_detect && + cfg80211_net_detect_results(msg, wakeup)) + goto free_msg; + nla_nest_end(msg, reasons); } -- cgit v1.2.3 From ad2b26abc157460ca6fac1a53a2bfeade283adfa Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Jun 2014 21:39:05 +0200 Subject: cfg80211: allow drivers to support random MAC addresses for scan Add the necessary feature flags and a scan flag to support using random MAC addresses for scan while unassociated. The configuration for this supports an arbitrary MAC address value and mask, so that any kind of configuration (e.g. fixed OUI or full 46-bit random) can be requested. Full 46-bit random is the default when no other configuration is passed. Also add a small helper function to use the addr/mask correctly. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 25 +++++++++++++ include/uapi/linux/nl80211.h | 29 +++++++++++++++ net/wireless/nl80211.c | 86 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 137 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 05aae22e92a5..bb748c4da5af 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1437,6 +1437,10 @@ struct cfg80211_ssid { * @aborted: (internal) scan request was notified as aborted * @notified: (internal) scan request was notified as done or aborted * @no_cck: used to send probe requests at non CCK rate in 2GHz band + * @mac_addr: MAC address used with randomisation + * @mac_addr_mask: MAC address mask used with randomisation, bits that + * are 0 in the mask should be randomised, bits that are 1 should + * be taken from the @mac_addr */ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; @@ -1451,6 +1455,9 @@ struct cfg80211_scan_request { struct wireless_dev *wdev; + u8 mac_addr[ETH_ALEN] __aligned(2); + u8 mac_addr_mask[ETH_ALEN] __aligned(2); + /* internal */ struct wiphy *wiphy; unsigned long scan_start; @@ -1461,6 +1468,17 @@ struct cfg80211_scan_request { struct ieee80211_channel *channels[0]; }; +static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) +{ + int i; + + get_random_bytes(buf, ETH_ALEN); + for (i = 0; i < ETH_ALEN; i++) { + buf[i] &= ~mask[i]; + buf[i] |= addr[i] & mask[i]; + } +} + /** * struct cfg80211_match_set - sets of attributes to match * @@ -1494,6 +1512,10 @@ struct cfg80211_match_set { * @channels: channels to scan * @min_rssi_thold: for drivers only supporting a single threshold, this * contains the minimum over all matchsets + * @mac_addr: MAC address used with randomisation + * @mac_addr_mask: MAC address mask used with randomisation, bits that + * are 0 in the mask should be randomised, bits that are 1 should + * be taken from the @mac_addr */ struct cfg80211_sched_scan_request { struct cfg80211_ssid *ssids; @@ -1508,6 +1530,9 @@ struct cfg80211_sched_scan_request { int n_match_sets; s32 min_rssi_thold; + u8 mac_addr[ETH_ALEN] __aligned(2); + u8 mac_addr_mask[ETH_ALEN] __aligned(2); + /* internal */ struct wiphy *wiphy; struct net_device *dev; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d23208194e3c..a99081efc2d4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1686,6 +1686,8 @@ enum nl80211_commands { * * @NL80211_ATTR_OPER_CLASS: operating class * + * @NL80211_ATTR_MAC_MASK: MAC address mask + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2041,6 +2043,8 @@ enum nl80211_attrs { NL80211_ATTR_OPER_CLASS, + NL80211_ATTR_MAC_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4139,6 +4143,18 @@ enum nl80211_ap_sme_features { * See 'macaddr' field in the vif_params (cfg80211.h). * @NL80211_FEATURE_TDLS_CHANNEL_SWITCH: Driver supports channel switching when * operating as a TDLS peer. + * @NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR: This device/driver supports using a + * random MAC address during scan (if the device is unassociated); the + * %NL80211_SCAN_FLAG_RANDOM_ADDR flag may be set for scans and the MAC + * address mask/value will be used. + * @NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR: This device/driver supports + * using a random MAC address for every scan iteration during scheduled + * scan (while not associated), the %NL80211_SCAN_FLAG_RANDOM_ADDR may + * be set for scheduled scan and the MAC address mask/value will be used. + * @NL80211_FEATURE_ND_RANDOM_MAC_ADDR: This device/driver supports using a + * random MAC address for every scan iteration during "net detect", i.e. + * scan in unassociated WoWLAN, the %NL80211_SCAN_FLAG_RANDOM_ADDR may + * be set for scheduled scan and the MAC address mask/value will be used. */ enum nl80211_feature_flags { NL80211_FEATURE_SK_TX_STATUS = 1 << 0, @@ -4170,6 +4186,9 @@ enum nl80211_feature_flags { NL80211_FEATURE_SUPPORTS_WMM_ADMISSION = 1 << 26, NL80211_FEATURE_MAC_ON_CREATE = 1 << 27, NL80211_FEATURE_TDLS_CHANNEL_SWITCH = 1 << 28, + NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR = 1 << 29, + NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR = 1 << 30, + NL80211_FEATURE_ND_RANDOM_MAC_ADDR = 1 << 31, }; /** @@ -4218,11 +4237,21 @@ enum nl80211_connect_failed_reason { * dangerous because will destroy stations performance as a lot of frames * will be lost while scanning off-channel, therefore it must be used only * when really needed + * @NL80211_SCAN_FLAG_RANDOM_ADDR: use a random MAC address for this scan (or + * for scheduled scan: a different one for every scan iteration). When the + * flag is set, depending on device capabilities the @NL80211_ATTR_MAC and + * @NL80211_ATTR_MAC_MASK attributes may also be given in which case only + * the masked bits will be preserved from the MAC address and the remainder + * randomised. If the attributes are not given full randomisation (46 bits, + * locally administered 1, multicast 0) is assumed. + * This flag must not be requested when the feature isn't supported, check + * the nl80211 feature flags for the device. */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, NL80211_SCAN_FLAG_FLUSH = 1<<1, NL80211_SCAN_FLAG_AP = 1<<2, + NL80211_SCAN_FLAG_RANDOM_ADDR = 1<<3, }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3ec7dc557960..dd5a827f9cb0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -395,6 +395,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_USER_PRIO] = { .type = NLA_U8 }, [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, + [NL80211_ATTR_MAC_MASK] = { .len = ETH_ALEN }, }; /* policy for the key attributes */ @@ -5481,6 +5482,43 @@ static int validate_scan_freqs(struct nlattr *freqs) return n_channels; } +static int nl80211_parse_random_mac(struct nlattr **attrs, + u8 *mac_addr, u8 *mac_addr_mask) +{ + int i; + + if (!attrs[NL80211_ATTR_MAC] && !attrs[NL80211_ATTR_MAC_MASK]) { + memset(mac_addr, 0, ETH_ALEN); + memset(mac_addr_mask, 0, ETH_ALEN); + mac_addr[0] = 0x2; + mac_addr_mask[0] = 0x3; + + return 0; + } + + /* need both or none */ + if (!attrs[NL80211_ATTR_MAC] || !attrs[NL80211_ATTR_MAC_MASK]) + return -EINVAL; + + memcpy(mac_addr, nla_data(attrs[NL80211_ATTR_MAC]), ETH_ALEN); + memcpy(mac_addr_mask, nla_data(attrs[NL80211_ATTR_MAC_MASK]), ETH_ALEN); + + /* don't allow or configure an mcast address */ + if (!is_multicast_ether_addr(mac_addr_mask) || + is_multicast_ether_addr(mac_addr)) + return -EINVAL; + + /* + * allow users to pass a MAC address that has bits set outside + * of the mask, but don't bother drivers with having to deal + * with such bits + */ + for (i = 0; i < ETH_ALEN; i++) + mac_addr[i] &= mac_addr_mask[i]; + + return 0; +} + static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -5658,6 +5696,25 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) err = -EOPNOTSUPP; goto out_free; } + + if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { + if (!(wiphy->features & + NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR)) { + err = -EOPNOTSUPP; + goto out_free; + } + + if (wdev->current_bss) { + err = -EOPNOTSUPP; + goto out_free; + } + + err = nl80211_parse_random_mac(info->attrs, + request->mac_addr, + request->mac_addr_mask); + if (err) + goto out_free; + } } request->no_cck = @@ -5685,7 +5742,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } static struct cfg80211_sched_scan_request * -nl80211_parse_sched_scan(struct wiphy *wiphy, +nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct nlattr **attrs) { struct cfg80211_sched_scan_request *request; @@ -5934,6 +5991,28 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, err = -EOPNOTSUPP; goto out_free; } + + if (request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) { + u32 flg = NL80211_FEATURE_SCHED_SCAN_RANDOM_MAC_ADDR; + + if (!wdev) /* must be net-detect */ + flg = NL80211_FEATURE_ND_RANDOM_MAC_ADDR; + + if (!(wiphy->features & flg)) { + err = -EOPNOTSUPP; + goto out_free; + } + + if (wdev && wdev->current_bss) { + err = -EOPNOTSUPP; + goto out_free; + } + + err = nl80211_parse_random_mac(attrs, request->mac_addr, + request->mac_addr_mask); + if (err) + goto out_free; + } } request->interval = interval; @@ -5951,6 +6030,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; int err; if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || @@ -5960,7 +6040,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (rdev->sched_scan_req) return -EINPROGRESS; - rdev->sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, + rdev->sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev, info->attrs); err = PTR_ERR_OR_ZERO(rdev->sched_scan_req); if (err) @@ -8721,7 +8801,7 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, if (err) goto out; - trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, tb); + trig->nd_config = nl80211_parse_sched_scan(&rdev->wiphy, NULL, tb); err = PTR_ERR_OR_ZERO(trig->nd_config); if (err) trig->nd_config = NULL; -- cgit v1.2.3 From 18e5ca65e55da4cacd9deb4e934eb5429bb4b79d Mon Sep 17 00:00:00 2001 From: Jukka Rissanen Date: Thu, 13 Nov 2014 17:25:14 +0200 Subject: nl80211: Replace interface socket owner attribute with more generic one Replace NL80211_ATTR_IFACE_SOCKET_OWNER attribute with more generic NL80211_ATTR_SOCKET_OWNER that can be used with other commands that interface creation. Signed-off-by: Jukka Rissanen Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 7 ++++--- net/wireless/nl80211.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a99081efc2d4..d77524510435 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1652,9 +1652,9 @@ enum nl80211_commands { * @NL80211_ATTR_TDLS_PEER_CAPABILITY: flags for TDLS peer capabilities, u32. * As specified in the &enum nl80211_tdls_peer_capability. * - * @NL80211_ATTR_IFACE_SOCKET_OWNER: flag attribute, if set during interface + * @NL80211_ATTR_SOCKET_OWNER: Flag attribute, if set during interface * creation then the new interface will be owned by the netlink socket - * that created it and will be destroyed when the socket is closed + * that created it and will be destroyed when the socket is closed. * * @NL80211_ATTR_TDLS_INITIATOR: flag attribute indicating the current end is * the TDLS link initiator. @@ -2024,7 +2024,7 @@ enum nl80211_attrs { NL80211_ATTR_TDLS_PEER_CAPABILITY, - NL80211_ATTR_IFACE_SOCKET_OWNER, + NL80211_ATTR_SOCKET_OWNER, NL80211_ATTR_CSA_C_OFFSETS_TX, NL80211_ATTR_MAX_CSA_COUNTERS, @@ -2055,6 +2055,7 @@ enum nl80211_attrs { /* source-level API compatibility */ #define NL80211_ATTR_SCAN_GENERATION NL80211_ATTR_GENERATION #define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG +#define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER /* * Allow user space programs to use #ifdef on new attributes by defining them diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5cfd75dfff67..c81491b1f737 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -388,7 +388,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MAC_HINT] = { .len = ETH_ALEN }, [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, - [NL80211_ATTR_IFACE_SOCKET_OWNER] = { .type = NLA_FLAG }, + [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, [NL80211_ATTR_CSA_C_OFFSETS_TX] = { .type = NLA_BINARY }, [NL80211_ATTR_USE_RRM] = { .type = NLA_FLAG }, [NL80211_ATTR_TSID] = { .type = NLA_U8 }, @@ -2653,7 +2653,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(wdev); } - if (info->attrs[NL80211_ATTR_IFACE_SOCKET_OWNER]) + if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) wdev->owner_nlportid = info->snd_portid; switch (type) { -- cgit v1.2.3 From 2ab4bd8ea3a6954bc79a9bbeb291cd6c2d6213a7 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Fri, 17 Oct 2014 11:05:50 -0500 Subject: dlm: adopt orphan locks A process may exit, leaving an orphan lock in the lockspace. This adds the capability for another process to acquire the orphan lock. Acquiring the orphan just moves the lock from the orphan list onto the acquiring process's list of locks. An adopting process must specify the resource name and mode of the lock it wants to adopt. If a matching lock is found, the lock is moved to the caller's 's list of locks, and the lkid of the lock is returned like the lkid of a new lock. If an orphan with a different mode is found, then -EAGAIN is returned. If no orphan lock is found on the resource, then -ENOENT is returned. No async completion is used because the result is immediately available. Also, when orphans are purged, allow a zero nodeid to refer to the local nodeid so the caller does not need to look up the local nodeid. Signed-off-by: David Teigland --- fs/dlm/lock.c | 76 +++++++++++++++++++++++++++++++++++++-- fs/dlm/lock.h | 3 ++ fs/dlm/user.c | 13 +++++-- include/uapi/linux/dlmconstants.h | 2 +- 4 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 83f3d5520307..35502d4046f5 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5886,6 +5886,78 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, return error; } +/* + * The caller asks for an orphan lock on a given resource with a given mode. + * If a matching lock exists, it's moved to the owner's list of locks and + * the lkid is returned. + */ + +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs, uint32_t *lkid) +{ + struct dlm_lkb *lkb; + struct dlm_user_args *ua; + int found_other_mode = 0; + int found = 0; + int rv = 0; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) { + if (lkb->lkb_resource->res_length != namelen) + continue; + if (memcmp(lkb->lkb_resource->res_name, name, namelen)) + continue; + if (lkb->lkb_grmode != mode) { + found_other_mode = 1; + continue; + } + + found = 1; + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags &= ~DLM_IFL_ORPHAN; + *lkid = lkb->lkb_id; + break; + } + mutex_unlock(&ls->ls_orphans_mutex); + + if (!found && found_other_mode) { + rv = -EAGAIN; + goto out; + } + + if (!found) { + rv = -ENOENT; + goto out; + } + + lkb->lkb_exflags = flags; + lkb->lkb_ownpid = (int) current->pid; + + ua = lkb->lkb_ua; + + ua->proc = ua_tmp->proc; + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + /* + * The lkb reference from the ls_orphans list was not + * removed above, and is now considered the reference + * for the proc locks list. + */ + + spin_lock(&ua->proc->locks_spin); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + kfree(ua_tmp); + return rv; +} + int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in) { @@ -6029,7 +6101,7 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) struct dlm_args args; int error; - hold_lkb(lkb); + hold_lkb(lkb); /* reference for the ls_orphans list */ mutex_lock(&ls->ls_orphans_mutex); list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); mutex_unlock(&ls->ls_orphans_mutex); @@ -6217,7 +6289,7 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, { int error = 0; - if (nodeid != dlm_our_nodeid()) { + if (nodeid && (nodeid != dlm_our_nodeid())) { error = send_purge(ls, nodeid, pid); } else { dlm_lock_recovery(ls); diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 5e0c72e36a9b..ed8ebd3a8593 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -49,6 +49,9 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs, uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 142e21655eed..fb85f32e9eca 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -238,6 +238,7 @@ static int device_user_lock(struct dlm_user_proc *proc, { struct dlm_ls *ls; struct dlm_user_args *ua; + uint32_t lkid; int error = -ENOMEM; ls = dlm_find_lockspace_local(proc->lockspace); @@ -260,12 +261,20 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->bastaddr = params->bastaddr; ua->xid = params->xid; - if (params->flags & DLM_LKF_CONVERT) + if (params->flags & DLM_LKF_CONVERT) { error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); - else { + } else if (params->flags & DLM_LKF_ORPHAN) { + error = dlm_user_adopt_orphan(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + (unsigned long) params->timeout, + &lkid); + if (!error) + error = lkid; + } else { error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, diff --git a/include/uapi/linux/dlmconstants.h b/include/uapi/linux/dlmconstants.h index 47bf08dc7566..2857bdc5b27b 100644 --- a/include/uapi/linux/dlmconstants.h +++ b/include/uapi/linux/dlmconstants.h @@ -114,7 +114,7 @@ * * DLM_LKF_ORPHAN * - * not yet implemented + * Acquire an orphan lock. * * DLM_LKF_ALTPR * -- cgit v1.2.3 From 3758b34193638f664177565f1692faa1bec7d9ed Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 19 Nov 2014 18:38:10 +0100 Subject: drm: s/enum_blob_list/enum_list/ in drm_property I guess for hysterical raisins this was meant to be the way to read blob properties. But that's done with the two-stage approach which uses separate blob kms object and the special-purpose get_blob ioctl. Shipping userspace seems to have never relied on this, and the kernel also never put any blob thing onto that property. And nowadays it would blow up, e.g. in drm_property_destroy. Also it makes no sense to return values in an ioctl that only returns metadata about everything. So let's ditch all the internal code for the blob list, rename the list to be unambiguous and sprinkle comments all over the place to explain this peculiar piece of api. v2: Squash in fixup from Rob to remove now unused variables. Cc: Rob Clark Signed-off-by: Daniel Vetter Reviewed-by: Rob Clark Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_crtc.c | 53 +++++++++++++++------------------------------ include/drm/drm_crtc.h | 2 +- include/uapi/drm/drm_mode.h | 2 ++ 3 files changed, 20 insertions(+), 37 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 8c550302a9ef..589a921d4313 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -3457,7 +3457,7 @@ struct drm_property *drm_property_create(struct drm_device *dev, int flags, property->flags = flags; property->num_values = num_values; - INIT_LIST_HEAD(&property->enum_blob_list); + INIT_LIST_HEAD(&property->enum_list); if (name) { strncpy(property->name, name, DRM_PROP_NAME_LEN); @@ -3679,8 +3679,8 @@ int drm_property_add_enum(struct drm_property *property, int index, (value > 63)) return -EINVAL; - if (!list_empty(&property->enum_blob_list)) { - list_for_each_entry(prop_enum, &property->enum_blob_list, head) { + if (!list_empty(&property->enum_list)) { + list_for_each_entry(prop_enum, &property->enum_list, head) { if (prop_enum->value == value) { strncpy(prop_enum->name, name, DRM_PROP_NAME_LEN); prop_enum->name[DRM_PROP_NAME_LEN-1] = '\0'; @@ -3698,7 +3698,7 @@ int drm_property_add_enum(struct drm_property *property, int index, prop_enum->value = value; property->values[index] = value; - list_add_tail(&prop_enum->head, &property->enum_blob_list); + list_add_tail(&prop_enum->head, &property->enum_list); return 0; } EXPORT_SYMBOL(drm_property_add_enum); @@ -3715,7 +3715,7 @@ void drm_property_destroy(struct drm_device *dev, struct drm_property *property) { struct drm_property_enum *prop_enum, *pt; - list_for_each_entry_safe(prop_enum, pt, &property->enum_blob_list, head) { + list_for_each_entry_safe(prop_enum, pt, &property->enum_list, head) { list_del(&prop_enum->head); kfree(prop_enum); } @@ -3839,16 +3839,12 @@ int drm_mode_getproperty_ioctl(struct drm_device *dev, struct drm_mode_get_property *out_resp = data; struct drm_property *property; int enum_count = 0; - int blob_count = 0; int value_count = 0; int ret = 0, i; int copied; struct drm_property_enum *prop_enum; struct drm_mode_property_enum __user *enum_ptr; - struct drm_property_blob *prop_blob; - uint32_t __user *blob_id_ptr; uint64_t __user *values_ptr; - uint32_t __user *blob_length_ptr; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EINVAL; @@ -3862,11 +3858,8 @@ int drm_mode_getproperty_ioctl(struct drm_device *dev, if (drm_property_type_is(property, DRM_MODE_PROP_ENUM) || drm_property_type_is(property, DRM_MODE_PROP_BITMASK)) { - list_for_each_entry(prop_enum, &property->enum_blob_list, head) + list_for_each_entry(prop_enum, &property->enum_list, head) enum_count++; - } else if (drm_property_type_is(property, DRM_MODE_PROP_BLOB)) { - list_for_each_entry(prop_blob, &property->enum_blob_list, head) - blob_count++; } value_count = property->num_values; @@ -3891,7 +3884,7 @@ int drm_mode_getproperty_ioctl(struct drm_device *dev, if ((out_resp->count_enum_blobs >= enum_count) && enum_count) { copied = 0; enum_ptr = (struct drm_mode_property_enum __user *)(unsigned long)out_resp->enum_blob_ptr; - list_for_each_entry(prop_enum, &property->enum_blob_list, head) { + list_for_each_entry(prop_enum, &property->enum_list, head) { if (copy_to_user(&enum_ptr[copied].value, &prop_enum->value, sizeof(uint64_t))) { ret = -EFAULT; @@ -3909,28 +3902,16 @@ int drm_mode_getproperty_ioctl(struct drm_device *dev, out_resp->count_enum_blobs = enum_count; } - if (drm_property_type_is(property, DRM_MODE_PROP_BLOB)) { - if ((out_resp->count_enum_blobs >= blob_count) && blob_count) { - copied = 0; - blob_id_ptr = (uint32_t __user *)(unsigned long)out_resp->enum_blob_ptr; - blob_length_ptr = (uint32_t __user *)(unsigned long)out_resp->values_ptr; - - list_for_each_entry(prop_blob, &property->enum_blob_list, head) { - if (put_user(prop_blob->base.id, blob_id_ptr + copied)) { - ret = -EFAULT; - goto done; - } - - if (put_user(prop_blob->length, blob_length_ptr + copied)) { - ret = -EFAULT; - goto done; - } - - copied++; - } - } - out_resp->count_enum_blobs = blob_count; - } + /* + * NOTE: The idea seems to have been to use this to read all the blob + * property values. But nothing ever added them to the corresponding + * list, userspace always used the special-purpose get_blob ioctl to + * read the value for a blob property. It also doesn't make a lot of + * sense to return values here when everything else is just metadata for + * the property itself. + */ + if (drm_property_type_is(property, DRM_MODE_PROP_BLOB)) + out_resp->count_enum_blobs = 0; done: drm_modeset_unlock_all(dev); return ret; diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index f3142c64e3d5..b459e8fbbc25 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -216,7 +216,7 @@ struct drm_property { uint64_t *values; struct drm_device *dev; - struct list_head enum_blob_list; + struct list_head enum_list; }; struct drm_crtc; diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index a0db2d4aa5f0..86574b0005ff 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -286,6 +286,8 @@ struct drm_mode_get_property { char name[DRM_PROP_NAME_LEN]; __u32 count_values; + /* This is only used to count enum values, not blobs. The _blobs is + * simply because of a historical reason, i.e. backwards compat. */ __u32 count_enum_blobs; }; -- cgit v1.2.3 From 2fc9f6baa24ac230166df41ed636224969523341 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 13 Oct 2014 12:42:12 +0800 Subject: Btrfs: return failure if btrfs_dev_replace_finishing() failed device replace could fail due to another running scrub process or any other errors btrfs_scrub_dev() may hit, but this failure doesn't get returned to userspace. The following steps could reproduce this issue mkfs -t btrfs -f /dev/sdb1 /dev/sdb2 mount /dev/sdb1 /mnt/btrfs while true; do btrfs scrub start -B /mnt/btrfs >/dev/null 2>&1; done & btrfs replace start -Bf /dev/sdb2 /dev/sdb3 /mnt/btrfs # if this replace succeeded, do the following and repeat until # you see this log in dmesg # BTRFS: btrfs_scrub_dev(/dev/sdb2, 2, /dev/sdb3) failed -115 #btrfs replace start -Bf /dev/sdb3 /dev/sdb2 /mnt/btrfs # once you see the error log in dmesg, check return value of # replace echo $? Introduce a new dev replace result BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS to catch -EINPROGRESS explicitly and return other errors directly to userspace. Signed-off-by: Eryu Guan Signed-off-by: Chris Mason --- fs/btrfs/dev-replace.c | 12 +++++++++--- include/uapi/linux/btrfs.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6f662b34ba0e..971c061eb1ce 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -422,9 +422,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); - WARN_ON(ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } - return 0; + return ret; leave: dev_replace->srcdev = NULL; @@ -542,7 +548,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; + return scrub_ret; } printk_in_rcu(KERN_INFO diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 2f47824e7a36..611e1c5893b4 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -157,6 +157,7 @@ struct btrfs_ioctl_dev_replace_status_params { #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 +#define BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS 3 struct btrfs_ioctl_dev_replace_args { __u64 cmd; /* in */ __u64 result; /* out */ -- cgit v1.2.3 From d42472ecffd7c42086c6e5b1335c99a3adf58a09 Mon Sep 17 00:00:00 2001 From: Jussi Laako Date: Fri, 21 Nov 2014 16:04:46 +0200 Subject: ALSA: pcm: Add big-endian DSD sample formats and fix XMOS DSD sample format This patch fixes XMOS DSD sample format to DSD_U32_BE and also adds DSD_U16_BE and DSD_U32_BE sample formats. Signed-off-by: Jussi Laako Acked-by: Jurgen Kramer Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 2 ++ include/uapi/sound/asound.h | 4 +++- sound/core/pcm.c | 2 ++ sound/core/pcm_misc.c | 8 ++++++++ sound/usb/quirks.c | 4 ++-- 5 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index e862497f7556..8bb00a27e219 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -184,6 +184,8 @@ struct snd_pcm_ops { #define SNDRV_PCM_FMTBIT_DSD_U8 _SNDRV_PCM_FMTBIT(DSD_U8) #define SNDRV_PCM_FMTBIT_DSD_U16_LE _SNDRV_PCM_FMTBIT(DSD_U16_LE) #define SNDRV_PCM_FMTBIT_DSD_U32_LE _SNDRV_PCM_FMTBIT(DSD_U32_LE) +#define SNDRV_PCM_FMTBIT_DSD_U16_BE _SNDRV_PCM_FMTBIT(DSD_U16_BE) +#define SNDRV_PCM_FMTBIT_DSD_U32_BE _SNDRV_PCM_FMTBIT(DSD_U32_BE) #ifdef SNDRV_LITTLE_ENDIAN #define SNDRV_PCM_FMTBIT_S16 SNDRV_PCM_FMTBIT_S16_LE diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 6ee586728df9..941d32f007dc 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -220,7 +220,9 @@ typedef int __bitwise snd_pcm_format_t; #define SNDRV_PCM_FORMAT_DSD_U8 ((__force snd_pcm_format_t) 48) /* DSD, 1-byte samples DSD (x8) */ #define SNDRV_PCM_FORMAT_DSD_U16_LE ((__force snd_pcm_format_t) 49) /* DSD, 2-byte samples DSD (x16), little endian */ #define SNDRV_PCM_FORMAT_DSD_U32_LE ((__force snd_pcm_format_t) 50) /* DSD, 4-byte samples DSD (x32), little endian */ -#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_LE +#define SNDRV_PCM_FORMAT_DSD_U16_BE ((__force snd_pcm_format_t) 51) /* DSD, 2-byte samples DSD (x16), big endian */ +#define SNDRV_PCM_FORMAT_DSD_U32_BE ((__force snd_pcm_format_t) 52) /* DSD, 4-byte samples DSD (x32), big endian */ +#define SNDRV_PCM_FORMAT_LAST SNDRV_PCM_FORMAT_DSD_U32_BE #ifdef SNDRV_LITTLE_ENDIAN #define SNDRV_PCM_FORMAT_S16 SNDRV_PCM_FORMAT_S16_LE diff --git a/sound/core/pcm.c b/sound/core/pcm.c index 42ded997b223..c6ff94ab1ad6 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -216,6 +216,8 @@ static char *snd_pcm_format_names[] = { FORMAT(DSD_U8), FORMAT(DSD_U16_LE), FORMAT(DSD_U32_LE), + FORMAT(DSD_U16_BE), + FORMAT(DSD_U32_BE), }; const char *snd_pcm_format_name(snd_pcm_format_t format) diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c index ae7a0feb3b76..ebe8444de6c6 100644 --- a/sound/core/pcm_misc.c +++ b/sound/core/pcm_misc.c @@ -152,6 +152,14 @@ static struct pcm_format_data pcm_formats[(INT)SNDRV_PCM_FORMAT_LAST+1] = { .width = 32, .phys = 32, .le = 1, .signd = 0, .silence = { 0x69, 0x69, 0x69, 0x69 }, }, + [SNDRV_PCM_FORMAT_DSD_U16_BE] = { + .width = 16, .phys = 16, .le = 0, .signd = 0, + .silence = { 0x69, 0x69 }, + }, + [SNDRV_PCM_FORMAT_DSD_U32_BE] = { + .width = 32, .phys = 32, .le = 0, .signd = 0, + .silence = { 0x69, 0x69, 0x69, 0x69 }, + }, /* FIXME: the following three formats are not defined properly yet */ [SNDRV_PCM_FORMAT_MPEG] = { .le = -1, .signd = -1, diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a5941f80fc5b..60dfe0d28771 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1193,12 +1193,12 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, /* iFi Audio micro/nano iDSD */ case USB_ID(0x20b1, 0x3008): if (fp->altsetting == 2) - return SNDRV_PCM_FMTBIT_DSD_U32_LE; + return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; /* DIYINHK DSD DXD 384kHz USB to I2S/DSD */ case USB_ID(0x20b1, 0x2009): if (fp->altsetting == 3) - return SNDRV_PCM_FMTBIT_DSD_U32_LE; + return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; default: break; -- cgit v1.2.3 From 6b397158d07f885154b871a15f879d25b3de7579 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 20 Nov 2014 14:43:18 +0100 Subject: kvm: remove IA64 ioctls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM ia64 is no longer present so new applications shouldn't use them. The main problem is that they most likely didn't work even before, because of a conflict in the #defines: #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) #define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) The argument to KVM_SET_GUEST_DEBUG is: struct kvm_guest_debug { __u32 control; __u32 pad; struct kvm_guest_debug_arch arch; }; struct kvm_guest_debug_arch { }; meaning that sizeof(struct kvm_guest_debug) == sizeof(void *) == 8 and KVM_SET_GUEST_DEBUG == KVM_IA64_VCPU_SET_STACK. KVM_SET_GUEST_DEBUG is handled in virt/kvm/kvm_main.c before even calling kvm_arch_vcpu_ioctl (which would have handled KVM_IA64_VCPU_SET_STACK), so KVM_IA64_VCPU_SET_STACK would just return -EINVAL. Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- include/uapi/linux/kvm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6d59e5b39c9c..a37fd1224f36 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1099,9 +1099,6 @@ struct kvm_s390_ucas_mapping { #define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) #define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) #define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) -/* IA64 stack access */ -#define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) -#define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) /* Available with KVM_CAP_VCPU_EVENTS */ #define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) #define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) -- cgit v1.2.3 From c7e2b9689ef81362a8091592da6cb6a7723f377a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 19 Nov 2014 14:05:03 +0100 Subject: sched: introduce vlan action This tc action allows to work with vlan tagged skbs. Two supported sub-actions are header pop and header push. Signed-off-by: Jiri Pirko Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_vlan.h | 27 +++++ include/uapi/linux/tc_act/tc_vlan.h | 35 ++++++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/act_vlan.c | 207 ++++++++++++++++++++++++++++++++++++ 5 files changed, 281 insertions(+) create mode 100644 include/net/tc_act/tc_vlan.h create mode 100644 include/uapi/linux/tc_act/tc_vlan.h create mode 100644 net/sched/act_vlan.c (limited to 'include/uapi') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h new file mode 100644 index 000000000000..c809c1d2cea5 --- /dev/null +++ b/include/net/tc_act/tc_vlan.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2014 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __NET_TC_VLAN_H +#define __NET_TC_VLAN_H + +#include + +#define VLAN_F_POP 0x1 +#define VLAN_F_PUSH 0x2 + +struct tcf_vlan { + struct tcf_common common; + int tcfv_action; + __be16 tcfv_push_vid; + __be16 tcfv_push_proto; +}; +#define to_vlan(a) \ + container_of(a->priv, struct tcf_vlan, common) + +#endif /* __NET_TC_VLAN_H */ diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h new file mode 100644 index 000000000000..f7b8d448b960 --- /dev/null +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2014 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#ifndef __LINUX_TC_VLAN_H +#define __LINUX_TC_VLAN_H + +#include + +#define TCA_ACT_VLAN 12 + +#define TCA_VLAN_ACT_POP 1 +#define TCA_VLAN_ACT_PUSH 2 + +struct tc_vlan { + tc_gen; + int v_action; +}; + +enum { + TCA_VLAN_UNSPEC, + TCA_VLAN_TM, + TCA_VLAN_PARMS, + TCA_VLAN_PUSH_VLAN_ID, + TCA_VLAN_PUSH_VLAN_PROTOCOL, + __TCA_VLAN_MAX, +}; +#define TCA_VLAN_MAX (__TCA_VLAN_MAX - 1) + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a1a8e29e5fc9..88618f8b794c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -686,6 +686,17 @@ config NET_ACT_CSUM To compile this code as a module, choose M here: the module will be called act_csum. +config NET_ACT_VLAN + tristate "Vlan manipulation" + depends on NET_CLS_ACT + ---help--- + Say Y here to push or pop vlan headers. + + If unsure, say N. + + To compile this code as a module, choose M here: the + module will be called act_vlan. + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index 0a869a11f3e6..679f24ae7f93 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o +obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c new file mode 100644 index 000000000000..d735ecf0b1a7 --- /dev/null +++ b/net/sched/act_vlan.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2014 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define VLAN_TAB_MASK 15 + +static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_vlan *v = a->priv; + int action; + int err; + + spin_lock(&v->tcf_lock); + v->tcf_tm.lastuse = jiffies; + bstats_update(&v->tcf_bstats, skb); + action = v->tcf_action; + + switch (v->tcfv_action) { + case TCA_VLAN_ACT_POP: + err = skb_vlan_pop(skb); + if (err) + goto drop; + break; + case TCA_VLAN_ACT_PUSH: + err = skb_vlan_push(skb, v->tcfv_push_proto, v->tcfv_push_vid); + if (err) + goto drop; + break; + default: + BUG(); + } + + goto unlock; + +drop: + action = TC_ACT_SHOT; + v->tcf_qstats.drops++; +unlock: + spin_unlock(&v->tcf_lock); + return action; +} + +static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { + [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, + [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, + [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, +}; + +static int tcf_vlan_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct nlattr *tb[TCA_VLAN_MAX + 1]; + struct tc_vlan *parm; + struct tcf_vlan *v; + int action; + __be16 push_vid = 0; + __be16 push_proto = 0; + int ret = 0; + int err; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy); + if (err < 0) + return err; + + if (!tb[TCA_VLAN_PARMS]) + return -EINVAL; + parm = nla_data(tb[TCA_VLAN_PARMS]); + switch (parm->v_action) { + case TCA_VLAN_ACT_POP: + break; + case TCA_VLAN_ACT_PUSH: + if (!tb[TCA_VLAN_PUSH_VLAN_ID]) + return -EINVAL; + push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); + if (push_vid >= VLAN_VID_MASK) + return -ERANGE; + + if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) { + push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]); + switch (push_proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + break; + default: + return -EPROTONOSUPPORT; + } + } else { + push_proto = htons(ETH_P_8021Q); + } + break; + default: + return -EINVAL; + } + action = parm->v_action; + + if (!tcf_hash_check(parm->index, a, bind)) { + ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind); + if (ret) + return ret; + + ret = ACT_P_CREATED; + } else { + if (bind) + return 0; + tcf_hash_release(a, bind); + if (!ovr) + return -EEXIST; + } + + v = to_vlan(a); + + spin_lock_bh(&v->tcf_lock); + + v->tcfv_action = action; + v->tcfv_push_vid = push_vid; + v->tcfv_push_proto = push_proto; + + v->tcf_action = parm->action; + + spin_unlock_bh(&v->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(a); + return ret; +} + +static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_vlan *v = a->priv; + struct tc_vlan opt = { + .index = v->tcf_index, + .refcnt = v->tcf_refcnt - ref, + .bindcnt = v->tcf_bindcnt - bind, + .action = v->tcf_action, + .v_action = v->tcfv_action, + }; + struct tcf_t t; + + if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (v->tcfv_action == TCA_VLAN_ACT_PUSH && + (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, v->tcfv_push_vid) || + nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, v->tcfv_push_proto))) + goto nla_put_failure; + + t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(v->tcf_tm.expires); + if (nla_put(skb, TCA_VLAN_TM, sizeof(t), &t)) + goto nla_put_failure; + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_vlan_ops = { + .kind = "vlan", + .type = TCA_ACT_VLAN, + .owner = THIS_MODULE, + .act = tcf_vlan, + .dump = tcf_vlan_dump, + .init = tcf_vlan_init, +}; + +static int __init vlan_init_module(void) +{ + return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK); +} + +static void __exit vlan_cleanup_module(void) +{ + tcf_unregister_action(&act_vlan_ops); +} + +module_init(vlan_init_module); +module_exit(vlan_cleanup_module); + +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("vlan manipulation actions"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 0655f6a8635b1b66f2434d5556b1044c14b1ccaf Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:07 +0100 Subject: tipc: add bearer disable/enable to new netlink api A new netlink API for tipc that can disable or enable a tipc bearer. The new API is separated from the old API because of a bug in the user space client (tipc-config). The problem is that older versions of tipc-config has a very low receive limit and adding commands to the legacy genl_opts struct causes the ctrl_getfamily() response message to grow, subsequently breaking the tool. The new API utilizes netlink policies for input validation. Where the top-level netlink attributes are tipc-logical entities, like bearer. The top level entities then contain nested attributes. In this case a name, nested link properties and a domain. Netlink commands implemented in this patch: TIPC_NL_BEARER_ENABLE TIPC_NL_BEARER_DISABLE Netlink logical layout of bearer enable message: -> bearer -> name [ -> domain ] [ -> properties -> priority ] Netlink logical layout of bearer disable message: -> bearer -> name Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 83 ++++++++++++++++++++++++++++++++ net/tipc/bearer.c | 99 ++++++++++++++++++++++++++++++++++++++- net/tipc/bearer.h | 7 ++- net/tipc/core.h | 1 + net/tipc/link.c | 47 +++++++++++++++++++ net/tipc/link.h | 3 ++ net/tipc/netlink.c | 42 ++++++++++++++++- net/tipc/netlink.h | 41 ++++++++++++++++ 8 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/tipc_netlink.h create mode 100644 net/tipc/netlink.h (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h new file mode 100644 index 000000000000..b9b710faa229 --- /dev/null +++ b/include/uapi/linux/tipc_netlink.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2014, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _LINUX_TIPC_NETLINK_H_ +#define _LINUX_TIPC_NETLINK_H_ + +#define TIPC_GENL_V2_NAME "TIPCv2" +#define TIPC_GENL_V2_VERSION 0x1 + +/* Netlink commands */ +enum { + TIPC_NL_UNSPEC, + TIPC_NL_LEGACY, + TIPC_NL_BEARER_DISABLE, + TIPC_NL_BEARER_ENABLE, + + __TIPC_NL_CMD_MAX, + TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 +}; + +/* Top level netlink attributes */ +enum { + TIPC_NLA_UNSPEC, + TIPC_NLA_BEARER, /* nest */ + + __TIPC_NLA_MAX, + TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 +}; + +/* Bearer info */ +enum { + TIPC_NLA_BEARER_UNSPEC, + TIPC_NLA_BEARER_NAME, /* string */ + TIPC_NLA_BEARER_PROP, /* nest */ + TIPC_NLA_BEARER_DOMAIN, /* u32 */ + + __TIPC_NLA_BEARER_MAX, + TIPC_NLA_BEARER_MAX = __TIPC_NLA_BEARER_MAX - 1 +}; + +/* Nest, link propreties. Valid for link, media and bearer */ +enum { + TIPC_NLA_PROP_UNSPEC, + + TIPC_NLA_PROP_PRIO, /* u32 */ + TIPC_NLA_PROP_TOL, /* u32 */ + TIPC_NLA_PROP_WIN, /* u32 */ + + __TIPC_NLA_PROP_MAX, + TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 +}; + +#endif diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 264474394f9f..59815be0ab98 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1,7 +1,7 @@ /* * net/tipc/bearer.c: TIPC bearer code * - * Copyright (c) 1996-2006, 2013, Ericsson AB + * Copyright (c) 1996-2006, 2013-2014, Ericsson AB * Copyright (c) 2004-2006, 2010-2013, Wind River Systems * All rights reserved. * @@ -37,6 +37,7 @@ #include "core.h" #include "config.h" #include "bearer.h" +#include "link.h" #include "discover.h" #define MAX_ADDR_STR 60 @@ -49,6 +50,17 @@ static struct tipc_media * const media_info_array[] = { NULL }; +static const struct nla_policy +tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { + [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_BEARER_NAME] = { + .type = NLA_STRING, + .len = TIPC_MAX_BEARER_NAME + }, + [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } +}; + struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; static void bearer_disable(struct tipc_bearer *b_ptr, bool shutting_down); @@ -627,3 +639,88 @@ void tipc_bearer_stop(void) } } } + +int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct tipc_bearer *bearer; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + + name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + bearer = tipc_bearer_find(name); + if (!bearer) { + rtnl_unlock(); + return -EINVAL; + } + + bearer_disable(bearer, false); + rtnl_unlock(); + + return 0; +} + +int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *bearer; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + u32 domain; + u32 prio; + + prio = TIPC_MEDIA_LINK_PRI; + domain = tipc_own_addr & TIPC_CLUSTER_MASK; + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + + bearer = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + if (attrs[TIPC_NLA_BEARER_DOMAIN]) + domain = nla_get_u32(attrs[TIPC_NLA_BEARER_DOMAIN]); + + if (attrs[TIPC_NLA_BEARER_PROP]) { + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP], + props); + if (err) + return err; + + if (props[TIPC_NLA_PROP_PRIO]) + prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); + } + + rtnl_lock(); + err = tipc_enable_bearer(bearer, domain, prio); + if (err) { + rtnl_unlock(); + return err; + } + rtnl_unlock(); + + return 0; +} diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 78fccc49de23..a87e8c78d862 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -1,7 +1,7 @@ /* * net/tipc/bearer.h: Include file for TIPC bearer code * - * Copyright (c) 1996-2006, 2013, Ericsson AB + * Copyright (c) 1996-2006, 2013-2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -38,6 +38,8 @@ #define _TIPC_BEARER_H #include "bcast.h" +#include "netlink.h" +#include #define MAX_BEARERS 2 #define MAX_MEDIA 2 @@ -176,6 +178,9 @@ extern struct tipc_media eth_media_info; extern struct tipc_media ib_media_info; #endif +int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); + int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); diff --git a/net/tipc/core.h b/net/tipc/core.h index f773b148722f..b578b10feefa 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -41,6 +41,7 @@ #include #include +#include #include #include #include diff --git a/net/tipc/link.c b/net/tipc/link.c index 7cf8004577f1..e7f365012bd1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -40,6 +40,7 @@ #include "name_distr.h" #include "discover.h" #include "config.h" +#include "netlink.h" #include @@ -50,6 +51,14 @@ static const char *link_co_err = "Link changeover error, "; static const char *link_rst_msg = "Resetting link "; static const char *link_unk_evt = "Unknown link event "; +/* Properties valid for media, bearar and link */ +static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { + [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 }, + [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } +}; + /* * Out-of-range value for link session numbers */ @@ -2376,3 +2385,41 @@ static void link_print(struct tipc_link *l_ptr, const char *str) else pr_cont("\n"); } + +/* Parse and validate nested (link) properties valid for media, bearer and link + */ +int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) +{ + int err; + + err = nla_parse_nested(props, TIPC_NLA_PROP_MAX, prop, + tipc_nl_prop_policy); + if (err) + return err; + + if (props[TIPC_NLA_PROP_PRIO]) { + u32 prio; + + prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); + if (prio > TIPC_MAX_LINK_PRI) + return -EINVAL; + } + + if (props[TIPC_NLA_PROP_TOL]) { + u32 tol; + + tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL)) + return -EINVAL; + } + + if (props[TIPC_NLA_PROP_WIN]) { + u32 win; + + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + return -EINVAL; + } + + return 0; +} diff --git a/net/tipc/link.h b/net/tipc/link.h index b567a3427fda..4338294f20d4 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -37,6 +37,7 @@ #ifndef _TIPC_LINK_H #define _TIPC_LINK_H +#include #include "msg.h" #include "node.h" @@ -239,6 +240,8 @@ void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *start, u32 retransmits); +int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); + /* * Link sequence number manipulation routines (uses modulo 2**16 arithmetic) */ diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index ad844d365340..af506ae0a129 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -1,7 +1,7 @@ /* * net/tipc/netlink.c: TIPC configuration handling * - * Copyright (c) 2005-2006, Ericsson AB + * Copyright (c) 2005-2006, 2014, Ericsson AB * Copyright (c) 2005-2007, Wind River Systems * All rights reserved. * @@ -36,6 +36,7 @@ #include "core.h" #include "config.h" +#include "bearer.h" #include static int handle_cmd(struct sk_buff *skb, struct genl_info *info) @@ -68,6 +69,12 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } +static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { + [TIPC_NLA_UNSPEC] = { .type = NLA_UNSPEC, }, + [TIPC_NLA_BEARER] = { .type = NLA_NESTED, }, +}; + +/* Legacy ASCII API */ static struct genl_family tipc_genl_family = { .id = GENL_ID_GENERATE, .name = TIPC_GENL_NAME, @@ -76,6 +83,7 @@ static struct genl_family tipc_genl_family = { .maxattr = 0, }; +/* Legacy ASCII API */ static struct genl_ops tipc_genl_ops[] = { { .cmd = TIPC_GENL_CMD, @@ -83,11 +91,42 @@ static struct genl_ops tipc_genl_ops[] = { }, }; +/* Users of the legacy API (tipc-config) can't handle that we add operations, + * so we have a separate genl handling for the new API. + */ +struct genl_family tipc_genl_v2_family = { + .id = GENL_ID_GENERATE, + .name = TIPC_GENL_V2_NAME, + .version = TIPC_GENL_V2_VERSION, + .hdrsize = 0, + .maxattr = TIPC_NLA_MAX, +}; + +static const struct genl_ops tipc_genl_v2_ops[] = { + { + .cmd = TIPC_NL_BEARER_DISABLE, + .doit = tipc_nl_bearer_disable, + .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_BEARER_ENABLE, + .doit = tipc_nl_bearer_enable, + .policy = tipc_nl_policy, + } +}; + int tipc_netlink_start(void) { int res; res = genl_register_family_with_ops(&tipc_genl_family, tipc_genl_ops); + if (res) { + pr_err("Failed to register legacy interface\n"); + return res; + } + + res = genl_register_family_with_ops(&tipc_genl_v2_family, + tipc_genl_v2_ops); if (res) { pr_err("Failed to register netlink interface\n"); return res; @@ -98,4 +137,5 @@ int tipc_netlink_start(void) void tipc_netlink_stop(void) { genl_unregister_family(&tipc_genl_family); + genl_unregister_family(&tipc_genl_v2_family); } diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h new file mode 100644 index 000000000000..e9980c0e2207 --- /dev/null +++ b/net/tipc/netlink.h @@ -0,0 +1,41 @@ +/* + * net/tipc/netlink.h: Include file for TIPC netlink code + * + * Copyright (c) 2014, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_NETLINK_H +#define _TIPC_NETLINK_H + +extern struct genl_family tipc_genl_v2_family; + +#endif -- cgit v1.2.3 From 35b9dd7607f049466a66427e58818b29aeae9ea7 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:08 +0100 Subject: tipc: add bearer get/dump to new netlink api Add TIPC_NL_BEARER_GET command to the new tipc netlink API. This command supports dumping all data about all bearers or getting all information about a specific bearer. The information about a bearer includes name, link priorities and domain. Netlink logical layout of bearer get message: -> bearer -> name Netlink logical layout of returned bearer information: -> bearer -> name -> link properties -> priority -> tolerance -> window -> domain Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/bearer.c | 125 ++++++++++++++++++++++++++++++++++++++ net/tipc/bearer.h | 2 + net/tipc/netlink.c | 6 ++ net/tipc/netlink.h | 6 ++ 5 files changed, 140 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index b9b710faa229..249ec7e9952c 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -43,6 +43,7 @@ enum { TIPC_NL_LEGACY, TIPC_NL_BEARER_DISABLE, TIPC_NL_BEARER_ENABLE, + TIPC_NL_BEARER_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 59815be0ab98..d49b8c25dd11 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -640,6 +640,131 @@ void tipc_bearer_stop(void) } } +/* Caller should hold rtnl_lock to protect the bearer */ +int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, struct tipc_bearer *bearer) +{ + void *hdr; + struct nlattr *attrs; + struct nlattr *prop; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_v2_family, + NLM_F_MULTI, TIPC_NL_BEARER_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_BEARER); + if (!attrs) + goto msg_full; + + if (nla_put_string(msg->skb, TIPC_NLA_BEARER_NAME, bearer->name)) + goto attr_msg_full; + + prop = nla_nest_start(msg->skb, TIPC_NLA_BEARER_PROP); + if (!prop) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, bearer->priority)) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + goto prop_msg_full; + + nla_nest_end(msg->skb, prop); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +prop_msg_full: + nla_nest_cancel(msg->skb, prop); +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + int i = cb->args[0]; + struct tipc_bearer *bearer; + struct tipc_nl_msg msg; + + if (i == MAX_BEARERS) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rtnl_lock(); + for (i = 0; i < MAX_BEARERS; i++) { + bearer = rtnl_dereference(bearer_list[i]); + if (!bearer) + continue; + + err = __tipc_nl_add_bearer(&msg, bearer); + if (err) + break; + } + rtnl_unlock(); + + cb->args[0] = i; + return skb->len; +} + +int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct sk_buff *rep; + struct tipc_bearer *bearer; + struct tipc_nl_msg msg; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + rtnl_lock(); + bearer = tipc_bearer_find(name); + if (!bearer) { + err = -EINVAL; + goto err_out; + } + + err = __tipc_nl_add_bearer(&msg, bearer); + if (err) + goto err_out; + rtnl_unlock(); + + return genlmsg_reply(rep, info); +err_out: + rtnl_unlock(); + nlmsg_free(rep); + + return err; +} + int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) { int err; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index a87e8c78d862..2d07e35f91fc 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -180,6 +180,8 @@ extern struct tipc_media ib_media_info; int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index af506ae0a129..0c00422512fa 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -112,6 +112,12 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_BEARER_ENABLE, .doit = tipc_nl_bearer_enable, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_BEARER_GET, + .doit = tipc_nl_bearer_get, + .dumpit = tipc_nl_bearer_dump, + .policy = tipc_nl_policy, } }; diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index e9980c0e2207..e34a3fca7d3d 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -38,4 +38,10 @@ extern struct genl_family tipc_genl_v2_family; +struct tipc_nl_msg { + struct sk_buff *skb; + u32 portid; + u32 seq; +}; + #endif -- cgit v1.2.3 From 315c00bc9f2bd17f7ad7ed8119ca49b1125af507 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:09 +0100 Subject: tipc: add bearer set to new netlink api Add TIPC_NL_BEARER_SET command to the new tipc netlink API. This command can set one or more link properties for a particular bearer. Netlink logical layout of bearer set message: -> bearer -> name -> link properties [ -> tolerance ] [ -> priority ] [ -> window ] Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/bearer.c | 49 +++++++++++++++++++++++++++++++++++++++ net/tipc/bearer.h | 1 + net/tipc/netlink.c | 5 ++++ 4 files changed, 56 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 249ec7e9952c..f446fba6a0c7 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -44,6 +44,7 @@ enum { TIPC_NL_BEARER_DISABLE, TIPC_NL_BEARER_ENABLE, TIPC_NL_BEARER_GET, + TIPC_NL_BEARER_SET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d49b8c25dd11..6d547d0b3f31 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -849,3 +849,52 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) return 0; } + +int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct tipc_bearer *b; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + + if (!info->attrs[TIPC_NLA_BEARER]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, + info->attrs[TIPC_NLA_BEARER], + tipc_nl_bearer_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_BEARER_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_BEARER_NAME]); + + rtnl_lock(); + b = tipc_bearer_find(name); + if (!b) { + rtnl_unlock(); + return -EINVAL; + } + + if (attrs[TIPC_NLA_BEARER_PROP]) { + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_BEARER_PROP], + props); + if (err) { + rtnl_unlock(); + return err; + } + + if (props[TIPC_NLA_PROP_TOL]) + b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + if (props[TIPC_NLA_PROP_PRIO]) + b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); + if (props[TIPC_NLA_PROP_WIN]) + b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + } + rtnl_unlock(); + + return 0; +} diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 2d07e35f91fc..c6cf9df31375 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -182,6 +182,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 0c00422512fa..a5291314a210 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -118,6 +118,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_bearer_get, .dumpit = tipc_nl_bearer_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_BEARER_SET, + .doit = tipc_nl_bearer_set, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 34b78a127c4fd57cf3d5c64031693d10a8e0fae1 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:10 +0100 Subject: tipc: add sock dump to new netlink api Add TIPC_NL_SOCK_GET command to the new tipc netlink API. This command supports dumping of all available sockets with their associated connection or publication(s). It could be extended to reply with a single socket if the NLM_F_DUMP isn't set. The information about a socket includes reference, address, connection information / publication information. Netlink logical layout of response message: -> socket -> reference -> address [ -> connection -> node -> socket [ -> connected flag -> type -> instance ] ] [ -> publication flag ] Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 28 +++++++++++ net/tipc/netlink.c | 7 +++ net/tipc/socket.c | 101 ++++++++++++++++++++++++++++++++++++++ net/tipc/socket.h | 2 + 4 files changed, 138 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index f446fba6a0c7..8c87e2490bc7 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -45,6 +45,7 @@ enum { TIPC_NL_BEARER_ENABLE, TIPC_NL_BEARER_GET, TIPC_NL_BEARER_SET, + TIPC_NL_SOCK_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -54,6 +55,7 @@ enum { enum { TIPC_NLA_UNSPEC, TIPC_NLA_BEARER, /* nest */ + TIPC_NLA_SOCK, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -70,6 +72,32 @@ enum { TIPC_NLA_BEARER_MAX = __TIPC_NLA_BEARER_MAX - 1 }; +/* Socket info */ +enum { + TIPC_NLA_SOCK_UNSPEC, + TIPC_NLA_SOCK_ADDR, /* u32 */ + TIPC_NLA_SOCK_REF, /* u32 */ + TIPC_NLA_SOCK_CON, /* nest */ + TIPC_NLA_SOCK_HAS_PUBL, /* flag */ + + __TIPC_NLA_SOCK_MAX, + TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1 +}; + +/* Nest, connection info */ +enum { + TIPC_NLA_CON_UNSPEC, + + TIPC_NLA_CON_FLAG, /* flag */ + TIPC_NLA_CON_NODE, /* u32 */ + TIPC_NLA_CON_SOCK, /* u32 */ + TIPC_NLA_CON_TYPE, /* u32 */ + TIPC_NLA_CON_INST, /* u32 */ + + __TIPC_NLA_CON_MAX, + TIPC_NLA_CON_MAX = __TIPC_NLA_CON_MAX - 1 +}; + /* Nest, link propreties. Valid for link, media and bearer */ enum { TIPC_NLA_PROP_UNSPEC, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index a5291314a210..951fabeec8b7 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -36,6 +36,7 @@ #include "core.h" #include "config.h" +#include "socket.h" #include "bearer.h" #include @@ -72,6 +73,7 @@ static int handle_cmd(struct sk_buff *skb, struct genl_info *info) static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_UNSPEC] = { .type = NLA_UNSPEC, }, [TIPC_NLA_BEARER] = { .type = NLA_NESTED, }, + [TIPC_NLA_SOCK] = { .type = NLA_NESTED, }, }; /* Legacy ASCII API */ @@ -123,6 +125,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_BEARER_SET, .doit = tipc_nl_bearer_set, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_SOCK_GET, + .dumpit = tipc_nl_sk_dump, + .policy = tipc_nl_policy, } }; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 591bbfa082a0..9e95c1ea5564 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2801,3 +2801,104 @@ void tipc_socket_stop(void) sock_unregister(tipc_family_ops.family); proto_unregister(&tipc_proto); } + +/* Caller should hold socket lock for the passed tipc socket. */ +int __tipc_nl_add_sk_con(struct sk_buff *skb, struct tipc_sock *tsk) +{ + u32 peer_node; + u32 peer_port; + struct nlattr *nest; + + peer_node = tsk_peer_node(tsk); + peer_port = tsk_peer_port(tsk); + + nest = nla_nest_start(skb, TIPC_NLA_SOCK_CON); + + if (nla_put_u32(skb, TIPC_NLA_CON_NODE, peer_node)) + goto msg_full; + if (nla_put_u32(skb, TIPC_NLA_CON_SOCK, peer_port)) + goto msg_full; + + if (tsk->conn_type != 0) { + if (nla_put_flag(skb, TIPC_NLA_CON_FLAG)) + goto msg_full; + if (nla_put_u32(skb, TIPC_NLA_CON_TYPE, tsk->conn_type)) + goto msg_full; + if (nla_put_u32(skb, TIPC_NLA_CON_INST, tsk->conn_instance)) + goto msg_full; + } + nla_nest_end(skb, nest); + + return 0; + +msg_full: + nla_nest_cancel(skb, nest); + + return -EMSGSIZE; +} + +/* Caller should hold socket lock for the passed tipc socket. */ +int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb, + struct tipc_sock *tsk) +{ + int err; + void *hdr; + struct nlattr *attrs; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &tipc_genl_v2_family, NLM_F_MULTI, TIPC_NL_SOCK_GET); + if (!hdr) + goto msg_cancel; + + attrs = nla_nest_start(skb, TIPC_NLA_SOCK); + if (!attrs) + goto genlmsg_cancel; + if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->ref)) + goto attr_msg_cancel; + if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tipc_own_addr)) + goto attr_msg_cancel; + + if (tsk->connected) { + err = __tipc_nl_add_sk_con(skb, tsk); + if (err) + goto attr_msg_cancel; + } else if (!list_empty(&tsk->publications)) { + if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL)) + goto attr_msg_cancel; + } + nla_nest_end(skb, attrs); + genlmsg_end(skb, hdr); + + return 0; + +attr_msg_cancel: + nla_nest_cancel(skb, attrs); +genlmsg_cancel: + genlmsg_cancel(skb, hdr); +msg_cancel: + return -EMSGSIZE; +} + +int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + struct tipc_sock *tsk; + u32 prev_ref = cb->args[0]; + u32 ref = prev_ref; + + tsk = tipc_sk_get_next(&ref); + for (; tsk; tsk = tipc_sk_get_next(&ref)) { + lock_sock(&tsk->sk); + err = __tipc_nl_add_sk(skb, cb, tsk); + release_sock(&tsk->sk); + tipc_sk_put(tsk); + if (err) + break; + + prev_ref = ref; + } + + cb->args[0] = prev_ref; + + return skb->len; +} diff --git a/net/tipc/socket.h b/net/tipc/socket.h index baa43d03901e..16dfd62983a8 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -36,6 +36,7 @@ #define _TIPC_SOCK_H #include +#include #define TIPC_CONNACK_INTV 256 #define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) @@ -47,5 +48,6 @@ void tipc_sk_mcast_rcv(struct sk_buff *buf); void tipc_sk_reinit(void); int tipc_sk_ref_table_init(u32 requested_size, u32 start); void tipc_sk_ref_table_stop(void); +int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb); #endif -- cgit v1.2.3 From 1a1a143daf84db95dd7212086042004a3abb7bc2 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:11 +0100 Subject: tipc: add publication dump to new netlink api Add TIPC_NL_PUBL_GET command to the new tipc netlink API. This command supports dumping of all publications for a specific socket. Netlink logical layout of request message: -> socket -> reference Netlink logical layout of response message: -> publication -> type -> lower -> upper Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 18 +++++ net/tipc/netlink.c | 17 +++++ net/tipc/netlink.h | 1 + net/tipc/socket.c | 135 ++++++++++++++++++++++++++++++++++++++ net/tipc/socket.h | 1 + 5 files changed, 172 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 8c87e2490bc7..7e51ff61dccd 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -46,6 +46,7 @@ enum { TIPC_NL_BEARER_GET, TIPC_NL_BEARER_SET, TIPC_NL_SOCK_GET, + TIPC_NL_PUBL_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -56,6 +57,7 @@ enum { TIPC_NLA_UNSPEC, TIPC_NLA_BEARER, /* nest */ TIPC_NLA_SOCK, /* nest */ + TIPC_NLA_PUBL, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -84,6 +86,22 @@ enum { TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1 }; +/* Publication info */ +enum { + TIPC_NLA_PUBL_UNSPEC, + + TIPC_NLA_PUBL_TYPE, /* u32 */ + TIPC_NLA_PUBL_LOWER, /* u32 */ + TIPC_NLA_PUBL_UPPER, /* u32 */ + TIPC_NLA_PUBL_SCOPE, /* u32 */ + TIPC_NLA_PUBL_NODE, /* u32 */ + TIPC_NLA_PUBL_REF, /* u32 */ + TIPC_NLA_PUBL_KEY, /* u32 */ + + __TIPC_NLA_PUBL_MAX, + TIPC_NLA_PUBL_MAX = __TIPC_NLA_PUBL_MAX - 1 +}; + /* Nest, connection info */ enum { TIPC_NLA_CON_UNSPEC, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 951fabeec8b7..9bc64aaff466 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -74,6 +74,7 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_UNSPEC] = { .type = NLA_UNSPEC, }, [TIPC_NLA_BEARER] = { .type = NLA_NESTED, }, [TIPC_NLA_SOCK] = { .type = NLA_NESTED, }, + [TIPC_NLA_PUBL] = { .type = NLA_NESTED, } }; /* Legacy ASCII API */ @@ -130,9 +131,25 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_SOCK_GET, .dumpit = tipc_nl_sk_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_PUBL_GET, + .dumpit = tipc_nl_publ_dump, + .policy = tipc_nl_policy, } }; +int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) +{ + u32 maxattr = tipc_genl_v2_family.maxattr; + + *attr = tipc_genl_v2_family.attrbuf; + if (!*attr) + return -EOPNOTSUPP; + + return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy); +} + int tipc_netlink_start(void) { int res; diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h index e34a3fca7d3d..1425c6869de0 100644 --- a/net/tipc/netlink.h +++ b/net/tipc/netlink.h @@ -37,6 +37,7 @@ #define _TIPC_NETLINK_H extern struct genl_family tipc_genl_v2_family; +int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf); struct tipc_nl_msg { struct sk_buff *skb; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9e95c1ea5564..e91809182c28 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -121,6 +121,14 @@ static const struct proto_ops msg_ops; static struct proto tipc_proto; static struct proto tipc_proto_kern; +static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = { + [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 }, + [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED }, + [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG } +}; + /* * Revised TIPC socket locking policy: * @@ -2902,3 +2910,130 @@ int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } + +/* Caller should hold socket lock for the passed tipc socket. */ +int __tipc_nl_add_sk_publ(struct sk_buff *skb, struct netlink_callback *cb, + struct publication *publ) +{ + void *hdr; + struct nlattr *attrs; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &tipc_genl_v2_family, NLM_F_MULTI, TIPC_NL_PUBL_GET); + if (!hdr) + goto msg_cancel; + + attrs = nla_nest_start(skb, TIPC_NLA_PUBL); + if (!attrs) + goto genlmsg_cancel; + + if (nla_put_u32(skb, TIPC_NLA_PUBL_KEY, publ->key)) + goto attr_msg_cancel; + if (nla_put_u32(skb, TIPC_NLA_PUBL_TYPE, publ->type)) + goto attr_msg_cancel; + if (nla_put_u32(skb, TIPC_NLA_PUBL_LOWER, publ->lower)) + goto attr_msg_cancel; + if (nla_put_u32(skb, TIPC_NLA_PUBL_UPPER, publ->upper)) + goto attr_msg_cancel; + + nla_nest_end(skb, attrs); + genlmsg_end(skb, hdr); + + return 0; + +attr_msg_cancel: + nla_nest_cancel(skb, attrs); +genlmsg_cancel: + genlmsg_cancel(skb, hdr); +msg_cancel: + return -EMSGSIZE; +} + +/* Caller should hold socket lock for the passed tipc socket. */ +int __tipc_nl_list_sk_publ(struct sk_buff *skb, struct netlink_callback *cb, + struct tipc_sock *tsk, u32 *last_publ) +{ + int err; + struct publication *p; + + if (*last_publ) { + list_for_each_entry(p, &tsk->publications, pport_list) { + if (p->key == *last_publ) + break; + } + if (p->key != *last_publ) { + /* We never set seq or call nl_dump_check_consistent() + * this means that setting prev_seq here will cause the + * consistence check to fail in the netlink callback + * handler. Resulting in the last NLMSG_DONE message + * having the NLM_F_DUMP_INTR flag set. + */ + cb->prev_seq = 1; + *last_publ = 0; + return -EPIPE; + } + } else { + p = list_first_entry(&tsk->publications, struct publication, + pport_list); + } + + list_for_each_entry_from(p, &tsk->publications, pport_list) { + err = __tipc_nl_add_sk_publ(skb, cb, p); + if (err) { + *last_publ = p->key; + return err; + } + } + *last_publ = 0; + + return 0; +} + +int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + u32 tsk_ref = cb->args[0]; + u32 last_publ = cb->args[1]; + u32 done = cb->args[2]; + struct tipc_sock *tsk; + + if (!tsk_ref) { + struct nlattr **attrs; + struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; + + err = tipc_nlmsg_parse(cb->nlh, &attrs); + if (err) + return err; + + err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, + attrs[TIPC_NLA_SOCK], + tipc_nl_sock_policy); + if (err) + return err; + + if (!sock[TIPC_NLA_SOCK_REF]) + return -EINVAL; + + tsk_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]); + } + + if (done) + return 0; + + tsk = tipc_sk_get(tsk_ref); + if (!tsk) + return -EINVAL; + + lock_sock(&tsk->sk); + err = __tipc_nl_list_sk_publ(skb, cb, tsk, &last_publ); + if (!err) + done = 1; + release_sock(&tsk->sk); + tipc_sk_put(tsk); + + cb->args[0] = tsk_ref; + cb->args[1] = last_publ; + cb->args[2] = done; + + return skb->len; +} diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 16dfd62983a8..d34089387006 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -49,5 +49,6 @@ void tipc_sk_reinit(void); int tipc_sk_ref_table_init(u32 requested_size, u32 start); void tipc_sk_ref_table_stop(void); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb); #endif -- cgit v1.2.3 From 7be57fc6918470ecacd16b89c0d4f73d8fc265c4 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:12 +0100 Subject: tipc: add link get/dump to new netlink api Add TIPC_NL_LINK_GET command to the new tipc netlink API. This command supports dumping all information about all links (including the broadcast link) or getting all information about a specific link (not the broadcast link). The information about a link includes name, transmission info, properties and link statistics. As the tipc broadcast link is special we unfortunately have to treat it specially. It is a deliberate decision not to abstract the broadcast link on this (API) level. Netlink logical layout of link response message: -> port -> name -> MTU -> RX -> TX -> up flag -> active flag -> properties -> priority -> tolerance -> window -> statistics -> rx_info -> rx_fragments -> rx_fragmented -> rx_bundles -> rx_bundled -> tx_info -> tx_fragments -> tx_fragmented -> tx_bundles -> tx_bundled -> msg_prof_tot -> msg_len_cnt -> msg_len_tot -> msg_len_p0 -> msg_len_p1 -> msg_len_p2 -> msg_len_p3 -> msg_len_p4 -> msg_len_p5 -> msg_len_p6 -> rx_states -> rx_probes -> rx_nacks -> rx_deferred -> tx_states -> tx_probes -> tx_nacks -> tx_acks -> retransmitted -> duplicates -> link_congs -> max_queue -> avg_queue Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 62 ++++++++ net/tipc/bcast.c | 111 +++++++++++++++ net/tipc/bcast.h | 4 + net/tipc/link.c | 287 ++++++++++++++++++++++++++++++++++++++ net/tipc/link.h | 2 + net/tipc/netlink.c | 10 +- 6 files changed, 475 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 7e51ff61dccd..1034fb43d32e 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -47,6 +47,7 @@ enum { TIPC_NL_BEARER_SET, TIPC_NL_SOCK_GET, TIPC_NL_PUBL_GET, + TIPC_NL_LINK_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -58,6 +59,7 @@ enum { TIPC_NLA_BEARER, /* nest */ TIPC_NLA_SOCK, /* nest */ TIPC_NLA_PUBL, /* nest */ + TIPC_NLA_LINK, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -86,6 +88,24 @@ enum { TIPC_NLA_SOCK_MAX = __TIPC_NLA_SOCK_MAX - 1 }; +/* Link info */ +enum { + TIPC_NLA_LINK_UNSPEC, + TIPC_NLA_LINK_NAME, /* string */ + TIPC_NLA_LINK_DEST, /* u32 */ + TIPC_NLA_LINK_MTU, /* u32 */ + TIPC_NLA_LINK_BROADCAST, /* flag */ + TIPC_NLA_LINK_UP, /* flag */ + TIPC_NLA_LINK_ACTIVE, /* flag */ + TIPC_NLA_LINK_PROP, /* nest */ + TIPC_NLA_LINK_STATS, /* nest */ + TIPC_NLA_LINK_RX, /* u32 */ + TIPC_NLA_LINK_TX, /* u32 */ + + __TIPC_NLA_LINK_MAX, + TIPC_NLA_LINK_MAX = __TIPC_NLA_LINK_MAX - 1 +}; + /* Publication info */ enum { TIPC_NLA_PUBL_UNSPEC, @@ -128,4 +148,46 @@ enum { TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 }; +/* Nest, statistics info */ +enum { + TIPC_NLA_STATS_UNSPEC, + + TIPC_NLA_STATS_RX_INFO, /* u32 */ + TIPC_NLA_STATS_RX_FRAGMENTS, /* u32 */ + TIPC_NLA_STATS_RX_FRAGMENTED, /* u32 */ + TIPC_NLA_STATS_RX_BUNDLES, /* u32 */ + TIPC_NLA_STATS_RX_BUNDLED, /* u32 */ + TIPC_NLA_STATS_TX_INFO, /* u32 */ + TIPC_NLA_STATS_TX_FRAGMENTS, /* u32 */ + TIPC_NLA_STATS_TX_FRAGMENTED, /* u32 */ + TIPC_NLA_STATS_TX_BUNDLES, /* u32 */ + TIPC_NLA_STATS_TX_BUNDLED, /* u32 */ + TIPC_NLA_STATS_MSG_PROF_TOT, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_CNT, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_TOT, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P0, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P1, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P2, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P3, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P4, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P5, /* u32 */ + TIPC_NLA_STATS_MSG_LEN_P6, /* u32 */ + TIPC_NLA_STATS_RX_STATES, /* u32 */ + TIPC_NLA_STATS_RX_PROBES, /* u32 */ + TIPC_NLA_STATS_RX_NACKS, /* u32 */ + TIPC_NLA_STATS_RX_DEFERRED, /* u32 */ + TIPC_NLA_STATS_TX_STATES, /* u32 */ + TIPC_NLA_STATS_TX_PROBES, /* u32 */ + TIPC_NLA_STATS_TX_NACKS, /* u32 */ + TIPC_NLA_STATS_TX_ACKS, /* u32 */ + TIPC_NLA_STATS_RETRANSMITTED, /* u32 */ + TIPC_NLA_STATS_DUPLICATES, /* u32 */ + TIPC_NLA_STATS_LINK_CONGS, /* u32 */ + TIPC_NLA_STATS_MAX_QUEUE, /* u32 */ + TIPC_NLA_STATS_AVG_QUEUE, /* u32 */ + + __TIPC_NLA_STATS_MAX, + TIPC_NLA_STATS_MAX = __TIPC_NLA_STATS_MAX - 1 +}; + #endif diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index b8670bf262e2..dcf3589e3cc5 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -767,6 +767,117 @@ void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action) tipc_bclink_unlock(); } +int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, struct tipc_stats *stats) +{ + int i; + struct nlattr *nest; + + struct nla_map { + __u32 key; + __u32 val; + }; + + struct nla_map map[] = { + {TIPC_NLA_STATS_RX_INFO, stats->recv_info}, + {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments}, + {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented}, + {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles}, + {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled}, + {TIPC_NLA_STATS_TX_INFO, stats->sent_info}, + {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments}, + {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented}, + {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles}, + {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled}, + {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks}, + {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv}, + {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks}, + {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks}, + {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted}, + {TIPC_NLA_STATS_DUPLICATES, stats->duplicates}, + {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs}, + {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz}, + {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ? + (stats->accu_queue_sz / stats->queue_sz_counts) : 0} + }; + + nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS); + if (!nest) + return -EMSGSIZE; + + for (i = 0; i < ARRAY_SIZE(map); i++) + if (nla_put_u32(skb, map[i].key, map[i].val)) + goto msg_full; + + nla_nest_end(skb, nest); + + return 0; +msg_full: + nla_nest_cancel(skb, nest); + + return -EMSGSIZE; +} + +int tipc_nl_add_bc_link(struct tipc_nl_msg *msg) +{ + int err; + void *hdr; + struct nlattr *attrs; + struct nlattr *prop; + + if (!bcl) + return 0; + + tipc_bclink_lock(); + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_v2_family, + NLM_F_MULTI, TIPC_NL_LINK_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); + if (!attrs) + goto msg_full; + + /* The broadcast link is always up */ + if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) + goto attr_msg_full; + + if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST)) + goto attr_msg_full; + if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->next_in_no)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->next_out_no)) + goto attr_msg_full; + + prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); + if (!prop) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->queue_limit[0])) + goto prop_msg_full; + nla_nest_end(msg->skb, prop); + + err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats); + if (err) + goto attr_msg_full; + + tipc_bclink_unlock(); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +prop_msg_full: + nla_nest_cancel(msg->skb, prop); +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + tipc_bclink_unlock(); + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} int tipc_bclink_stats(char *buf, const u32 buf_size) { diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index e7b0f85a82bc..443de084d3e8 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -37,6 +37,8 @@ #ifndef _TIPC_BCAST_H #define _TIPC_BCAST_H +#include "netlink.h" + #define MAX_NODES 4096 #define WSIZE 32 #define TIPC_BCLINK_RESET 1 @@ -100,4 +102,6 @@ void tipc_bcbearer_sort(struct tipc_node_map *nm_ptr, u32 node, bool action); uint tipc_bclink_get_mtu(void); int tipc_bclink_xmit(struct sk_buff *buf); void tipc_bclink_wakeup_users(void); +int tipc_nl_add_bc_link(struct tipc_nl_msg *msg); + #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index e7f365012bd1..bdc4b5e5bf56 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -36,6 +36,7 @@ #include "core.h" #include "link.h" +#include "bcast.h" #include "socket.h" #include "name_distr.h" #include "discover.h" @@ -51,6 +52,22 @@ static const char *link_co_err = "Link changeover error, "; static const char *link_rst_msg = "Resetting link "; static const char *link_unk_evt = "Unknown link event "; +static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { + [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_LINK_NAME] = { + .type = NLA_STRING, + .len = TIPC_MAX_LINK_NAME + }, + [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG }, + [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED }, + [TIPC_NLA_LINK_RX] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_TX] = { .type = NLA_U32 } +}; + /* Properties valid for media, bearar and link */ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC }, @@ -2423,3 +2440,273 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) return 0; } + +int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) +{ + int i; + struct nlattr *stats; + + struct nla_map { + u32 key; + u32 val; + }; + + struct nla_map map[] = { + {TIPC_NLA_STATS_RX_INFO, s->recv_info}, + {TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments}, + {TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented}, + {TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles}, + {TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled}, + {TIPC_NLA_STATS_TX_INFO, s->sent_info}, + {TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments}, + {TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented}, + {TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles}, + {TIPC_NLA_STATS_TX_BUNDLED, s->sent_bundled}, + {TIPC_NLA_STATS_MSG_PROF_TOT, (s->msg_length_counts) ? + s->msg_length_counts : 1}, + {TIPC_NLA_STATS_MSG_LEN_CNT, s->msg_length_counts}, + {TIPC_NLA_STATS_MSG_LEN_TOT, s->msg_lengths_total}, + {TIPC_NLA_STATS_MSG_LEN_P0, s->msg_length_profile[0]}, + {TIPC_NLA_STATS_MSG_LEN_P1, s->msg_length_profile[1]}, + {TIPC_NLA_STATS_MSG_LEN_P2, s->msg_length_profile[2]}, + {TIPC_NLA_STATS_MSG_LEN_P3, s->msg_length_profile[3]}, + {TIPC_NLA_STATS_MSG_LEN_P4, s->msg_length_profile[4]}, + {TIPC_NLA_STATS_MSG_LEN_P5, s->msg_length_profile[5]}, + {TIPC_NLA_STATS_MSG_LEN_P6, s->msg_length_profile[6]}, + {TIPC_NLA_STATS_RX_STATES, s->recv_states}, + {TIPC_NLA_STATS_RX_PROBES, s->recv_probes}, + {TIPC_NLA_STATS_RX_NACKS, s->recv_nacks}, + {TIPC_NLA_STATS_RX_DEFERRED, s->deferred_recv}, + {TIPC_NLA_STATS_TX_STATES, s->sent_states}, + {TIPC_NLA_STATS_TX_PROBES, s->sent_probes}, + {TIPC_NLA_STATS_TX_NACKS, s->sent_nacks}, + {TIPC_NLA_STATS_TX_ACKS, s->sent_acks}, + {TIPC_NLA_STATS_RETRANSMITTED, s->retransmitted}, + {TIPC_NLA_STATS_DUPLICATES, s->duplicates}, + {TIPC_NLA_STATS_LINK_CONGS, s->link_congs}, + {TIPC_NLA_STATS_MAX_QUEUE, s->max_queue_sz}, + {TIPC_NLA_STATS_AVG_QUEUE, s->queue_sz_counts ? + (s->accu_queue_sz / s->queue_sz_counts) : 0} + }; + + stats = nla_nest_start(skb, TIPC_NLA_LINK_STATS); + if (!stats) + return -EMSGSIZE; + + for (i = 0; i < ARRAY_SIZE(map); i++) + if (nla_put_u32(skb, map[i].key, map[i].val)) + goto msg_full; + + nla_nest_end(skb, stats); + + return 0; +msg_full: + nla_nest_cancel(skb, stats); + + return -EMSGSIZE; +} + +/* Caller should hold appropriate locks to protect the link */ +int __tipc_nl_add_link(struct tipc_nl_msg *msg, struct tipc_link *link) +{ + int err; + void *hdr; + struct nlattr *attrs; + struct nlattr *prop; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_v2_family, + NLM_F_MULTI, TIPC_NL_LINK_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK); + if (!attrs) + goto msg_full; + + if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, + tipc_cluster_mask(tipc_own_addr))) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->max_pkt)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->next_in_no)) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->next_out_no)) + goto attr_msg_full; + + if (tipc_link_is_up(link)) + if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) + goto attr_msg_full; + if (tipc_link_is_active(link)) + if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE)) + goto attr_msg_full; + + prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); + if (!prop) + goto attr_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, link->tolerance)) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, + link->queue_limit[TIPC_LOW_IMPORTANCE])) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) + goto prop_msg_full; + nla_nest_end(msg->skb, prop); + + err = __tipc_nl_add_stats(msg->skb, &link->stats); + if (err) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +prop_msg_full: + nla_nest_cancel(msg->skb, prop); +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +/* Caller should hold node lock */ +int __tipc_nl_add_node_links(struct tipc_nl_msg *msg, struct tipc_node *node, + u32 *prev_link) +{ + u32 i; + int err; + + for (i = *prev_link; i < MAX_BEARERS; i++) { + *prev_link = i; + + if (!node->links[i]) + continue; + + err = __tipc_nl_add_link(msg, node->links[i]); + if (err) + return err; + } + *prev_link = 0; + + return 0; +} + +int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct tipc_node *node; + struct tipc_nl_msg msg; + u32 prev_node = cb->args[0]; + u32 prev_link = cb->args[1]; + int done = cb->args[2]; + int err; + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rcu_read_lock(); + + if (prev_node) { + node = tipc_node_find(prev_node); + if (!node) { + /* We never set seq or call nl_dump_check_consistent() + * this means that setting prev_seq here will cause the + * consistence check to fail in the netlink callback + * handler. Resulting in the last NLMSG_DONE message + * having the NLM_F_DUMP_INTR flag set. + */ + cb->prev_seq = 1; + goto out; + } + + list_for_each_entry_continue_rcu(node, &tipc_node_list, list) { + tipc_node_lock(node); + err = __tipc_nl_add_node_links(&msg, node, &prev_link); + tipc_node_unlock(node); + if (err) + goto out; + + prev_node = node->addr; + } + } else { + err = tipc_nl_add_bc_link(&msg); + if (err) + goto out; + + list_for_each_entry_rcu(node, &tipc_node_list, list) { + tipc_node_lock(node); + err = __tipc_nl_add_node_links(&msg, node, &prev_link); + tipc_node_unlock(node); + if (err) + goto out; + + prev_node = node->addr; + } + } + done = 1; +out: + rcu_read_unlock(); + + cb->args[0] = prev_node; + cb->args[1] = prev_link; + cb->args[2] = done; + + return skb->len; +} + +int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *ans_skb; + struct tipc_nl_msg msg; + struct tipc_link *link; + struct tipc_node *node; + char *name; + int bearer_id; + int err; + + if (!info->attrs[TIPC_NLA_LINK_NAME]) + return -EINVAL; + + name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]); + node = tipc_link_find_owner(name, &bearer_id); + if (!node) + return -EINVAL; + + ans_skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!ans_skb) + return -ENOMEM; + + msg.skb = ans_skb; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + tipc_node_lock(node); + link = node->links[bearer_id]; + if (!link) { + err = -EINVAL; + goto err_out; + } + + err = __tipc_nl_add_link(&msg, link); + if (err) + goto err_out; + + tipc_node_unlock(node); + + return genlmsg_reply(ans_skb, info); + +err_out: + tipc_node_unlock(node); + nlmsg_free(ans_skb); + + return err; +} diff --git a/net/tipc/link.h b/net/tipc/link.h index 4338294f20d4..8e3542a4a037 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -240,6 +240,8 @@ void tipc_link_set_queue_limits(struct tipc_link *l_ptr, u32 window); void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *start, u32 retransmits); +int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); /* diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 9bc64aaff466..ea168b889caa 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -38,6 +38,7 @@ #include "config.h" #include "socket.h" #include "bearer.h" +#include "link.h" #include static int handle_cmd(struct sk_buff *skb, struct genl_info *info) @@ -74,7 +75,8 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_UNSPEC] = { .type = NLA_UNSPEC, }, [TIPC_NLA_BEARER] = { .type = NLA_NESTED, }, [TIPC_NLA_SOCK] = { .type = NLA_NESTED, }, - [TIPC_NLA_PUBL] = { .type = NLA_NESTED, } + [TIPC_NLA_PUBL] = { .type = NLA_NESTED, }, + [TIPC_NLA_LINK] = { .type = NLA_NESTED, }, }; /* Legacy ASCII API */ @@ -136,6 +138,12 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_PUBL_GET, .dumpit = tipc_nl_publ_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_LINK_GET, + .doit = tipc_nl_link_get, + .dumpit = tipc_nl_link_dump, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From f96ce7a20d6972a834202f3cdd6a53fd0ee26a8e Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:13 +0100 Subject: tipc: add link set to new netlink api Add TIPC_NL_LINK_SET to the new tipc netlink API. This command can set one or more link properties for a particular link. Netlink logical layout of link set message: -> link -> name -> properties [ -> tolerance ] [ -> priority ] [ -> window ] Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/link.c | 73 +++++++++++++++++++++++++++++++++++++++ net/tipc/link.h | 1 + net/tipc/netlink.c | 5 +++ 4 files changed, 80 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 1034fb43d32e..2deb7fd8d85d 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -48,6 +48,7 @@ enum { TIPC_NL_SOCK_GET, TIPC_NL_PUBL_GET, TIPC_NL_LINK_GET, + TIPC_NL_LINK_SET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/link.c b/net/tipc/link.c index bdc4b5e5bf56..6785dcfd2d23 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2441,6 +2441,79 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) return 0; } +int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + int res = 0; + int bearer_id; + char *name; + struct tipc_link *link; + struct tipc_node *node; + struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; + + if (!info->attrs[TIPC_NLA_LINK]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_LINK_NAME]) + return -EINVAL; + + name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + + node = tipc_link_find_owner(name, &bearer_id); + if (!node) + return -EINVAL; + + tipc_node_lock(node); + + link = node->links[bearer_id]; + if (!link) { + res = -EINVAL; + goto out; + } + + if (attrs[TIPC_NLA_LINK_PROP]) { + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], + props); + if (err) { + res = err; + goto out; + } + + if (props[TIPC_NLA_PROP_TOL]) { + u32 tol; + + tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + link_set_supervision_props(link, tol); + tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0, 0); + } + if (props[TIPC_NLA_PROP_PRIO]) { + u32 prio; + + prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); + link->priority = prio; + tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio, 0); + } + if (props[TIPC_NLA_PROP_WIN]) { + u32 win; + + win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, win); + } + } + +out: + tipc_node_unlock(node); + + return res; +} int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) { int i; diff --git a/net/tipc/link.h b/net/tipc/link.h index 8e3542a4a037..3738ba11ce00 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -242,6 +242,7 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); /* diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index ea168b889caa..d4444e59bf98 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -144,6 +144,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_link_get, .dumpit = tipc_nl_link_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_LINK_SET, + .doit = tipc_nl_link_set, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From ae36342b50a91cff188e417201452dc075a8f444 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:14 +0100 Subject: tipc: add link stat reset to new netlink api Add TIPC_NL_LINK_RESET_STATS command to the new netlink API. This command resets the link statistics for a particular link. Netlink logical layout of link reset message: -> link -> name Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/link.c | 49 +++++++++++++++++++++++++++++++++++++++ net/tipc/link.h | 1 + net/tipc/netlink.c | 5 ++++ 4 files changed, 56 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 2deb7fd8d85d..574882a301bf 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -49,6 +49,7 @@ enum { TIPC_NL_PUBL_GET, TIPC_NL_LINK_GET, TIPC_NL_LINK_SET, + TIPC_NL_LINK_RESET_STATS, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/link.c b/net/tipc/link.c index 6785dcfd2d23..629e8cf393bf 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2783,3 +2783,52 @@ err_out: return err; } + +int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *link_name; + unsigned int bearer_id; + struct tipc_link *link; + struct tipc_node *node; + struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; + + if (!info->attrs[TIPC_NLA_LINK]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, + info->attrs[TIPC_NLA_LINK], + tipc_nl_link_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_LINK_NAME]) + return -EINVAL; + + link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); + + if (strcmp(link_name, tipc_bclink_name) == 0) { + err = tipc_bclink_reset_stats(); + if (err) + return err; + return 0; + } + + node = tipc_link_find_owner(link_name, &bearer_id); + if (!node) + return -EINVAL; + + tipc_node_lock(node); + + link = node->links[bearer_id]; + if (!link) { + tipc_node_unlock(node); + return -EINVAL; + } + + link_reset_statistics(link); + + tipc_node_unlock(node); + + return 0; +} diff --git a/net/tipc/link.h b/net/tipc/link.h index 3738ba11ce00..f463e7be801c 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -243,6 +243,7 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); /* diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index d4444e59bf98..68bfd11f232d 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -149,6 +149,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_LINK_SET, .doit = tipc_nl_link_set, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_LINK_RESET_STATS, + .doit = tipc_nl_link_reset_stats, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 46f15c6794fb744bb7741d26143a85b9012c10d4 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:15 +0100 Subject: tipc: add media get/dump to new netlink api Add TIPC_NL_MEDIA_GET command to the new tipc netlink API. This command supports dumping all information about all defined media as well as getting all information about a specific media. The information about a media includes name and link properties. Netlink logical layout of media get response message: -> media -> name -> link properties -> tolerance -> priority -> window Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 12 ++++ net/tipc/bearer.c | 125 ++++++++++++++++++++++++++++++++++++++ net/tipc/bearer.h | 3 + net/tipc/netlink.c | 7 +++ 4 files changed, 147 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 574882a301bf..4101b70991cc 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -50,6 +50,7 @@ enum { TIPC_NL_LINK_GET, TIPC_NL_LINK_SET, TIPC_NL_LINK_RESET_STATS, + TIPC_NL_MEDIA_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -62,6 +63,7 @@ enum { TIPC_NLA_SOCK, /* nest */ TIPC_NLA_PUBL, /* nest */ TIPC_NLA_LINK, /* nest */ + TIPC_NLA_MEDIA, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -108,6 +110,16 @@ enum { TIPC_NLA_LINK_MAX = __TIPC_NLA_LINK_MAX - 1 }; +/* Media info */ +enum { + TIPC_NLA_MEDIA_UNSPEC, + TIPC_NLA_MEDIA_NAME, /* string */ + TIPC_NLA_MEDIA_PROP, /* nest */ + + __TIPC_NLA_MEDIA_MAX, + TIPC_NLA_MEDIA_MAX = __TIPC_NLA_MEDIA_MAX - 1 +}; + /* Publication info */ enum { TIPC_NLA_PUBL_UNSPEC, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 6d547d0b3f31..26630a51fcae 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -61,6 +61,12 @@ tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } }; +static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = { + [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING }, + [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED } +}; + struct tipc_bearer __rcu *bearer_list[MAX_BEARERS + 1]; static void bearer_disable(struct tipc_bearer *b_ptr, bool shutting_down); @@ -898,3 +904,122 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) return 0; } + +int __tipc_nl_add_media(struct tipc_nl_msg *msg, struct tipc_media *media) +{ + void *hdr; + struct nlattr *attrs; + struct nlattr *prop; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_v2_family, + NLM_F_MULTI, TIPC_NL_MEDIA_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_MEDIA); + if (!attrs) + goto msg_full; + + if (nla_put_string(msg->skb, TIPC_NLA_MEDIA_NAME, media->name)) + goto attr_msg_full; + + prop = nla_nest_start(msg->skb, TIPC_NLA_MEDIA_PROP); + if (!prop) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, media->priority)) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) + goto prop_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + goto prop_msg_full; + + nla_nest_end(msg->skb, prop); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +prop_msg_full: + nla_nest_cancel(msg->skb, prop); +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + int i = cb->args[0]; + struct tipc_nl_msg msg; + + if (i == MAX_MEDIA) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rtnl_lock(); + for (; media_info_array[i] != NULL; i++) { + err = __tipc_nl_add_media(&msg, media_info_array[i]); + if (err) + break; + } + rtnl_unlock(); + + cb->args[0] = i; + return skb->len; +} + +int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct tipc_nl_msg msg; + struct tipc_media *media; + struct sk_buff *rep; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + + if (!info->attrs[TIPC_NLA_MEDIA]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX, + info->attrs[TIPC_NLA_MEDIA], + tipc_nl_media_policy); + if (err) + return err; + + if (!attrs[TIPC_NLA_MEDIA_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]); + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + rtnl_lock(); + media = tipc_media_find(name); + if (!media) { + err = -EINVAL; + goto err_out; + } + + err = __tipc_nl_add_media(&msg, media); + if (err) + goto err_out; + rtnl_unlock(); + + return genlmsg_reply(rep, info); +err_out: + rtnl_unlock(); + nlmsg_free(rep); + + return err; +} diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index c6cf9df31375..bacd225bccfb 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -184,6 +184,9 @@ int tipc_nl_bearer_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); + int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 68bfd11f232d..742e51a0afbb 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -77,6 +77,7 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_SOCK] = { .type = NLA_NESTED, }, [TIPC_NLA_PUBL] = { .type = NLA_NESTED, }, [TIPC_NLA_LINK] = { .type = NLA_NESTED, }, + [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, }, }; /* Legacy ASCII API */ @@ -154,6 +155,12 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_LINK_RESET_STATS, .doit = tipc_nl_link_reset_stats, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_MEDIA_GET, + .doit = tipc_nl_media_get, + .dumpit = tipc_nl_media_dump, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 1e55417d8fc6f6d93b1cc6995b911d48ded2adfb Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:16 +0100 Subject: tipc: add media set to new netlink api Add TIPC_NL_MEDIA_SET command to the new tipc netlink API. This command can set one or more link properties for a particular media. Netlink logical layout of bearer set message: -> media -> name -> link properties [ -> tolerance ] [ -> priority ] [ -> window ] Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + net/tipc/bearer.c | 47 +++++++++++++++++++++++++++++++++++++++ net/tipc/bearer.h | 1 + net/tipc/netlink.c | 5 +++++ 4 files changed, 54 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 4101b70991cc..43beef28d67f 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -51,6 +51,7 @@ enum { TIPC_NL_LINK_SET, TIPC_NL_LINK_RESET_STATS, TIPC_NL_MEDIA_GET, + TIPC_NL_MEDIA_SET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 26630a51fcae..5f6f32369c16 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1023,3 +1023,50 @@ err_out: return err; } + +int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + char *name; + struct tipc_media *m; + struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1]; + + if (!info->attrs[TIPC_NLA_MEDIA]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX, + info->attrs[TIPC_NLA_MEDIA], + tipc_nl_media_policy); + + if (!attrs[TIPC_NLA_MEDIA_NAME]) + return -EINVAL; + name = nla_data(attrs[TIPC_NLA_MEDIA_NAME]); + + rtnl_lock(); + m = tipc_media_find(name); + if (!m) { + rtnl_unlock(); + return -EINVAL; + } + + if (attrs[TIPC_NLA_MEDIA_PROP]) { + struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; + + err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_MEDIA_PROP], + props); + if (err) { + rtnl_unlock(); + return err; + } + + if (props[TIPC_NLA_PROP_TOL]) + m->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]); + if (props[TIPC_NLA_PROP_PRIO]) + m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); + if (props[TIPC_NLA_PROP_WIN]) + m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + } + rtnl_unlock(); + + return 0; +} diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index bacd225bccfb..b1d905209e83 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -186,6 +186,7 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_media_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info); int tipc_media_set_priority(const char *name, u32 new_value); int tipc_media_set_window(const char *name, u32 new_value); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 742e51a0afbb..8673e370b7ed 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -161,6 +161,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_media_get, .dumpit = tipc_nl_media_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_MEDIA_SET, + .doit = tipc_nl_media_set, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 3e4b6ab58d614934e7ca99bdf448089695d34ffa Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:17 +0100 Subject: tipc: add node get/dump to new netlink api Add TIPC_NL_NODE_GET to the new tipc netlink API. This command can dump the address and node status of all nodes in the tipc cluster. Netlink logical layout of returned node/address data: -> node -> address -> up flag Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 12 +++++ net/tipc/netlink.c | 7 +++ net/tipc/node.c | 96 +++++++++++++++++++++++++++++++++++++++ net/tipc/node.h | 4 +- 4 files changed, 118 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 43beef28d67f..08a793374e8c 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -52,6 +52,7 @@ enum { TIPC_NL_LINK_RESET_STATS, TIPC_NL_MEDIA_GET, TIPC_NL_MEDIA_SET, + TIPC_NL_NODE_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -65,6 +66,7 @@ enum { TIPC_NLA_PUBL, /* nest */ TIPC_NLA_LINK, /* nest */ TIPC_NLA_MEDIA, /* nest */ + TIPC_NLA_NODE, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -121,6 +123,16 @@ enum { TIPC_NLA_MEDIA_MAX = __TIPC_NLA_MEDIA_MAX - 1 }; +/* Node info */ +enum { + TIPC_NLA_NODE_UNSPEC, + TIPC_NLA_NODE_ADDR, /* u32 */ + TIPC_NLA_NODE_UP, /* flag */ + + __TIPC_NLA_NODE_MAX, + TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 +}; + /* Publication info */ enum { TIPC_NLA_PUBL_UNSPEC, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 8673e370b7ed..5b0e3c8457d2 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -39,6 +39,7 @@ #include "socket.h" #include "bearer.h" #include "link.h" +#include "node.h" #include static int handle_cmd(struct sk_buff *skb, struct genl_info *info) @@ -78,6 +79,7 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_PUBL] = { .type = NLA_NESTED, }, [TIPC_NLA_LINK] = { .type = NLA_NESTED, }, [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, }, + [TIPC_NLA_NODE] = { .type = NLA_NESTED, } }; /* Legacy ASCII API */ @@ -166,6 +168,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_MEDIA_SET, .doit = tipc_nl_media_set, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_NODE_GET, + .dumpit = tipc_nl_node_dump, + .policy = tipc_nl_policy, } }; diff --git a/net/tipc/node.c b/net/tipc/node.c index 5781634e957d..72a75d4838b2 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -58,6 +58,12 @@ struct tipc_sock_conn { struct list_head list; }; +static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = { + [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 }, + [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG } +}; + /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. The number of hash table @@ -601,3 +607,93 @@ void tipc_node_unlock(struct tipc_node *node) tipc_nametbl_withdraw(TIPC_LINK_STATE, addr, link_id, addr); } + +/* Caller should hold node lock for the passed node */ +int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) +{ + void *hdr; + struct nlattr *attrs; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_v2_family, + NLM_F_MULTI, TIPC_NL_NODE_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NODE); + if (!attrs) + goto msg_full; + + if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) + goto attr_msg_full; + if (tipc_node_is_up(node)) + if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + int done = cb->args[0]; + int last_addr = cb->args[1]; + struct tipc_node *node; + struct tipc_nl_msg msg; + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + rcu_read_lock(); + + if (last_addr && !tipc_node_find(last_addr)) { + rcu_read_unlock(); + /* We never set seq or call nl_dump_check_consistent() this + * means that setting prev_seq here will cause the consistence + * check to fail in the netlink callback handler. Resulting in + * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if + * the node state changed while we released the lock. + */ + cb->prev_seq = 1; + return -EPIPE; + } + + list_for_each_entry_rcu(node, &tipc_node_list, list) { + if (last_addr) { + if (node->addr == last_addr) + last_addr = 0; + else + continue; + } + + tipc_node_lock(node); + err = __tipc_nl_add_node(&msg, node); + if (err) { + last_addr = node->addr; + tipc_node_unlock(node); + goto out; + } + + tipc_node_unlock(node); + } + done = 1; +out: + cb->args[0] = done; + cb->args[1] = last_addr; + rcu_read_unlock(); + + return skb->len; +} diff --git a/net/tipc/node.h b/net/tipc/node.h index 04e91458bb29..005fbcef3212 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -1,7 +1,7 @@ /* * net/tipc/node.h: Include file for TIPC node management routines * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2014, Wind River Systems * All rights reserved. * @@ -145,6 +145,8 @@ void tipc_node_unlock(struct tipc_node *node); int tipc_node_add_conn(u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(u32 dnode, u32 port); +int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); + static inline void tipc_node_lock(struct tipc_node *node) { spin_lock_bh(&node->lock); -- cgit v1.2.3 From fd3cf2ad519f73c2f7a46460ebedf32ad246520c Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:18 +0100 Subject: tipc: add net dump to new netlink api Add TIPC_NL_NET_GET command to the new tipc netlink API. This command dumps the network id of the node. Netlink logical layout of returned network data: -> net -> id Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 11 ++++++++ net/tipc/net.c | 59 +++++++++++++++++++++++++++++++++++++++ net/tipc/net.h | 7 ++++- net/tipc/netlink.c | 9 +++++- 4 files changed, 84 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 08a793374e8c..dcfd42002da7 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -53,6 +53,7 @@ enum { TIPC_NL_MEDIA_GET, TIPC_NL_MEDIA_SET, TIPC_NL_NODE_GET, + TIPC_NL_NET_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -67,6 +68,7 @@ enum { TIPC_NLA_LINK, /* nest */ TIPC_NLA_MEDIA, /* nest */ TIPC_NLA_NODE, /* nest */ + TIPC_NLA_NET, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -133,6 +135,15 @@ enum { TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 }; +/* Net info */ +enum { + TIPC_NLA_NET_UNSPEC, + TIPC_NLA_NET_ID, /* u32 */ + + __TIPC_NLA_NET_MAX, + TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 +}; + /* Publication info */ enum { TIPC_NLA_PUBL_UNSPEC, diff --git a/net/tipc/net.c b/net/tipc/net.c index 93b9944a6a8b..d9e666a1be9d 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -42,6 +42,11 @@ #include "node.h" #include "config.h" +static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { + [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NET_ID] = { .type = NLA_U32 } +}; + /* * The TIPC locking policy is designed to ensure a very fine locking * granularity, permitting complete parallel access to individual @@ -138,3 +143,57 @@ void tipc_net_stop(void) pr_info("Left network mode\n"); } + +static int __tipc_nl_add_net(struct tipc_nl_msg *msg) +{ + void *hdr; + struct nlattr *attrs; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_v2_family, + NLM_F_MULTI, TIPC_NL_NET_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + if (!attrs) + goto msg_full; + + if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tipc_net_id)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + int done = cb->args[0]; + struct tipc_nl_msg msg; + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + err = __tipc_nl_add_net(&msg); + if (err) + goto out; + + done = 1; +out: + cb->args[0] = done; + + return skb->len; +} diff --git a/net/tipc/net.h b/net/tipc/net.h index 59ef3388be2c..60dc22fe9267 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -1,7 +1,7 @@ /* * net/tipc/net.h: Include file for TIPC network routing code * - * Copyright (c) 1995-2006, Ericsson AB + * Copyright (c) 1995-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -37,7 +37,12 @@ #ifndef _TIPC_NET_H #define _TIPC_NET_H +#include + int tipc_net_start(u32 addr); + void tipc_net_stop(void); +int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); + #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 5b0e3c8457d2..c143f9c20f61 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -40,6 +40,7 @@ #include "bearer.h" #include "link.h" #include "node.h" +#include "net.h" #include static int handle_cmd(struct sk_buff *skb, struct genl_info *info) @@ -79,7 +80,8 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_PUBL] = { .type = NLA_NESTED, }, [TIPC_NLA_LINK] = { .type = NLA_NESTED, }, [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, }, - [TIPC_NLA_NODE] = { .type = NLA_NESTED, } + [TIPC_NLA_NODE] = { .type = NLA_NESTED, }, + [TIPC_NLA_NET] = { .type = NLA_NESTED, } }; /* Legacy ASCII API */ @@ -173,6 +175,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_NODE_GET, .dumpit = tipc_nl_node_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_NET_GET, + .dumpit = tipc_nl_net_dump, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 27c21416727af73df45051acb05331c0f10e50f6 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:19 +0100 Subject: tipc: add net set to new netlink api Add TIPC_NL_NET_SET command to the new tipc netlink API. This command can set the network id and network (tipc) address. Netlink logical layout of network set message: -> net [ -> id ] [ -> address ] Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 ++ net/tipc/net.c | 47 +++++++++++++++++++++++++++++++++++++++ net/tipc/net.h | 1 + net/tipc/netlink.c | 5 +++++ 4 files changed, 55 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index dcfd42002da7..fb980e27f425 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -54,6 +54,7 @@ enum { TIPC_NL_MEDIA_SET, TIPC_NL_NODE_GET, TIPC_NL_NET_GET, + TIPC_NL_NET_SET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -139,6 +140,7 @@ enum { enum { TIPC_NLA_NET_UNSPEC, TIPC_NLA_NET_ID, /* u32 */ + TIPC_NLA_NET_ADDR, /* u32 */ __TIPC_NLA_NET_MAX, TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 diff --git a/net/tipc/net.c b/net/tipc/net.c index d9e666a1be9d..cf13df3cde8f 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -197,3 +197,50 @@ out: return skb->len; } + +int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) +{ + int err; + struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; + + if (!info->attrs[TIPC_NLA_NET]) + return -EINVAL; + + err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, + info->attrs[TIPC_NLA_NET], + tipc_nl_net_policy); + if (err) + return err; + + if (attrs[TIPC_NLA_NET_ID]) { + u32 val; + + /* Can't change net id once TIPC has joined a network */ + if (tipc_own_addr) + return -EPERM; + + val = nla_get_u32(attrs[TIPC_NLA_NET_ID]); + if (val < 1 || val > 9999) + return -EINVAL; + + tipc_net_id = val; + } + + if (attrs[TIPC_NLA_NET_ADDR]) { + u32 addr; + + /* Can't change net addr once TIPC has joined a network */ + if (tipc_own_addr) + return -EPERM; + + addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); + if (!tipc_addr_node_valid(addr)) + return -EINVAL; + + rtnl_lock(); + tipc_net_start(addr); + rtnl_unlock(); + } + + return 0; +} diff --git a/net/tipc/net.h b/net/tipc/net.h index 60dc22fe9267..a81c1b9eb150 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -44,5 +44,6 @@ int tipc_net_start(u32 addr); void tipc_net_stop(void); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); +int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index c143f9c20f61..cb37d30378a8 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -180,6 +180,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_NET_GET, .dumpit = tipc_nl_net_dump, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_NET_SET, + .doit = tipc_nl_net_set, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 1593123a6a4914ccac4699d7f93cdf8057a7d822 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Thu, 20 Nov 2014 10:29:20 +0100 Subject: tipc: add name table dump to new netlink api Add TIPC_NL_NAME_TABLE_GET command to the new tipc netlink API. This command supports dumping the name table of all nodes. Netlink logical layout of name table response message: -> name table -> publication -> type -> lower -> upper -> scope -> node -> ref -> key Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Reviewed-by: Jon Maloy Acked-by: Ying Xue Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 11 +++ net/tipc/name_table.c | 190 +++++++++++++++++++++++++++++++++++++- net/tipc/name_table.h | 4 +- net/tipc/netlink.c | 9 +- 4 files changed, 211 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index fb980e27f425..8d723824ad69 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -55,6 +55,7 @@ enum { TIPC_NL_NODE_GET, TIPC_NL_NET_GET, TIPC_NL_NET_SET, + TIPC_NL_NAME_TABLE_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -70,6 +71,7 @@ enum { TIPC_NLA_MEDIA, /* nest */ TIPC_NLA_NODE, /* nest */ TIPC_NLA_NET, /* nest */ + TIPC_NLA_NAME_TABLE, /* nest */ __TIPC_NLA_MAX, TIPC_NLA_MAX = __TIPC_NLA_MAX - 1 @@ -146,6 +148,15 @@ enum { TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 }; +/* Name table info */ +enum { + TIPC_NLA_NAME_TABLE_UNSPEC, + TIPC_NLA_NAME_TABLE_PUBL, /* nest */ + + __TIPC_NLA_NAME_TABLE_MAX, + TIPC_NLA_NAME_TABLE_MAX = __TIPC_NLA_NAME_TABLE_MAX - 1 +}; + /* Publication info */ enum { TIPC_NLA_PUBL_UNSPEC, diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 3a6a0a7c0759..30ca8e0e0f84 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -1,7 +1,7 @@ /* * net/tipc/name_table.c: TIPC name table code * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2004-2008, 2010-2011, Wind River Systems * All rights reserved. * @@ -42,6 +42,12 @@ #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ +static const struct nla_policy +tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = { + [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED } +}; + /** * struct name_info - name sequence publication info * @node_list: circular list of publications made by own node @@ -995,3 +1001,185 @@ void tipc_nametbl_stop(void) table.types = NULL; write_unlock_bh(&tipc_nametbl_lock); } + +int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct name_seq *seq, + struct sub_seq *sseq, u32 *last_publ) +{ + void *hdr; + struct nlattr *attrs; + struct nlattr *publ; + struct publication *p; + + if (*last_publ) { + list_for_each_entry(p, &sseq->info->zone_list, zone_list) + if (p->key == *last_publ) + break; + if (p->key != *last_publ) + return -EPIPE; + } else { + p = list_first_entry(&sseq->info->zone_list, struct publication, + zone_list); + } + + list_for_each_entry_from(p, &sseq->info->zone_list, zone_list) { + *last_publ = p->key; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, + &tipc_genl_v2_family, NLM_F_MULTI, + TIPC_NL_NAME_TABLE_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE); + if (!attrs) + goto msg_full; + + publ = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); + if (!publ) + goto attr_msg_full; + + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, seq->type)) + goto publ_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sseq->lower)) + goto publ_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sseq->upper)) + goto publ_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) + goto publ_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node)) + goto publ_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->ref)) + goto publ_msg_full; + if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) + goto publ_msg_full; + + nla_nest_end(msg->skb, publ); + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + } + *last_publ = 0; + + return 0; + +publ_msg_full: + nla_nest_cancel(msg->skb, publ); +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int __tipc_nl_subseq_list(struct tipc_nl_msg *msg, struct name_seq *seq, + u32 *last_lower, u32 *last_publ) +{ + struct sub_seq *sseq; + struct sub_seq *sseq_start; + int err; + + if (*last_lower) { + sseq_start = nameseq_find_subseq(seq, *last_lower); + if (!sseq_start) + return -EPIPE; + } else { + sseq_start = seq->sseqs; + } + + for (sseq = sseq_start; sseq != &seq->sseqs[seq->first_free]; sseq++) { + err = __tipc_nl_add_nametable_publ(msg, seq, sseq, last_publ); + if (err) { + *last_lower = sseq->lower; + return err; + } + } + *last_lower = 0; + + return 0; +} + +int __tipc_nl_seq_list(struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, + u32 *last_publ) +{ + struct hlist_head *seq_head; + struct name_seq *seq; + int err; + int i; + + if (*last_type) + i = hash(*last_type); + else + i = 0; + + for (; i < TIPC_NAMETBL_SIZE; i++) { + seq_head = &table.types[i]; + + if (*last_type) { + seq = nametbl_find_seq(*last_type); + if (!seq) + return -EPIPE; + } else { + seq = hlist_entry_safe((seq_head)->first, + struct name_seq, ns_list); + if (!seq) + continue; + } + + hlist_for_each_entry_from(seq, ns_list) { + spin_lock_bh(&seq->lock); + + err = __tipc_nl_subseq_list(msg, seq, last_lower, + last_publ); + + if (err) { + *last_type = seq->type; + spin_unlock_bh(&seq->lock); + return err; + } + spin_unlock_bh(&seq->lock); + } + *last_type = 0; + } + return 0; +} + +int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int err; + int done = cb->args[3]; + u32 last_type = cb->args[0]; + u32 last_lower = cb->args[1]; + u32 last_publ = cb->args[2]; + struct tipc_nl_msg msg; + + if (done) + return 0; + + msg.skb = skb; + msg.portid = NETLINK_CB(cb->skb).portid; + msg.seq = cb->nlh->nlmsg_seq; + + read_lock_bh(&tipc_nametbl_lock); + + err = __tipc_nl_seq_list(&msg, &last_type, &last_lower, &last_publ); + if (!err) { + done = 1; + } else if (err != -EMSGSIZE) { + /* We never set seq or call nl_dump_check_consistent() this + * means that setting prev_seq here will cause the consistence + * check to fail in the netlink callback handler. Resulting in + * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if + * we got an error. + */ + cb->prev_seq = 1; + } + + read_unlock_bh(&tipc_nametbl_lock); + + cb->args[0] = last_type; + cb->args[1] = last_lower; + cb->args[2] = last_publ; + cb->args[3] = done; + + return skb->len; +} diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f02f48b9a216..b38ebecac766 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -1,7 +1,7 @@ /* * net/tipc/name_table.h: Include file for TIPC name table code * - * Copyright (c) 2000-2006, Ericsson AB + * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * All rights reserved. * @@ -84,6 +84,8 @@ struct publication { extern rwlock_t tipc_nametbl_lock; +int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); + struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space); u32 tipc_nametbl_translate(u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(u32 type, u32 lower, u32 upper, u32 limit, diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index cb37d30378a8..b891e3905bc4 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -37,6 +37,7 @@ #include "core.h" #include "config.h" #include "socket.h" +#include "name_table.h" #include "bearer.h" #include "link.h" #include "node.h" @@ -81,7 +82,8 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = { [TIPC_NLA_LINK] = { .type = NLA_NESTED, }, [TIPC_NLA_MEDIA] = { .type = NLA_NESTED, }, [TIPC_NLA_NODE] = { .type = NLA_NESTED, }, - [TIPC_NLA_NET] = { .type = NLA_NESTED, } + [TIPC_NLA_NET] = { .type = NLA_NESTED, }, + [TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, } }; /* Legacy ASCII API */ @@ -185,6 +187,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .cmd = TIPC_NL_NET_SET, .doit = tipc_nl_net_set, .policy = tipc_nl_policy, + }, + { + .cmd = TIPC_NL_NAME_TABLE_GET, + .dumpit = tipc_nl_name_table_dump, + .policy = tipc_nl_policy, } }; -- cgit v1.2.3 From 2ad7bf3638411cb547f2823df08166c13ab04269 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Sun, 23 Nov 2014 23:07:46 -0800 Subject: ipvlan: Initial check-in of the IPVLAN driver. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This driver is very similar to the macvlan driver except that it uses L3 on the frame to determine the logical interface while functioning as packet dispatcher. It inherits L2 of the master device hence the packets on wire will have the same L2 for all the packets originating from all virtual devices off of the same master device. This driver was developed keeping the namespace use-case in mind. Hence most of the examples given here take that as the base setup where main-device belongs to the default-ns and virtual devices are assigned to the additional namespaces. The device operates in two different modes and the difference in these two modes in primarily in the TX side. (a) L2 mode : In this mode, the device behaves as a L2 device. TX processing upto L2 happens on the stack of the virtual device associated with (namespace). Packets are switched after that into the main device (default-ns) and queued for xmit. RX processing is simple and all multicast, broadcast (if applicable), and unicast belonging to the address(es) are delivered to the virtual devices. (b) L3 mode : In this mode, the device behaves like a L3 device. TX processing upto L3 happens on the stack of the virtual device associated with (namespace). Packets are switched to the main-device (default-ns) for the L2 processing. Hence the routing table of the default-ns will be used in this mode. RX processins is somewhat similar to the L2 mode except that in this mode only Unicast packets are delivered to the virtual device while main-dev will handle all other packets. The devices can be added using the "ip" command from the iproute2 package - ip link add link type ipvlan mode [ l2 | l3 ] Signed-off-by: Mahesh Bandewar Cc: Eric Dumazet Cc: Maciej Żenczykowski Cc: Laurent Chavey Cc: Tim Hockin Cc: Brandon Philips Cc: Pavel Emelianov Signed-off-by: David S. Miller --- Documentation/networking/ipvlan.txt | 107 +++++ drivers/net/Kconfig | 18 + drivers/net/Makefile | 1 + drivers/net/ipvlan/Makefile | 7 + drivers/net/ipvlan/ipvlan.h | 130 ++++++ drivers/net/ipvlan/ipvlan_core.c | 607 +++++++++++++++++++++++++++ drivers/net/ipvlan/ipvlan_main.c | 789 ++++++++++++++++++++++++++++++++++++ include/linux/netdevice.h | 4 + include/uapi/linux/if_link.h | 15 + 9 files changed, 1678 insertions(+) create mode 100644 Documentation/networking/ipvlan.txt create mode 100644 drivers/net/ipvlan/Makefile create mode 100644 drivers/net/ipvlan/ipvlan.h create mode 100644 drivers/net/ipvlan/ipvlan_core.c create mode 100644 drivers/net/ipvlan/ipvlan_main.c (limited to 'include/uapi') diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt new file mode 100644 index 000000000000..cf996394e466 --- /dev/null +++ b/Documentation/networking/ipvlan.txt @@ -0,0 +1,107 @@ + + IPVLAN Driver HOWTO + +Initial Release: + Mahesh Bandewar + +1. Introduction: + This is conceptually very similar to the macvlan driver with one major +exception of using L3 for mux-ing /demux-ing among slaves. This property makes +the master device share the L2 with it's slave devices. I have developed this +driver in conjuntion with network namespaces and not sure if there is use case +outside of it. + + +2. Building and Installation: + In order to build the driver, please select the config item CONFIG_IPVLAN. +The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module +(CONFIG_IPVLAN=m). + + +3. Configuration: + There are no module parameters for this driver and it can be configured +using IProute2/ip utility. + + ip link add link type ipvlan mode { l2 | L3 } + + e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 + + +4. Operating modes: + IPvlan has two modes of operation - L2 and L3. For a given master device, +you can select one of these two modes and all slaves on that master will +operate in the same (selected) mode. The RX mode is almost identical except +that in L3 mode the slaves wont receive any multicast / broadcast traffic. +L3 mode is more restrictive since routing is controlled from the other (mostly) +default namespace. + +4.1 L2 mode: + In this mode TX processing happens on the stack instance attached to the +slave device and packets are switched and queued to the master device to send +out. In this mode the slaves will RX/TX multicast and broadcast (if applicable) +as well. + +4.2 L3 mode: + In this mode TX processing upto L3 happens on the stack instance attached +to the slave device and packets are switched to the stack instance of the +master device for the L2 processing and routing from that instance will be +used before packets are queued on the outbound device. In this mode the slaves +will not receive nor can send multicast / broadcast traffic. + + +5. What to choose (macvlan vs. ipvlan)? + These two devices are very similar in many regards and the specific use +case could very well define which device to choose. if one of the following +situations defines your use case then you can choose to use ipvlan - + (a) The Linux host that is connected to the external switch / router has +policy configured that allows only one mac per port. + (b) No of virtual devices created on a master exceed the mac capacity and +puts the NIC in promiscous mode and degraded performance is a concern. + (c) If the slave device is to be put into the hostile / untrusted network +namespace where L2 on the slave could be changed / misused. + + +6. Example configuration: + + +=============================================================+ + | Host: host1 | + | | + | +----------------------+ +----------------------+ | + | | NS:ns0 | | NS:ns1 | | + | | | | | | + | | | | | | + | | ipvl0 | | ipvl1 | | + | +----------#-----------+ +-----------#----------+ | + | # # | + | ################################ | + | # eth0 | + +==============================#==============================+ + + + (a) Create two network namespaces - ns0, ns1 + ip netns add ns0 + ip netns add ns1 + + (b) Create two ipvlan slaves on eth0 (master device) + ip link add link eth0 ipvl0 type ipvlan mode l2 + ip link add link eth0 ipvl1 type ipvlan mode l2 + + (c) Assign slaves to the respective network namespaces + ip link set dev ipvl0 netns ns0 + ip link set dev ipvl1 netns ns1 + + (d) Now switch to the namespace (ns0 or ns1) to configure the slave devices + - For ns0 + (1) ip netns exec ns0 bash + (2) ip link set dev ipvl0 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl0 + (6) ip -4 route add default via $ROUTER dev ipvl0 + - For ns1 + (1) ip netns exec ns1 bash + (2) ip link set dev ipvl1 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl1 + (6) ip -4 route add default via $ROUTER dev ipvl1 diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index f9009be3f307..b6d64f546574 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -145,6 +145,24 @@ config MACVTAP To compile this driver as a module, choose M here: the module will be called macvtap. + +config IPVLAN + tristate "IP-VLAN support" + ---help--- + This allows one to create virtual devices off of a main interface + and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) + on packets. All interfaces (including the main interface) share L2 + making it transparent to the connected L2 switch. + + Ipvlan devices can be added using the "ip" command from the + iproute2 package starting with the iproute2-X.Y.ZZ release: + + "ip link add link [ NAME ] type ipvlan" + + To compile this driver as a module, choose M here: the module + will be called ipvlan. + + config VXLAN tristate "Virtual eXtensible Local Area Network (VXLAN)" depends on INET diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 61aefdd1e173..e25fdd7d905e 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -6,6 +6,7 @@ # Networking Core Drivers # obj-$(CONFIG_BONDING) += bonding/ +obj-$(CONFIG_IPVLAN) += ipvlan/ obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_EQUALIZER) += eql.o obj-$(CONFIG_IFB) += ifb.o diff --git a/drivers/net/ipvlan/Makefile b/drivers/net/ipvlan/Makefile new file mode 100644 index 000000000000..df79910192d6 --- /dev/null +++ b/drivers/net/ipvlan/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Ethernet Ipvlan driver +# + +obj-$(CONFIG_IPVLAN) += ipvlan.o + +ipvlan-objs := ipvlan_core.o ipvlan_main.o diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h new file mode 100644 index 000000000000..ab3e7614ed71 --- /dev/null +++ b/drivers/net/ipvlan/ipvlan.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 Mahesh Bandewar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ +#ifndef __IPVLAN_H +#define __IPVLAN_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPVLAN_DRV "ipvlan" +#define IPV_DRV_VER "0.1" + +#define IPVLAN_HASH_SIZE (1 << BITS_PER_BYTE) +#define IPVLAN_HASH_MASK (IPVLAN_HASH_SIZE - 1) + +#define IPVLAN_MAC_FILTER_BITS 8 +#define IPVLAN_MAC_FILTER_SIZE (1 << IPVLAN_MAC_FILTER_BITS) +#define IPVLAN_MAC_FILTER_MASK (IPVLAN_MAC_FILTER_SIZE - 1) + +typedef enum { + IPVL_IPV6 = 0, + IPVL_ICMPV6, + IPVL_IPV4, + IPVL_ARP, +} ipvl_hdr_type; + +struct ipvl_pcpu_stats { + u64 rx_pkts; + u64 rx_bytes; + u64 rx_mcast; + u64 tx_pkts; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 rx_errs; + u32 tx_drps; +}; + +struct ipvl_port; + +struct ipvl_dev { + struct net_device *dev; + struct list_head pnode; + struct ipvl_port *port; + struct net_device *phy_dev; + struct list_head addrs; + int ipv4cnt; + int ipv6cnt; + struct ipvl_pcpu_stats *pcpu_stats; + DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); + netdev_features_t sfeatures; + u32 msg_enable; + u16 mtu_adj; +}; + +struct ipvl_addr { + struct ipvl_dev *master; /* Back pointer to master */ + union { + struct in6_addr ip6; /* IPv6 address on logical interface */ + struct in_addr ip4; /* IPv4 address on logical interface */ + } ipu; +#define ip6addr ipu.ip6 +#define ip4addr ipu.ip4 + struct hlist_node hlnode; /* Hash-table linkage */ + struct list_head anode; /* logical-interface linkage */ + struct rcu_head rcu; + ipvl_hdr_type atype; +}; + +struct ipvl_port { + struct net_device *dev; + struct hlist_head hlhead[IPVLAN_HASH_SIZE]; + struct list_head ipvlans; + struct rcu_head rcu; + int count; + u16 mode; +}; + +static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d) +{ + return rcu_dereference(d->rx_handler_data); +} + +static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) +{ + return rtnl_dereference(d->rx_handler_data); +} + +static inline bool ipvlan_dev_master(struct net_device *d) +{ + return d->priv_flags & IFF_IPVLAN_MASTER; +} + +static inline bool ipvlan_dev_slave(struct net_device *d) +{ + return d->priv_flags & IFF_IPVLAN_SLAVE; +} + +void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev); +void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval); +void ipvlan_init_secret(void); +unsigned int ipvlan_mac_hash(const unsigned char *addr); +rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); +int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); +void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); +bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); +struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, + const void *iaddr, bool is_v6); +void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync); +#endif /* __IPVLAN_H */ diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c new file mode 100644 index 000000000000..a14d87783245 --- /dev/null +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -0,0 +1,607 @@ +/* Copyright (c) 2014 Mahesh Bandewar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ + +#include "ipvlan.h" + +static u32 ipvlan_jhash_secret; + +void ipvlan_init_secret(void) +{ + net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret)); +} + +static void ipvlan_count_rx(const struct ipvl_dev *ipvlan, + unsigned int len, bool success, bool mcast) +{ + if (!ipvlan) + return; + + if (likely(success)) { + struct ipvl_pcpu_stats *pcptr; + + pcptr = this_cpu_ptr(ipvlan->pcpu_stats); + u64_stats_update_begin(&pcptr->syncp); + pcptr->rx_pkts++; + pcptr->rx_bytes += len; + if (mcast) + pcptr->rx_mcast++; + u64_stats_update_end(&pcptr->syncp); + } else { + this_cpu_inc(ipvlan->pcpu_stats->rx_errs); + } +} + +static u8 ipvlan_get_v6_hash(const void *iaddr) +{ + const struct in6_addr *ip6_addr = iaddr; + + return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) & + IPVLAN_HASH_MASK; +} + +static u8 ipvlan_get_v4_hash(const void *iaddr) +{ + const struct in_addr *ip4_addr = iaddr; + + return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & + IPVLAN_HASH_MASK; +} + +struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, + const void *iaddr, bool is_v6) +{ + struct ipvl_addr *addr; + u8 hash; + + hash = is_v6 ? ipvlan_get_v6_hash(iaddr) : + ipvlan_get_v4_hash(iaddr); + hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) { + if (is_v6 && addr->atype == IPVL_IPV6 && + ipv6_addr_equal(&addr->ip6addr, iaddr)) + return addr; + else if (!is_v6 && addr->atype == IPVL_IPV4 && + addr->ip4addr.s_addr == + ((struct in_addr *)iaddr)->s_addr) + return addr; + } + return NULL; +} + +void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr) +{ + struct ipvl_port *port = ipvlan->port; + u8 hash; + + hash = (addr->atype == IPVL_IPV6) ? + ipvlan_get_v6_hash(&addr->ip6addr) : + ipvlan_get_v4_hash(&addr->ip4addr); + hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]); +} + +void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync) +{ + hlist_del_rcu(&addr->hlnode); + if (sync) + synchronize_rcu(); +} + +bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +{ + struct ipvl_port *port = ipvlan->port; + struct ipvl_addr *addr; + + list_for_each_entry(addr, &ipvlan->addrs, anode) { + if ((is_v6 && addr->atype == IPVL_IPV6 && + ipv6_addr_equal(&addr->ip6addr, iaddr)) || + (!is_v6 && addr->atype == IPVL_IPV4 && + addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr)) + return true; + } + + if (ipvlan_ht_addr_lookup(port, iaddr, is_v6)) + return true; + + return false; +} + +static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type) +{ + void *lyr3h = NULL; + + switch (skb->protocol) { + case htons(ETH_P_ARP): { + struct arphdr *arph; + + if (unlikely(!pskb_may_pull(skb, sizeof(*arph)))) + return NULL; + + arph = arp_hdr(skb); + *type = IPVL_ARP; + lyr3h = arph; + break; + } + case htons(ETH_P_IP): { + u32 pktlen; + struct iphdr *ip4h; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h)))) + return NULL; + + ip4h = ip_hdr(skb); + pktlen = ntohs(ip4h->tot_len); + if (ip4h->ihl < 5 || ip4h->version != 4) + return NULL; + if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) + return NULL; + + *type = IPVL_IPV4; + lyr3h = ip4h; + break; + } + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) + return NULL; + + ip6h = ipv6_hdr(skb); + if (ip6h->version != 6) + return NULL; + + *type = IPVL_IPV6; + lyr3h = ip6h; + /* Only Neighbour Solicitation pkts need different treatment */ + if (ipv6_addr_any(&ip6h->saddr) && + ip6h->nexthdr == NEXTHDR_ICMP) { + *type = IPVL_ICMPV6; + lyr3h = ip6h + 1; + } + break; + } + default: + return NULL; + } + + return lyr3h; +} + +unsigned int ipvlan_mac_hash(const unsigned char *addr) +{ + u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2), + ipvlan_jhash_secret); + + return hash & IPVLAN_MAC_FILTER_MASK; +} + +static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb, + const struct ipvl_dev *in_dev, bool local) +{ + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_dev *ipvlan; + struct sk_buff *nskb; + unsigned int len; + unsigned int mac_hash; + int ret; + + if (skb->protocol == htons(ETH_P_PAUSE)) + return; + + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + if (local && (ipvlan == in_dev)) + continue; + + mac_hash = ipvlan_mac_hash(eth->h_dest); + if (!test_bit(mac_hash, ipvlan->mac_filters)) + continue; + + ret = NET_RX_DROP; + len = skb->len + ETH_HLEN; + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + goto mcast_acct; + + if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast)) + nskb->pkt_type = PACKET_BROADCAST; + else + nskb->pkt_type = PACKET_MULTICAST; + + nskb->dev = ipvlan->dev; + if (local) + ret = dev_forward_skb(ipvlan->dev, nskb); + else + ret = netif_rx(nskb); +mcast_acct: + ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true); + } + + /* Locally generated? ...Forward a copy to the main-device as + * well. On the RX side we'll ignore it (wont give it to any + * of the virtual devices. + */ + if (local) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + if (ether_addr_equal(eth->h_dest, port->dev->broadcast)) + nskb->pkt_type = PACKET_BROADCAST; + else + nskb->pkt_type = PACKET_MULTICAST; + + dev_forward_skb(port->dev, nskb); + } + } +} + +static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb, + bool local) +{ + struct ipvl_dev *ipvlan = addr->master; + struct net_device *dev = ipvlan->dev; + unsigned int len; + rx_handler_result_t ret = RX_HANDLER_CONSUMED; + bool success = false; + + len = skb->len + ETH_HLEN; + if (unlikely(!(dev->flags & IFF_UP))) { + kfree_skb(skb); + goto out; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out; + + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + + if (local) { + if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) + success = true; + } else { + ret = RX_HANDLER_ANOTHER; + success = true; + } + +out: + ipvlan_count_rx(ipvlan, len, success, false); + return ret; +} + +static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, + void *lyr3h, int addr_type, + bool use_dest) +{ + struct ipvl_addr *addr = NULL; + + if (addr_type == IPVL_IPV6) { + struct ipv6hdr *ip6h; + struct in6_addr *i6addr; + + ip6h = (struct ipv6hdr *)lyr3h; + i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr; + addr = ipvlan_ht_addr_lookup(port, i6addr, true); + } else if (addr_type == IPVL_ICMPV6) { + struct nd_msg *ndmh; + struct in6_addr *i6addr; + + /* Make sure that the NeighborSolicitation ICMPv6 packets + * are handled to avoid DAD issue. + */ + ndmh = (struct nd_msg *)lyr3h; + if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { + i6addr = &ndmh->target; + addr = ipvlan_ht_addr_lookup(port, i6addr, true); + } + } else if (addr_type == IPVL_IPV4) { + struct iphdr *ip4h; + __be32 *i4addr; + + ip4h = (struct iphdr *)lyr3h; + i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr; + addr = ipvlan_ht_addr_lookup(port, i4addr, false); + } else if (addr_type == IPVL_ARP) { + struct arphdr *arph; + unsigned char *arp_ptr; + __be32 dip; + + arph = (struct arphdr *)lyr3h; + arp_ptr = (unsigned char *)(arph + 1); + if (use_dest) + arp_ptr += (2 * port->dev->addr_len) + 4; + else + arp_ptr += port->dev->addr_len; + + memcpy(&dip, arp_ptr, 4); + addr = ipvlan_ht_addr_lookup(port, &dip, false); + } + + return addr; +} + +static int ipvlan_process_v4_outbound(struct sk_buff *skb) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net_device *dev = skb->dev; + struct rtable *rt; + int err, ret = NET_XMIT_DROP; + struct flowi4 fl4 = { + .flowi4_oif = dev->iflink, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(dev_net(dev), &fl4, NULL); + if (IS_ERR(rt)) + goto err; + + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto err; + } + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + err = ip_local_out(skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} + +static int ipvlan_process_v6_outbound(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net_device *dev = skb->dev; + struct dst_entry *dst; + int err, ret = NET_XMIT_DROP; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_output(dev_net(dev), NULL, &fl6); + if (IS_ERR(dst)) + goto err; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + err = ip6_local_out(skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} + +static int ipvlan_process_outbound(struct sk_buff *skb, + const struct ipvl_dev *ipvlan) +{ + struct ethhdr *ethh = eth_hdr(skb); + int ret = NET_XMIT_DROP; + + /* In this mode we dont care about multicast and broadcast traffic */ + if (is_multicast_ether_addr(ethh->h_dest)) { + pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n", + ntohs(skb->protocol)); + kfree_skb(skb); + goto out; + } + + /* The ipvlan is a pseudo-L2 device, so the packets that we receive + * will have L2; which need to discarded and processed further + * in the net-ns of the main-device. + */ + if (skb_mac_header_was_set(skb)) { + skb_pull(skb, sizeof(*ethh)); + skb->mac_header = (typeof(skb->mac_header))~0U; + skb_reset_network_header(skb); + } + + if (skb->protocol == htons(ETH_P_IPV6)) + ret = ipvlan_process_v6_outbound(skb); + else if (skb->protocol == htons(ETH_P_IP)) + ret = ipvlan_process_v4_outbound(skb); + else { + pr_warn_ratelimited("Dropped outbound packet type=%x\n", + ntohs(skb->protocol)); + kfree_skb(skb); + } +out: + return ret; +} + +static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + void *lyr3h; + struct ipvl_addr *addr; + int addr_type; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) + return ipvlan_rcv_frame(addr, skb, true); + +out: + skb->dev = ipvlan->phy_dev; + return ipvlan_process_outbound(skb, ipvlan); +} + +static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; + + if (ether_addr_equal(eth->h_dest, eth->h_source)) { + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (lyr3h) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) + return ipvlan_rcv_frame(addr, skb, true); + } + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + /* Packet definitely does not belong to any of the + * virtual devices, but the dest is local. So forward + * the skb for the main-dev. At the RX side we just return + * RX_PASS for it to be processed further on the stack. + */ + return dev_forward_skb(ipvlan->phy_dev, skb); + + } else if (is_multicast_ether_addr(eth->h_dest)) { + u8 ip_summed = skb->ip_summed; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true); + skb->ip_summed = ip_summed; + } + + skb->dev = ipvlan->phy_dev; + return dev_queue_xmit(skb); +} + +int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev); + + if (!port) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + goto out; + + switch(port->mode) { + case IPVLAN_MODE_L2: + return ipvlan_xmit_mode_l2(skb, dev); + case IPVLAN_MODE_L3: + return ipvlan_xmit_mode_l3(skb, dev); + } + + /* Should not reach here */ + WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n", + port->mode); +out: + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) +{ + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; + + if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + return true; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); + if (addr) + return false; + } + + return true; +} + +static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, + struct ipvl_port *port) +{ + void *lyr3h; + int addr_type; + struct ipvl_addr *addr; + struct sk_buff *skb = *pskb; + rx_handler_result_t ret = RX_HANDLER_PASS; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); + if (addr) + ret = ipvlan_rcv_frame(addr, skb, false); + +out: + return ret; +} + +static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, + struct ipvl_port *port) +{ + struct sk_buff *skb = *pskb; + struct ethhdr *eth = eth_hdr(skb); + rx_handler_result_t ret = RX_HANDLER_PASS; + void *lyr3h; + int addr_type; + + if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_external_frame(skb, port)) + ipvlan_multicast_frame(port, skb, NULL, false); + } else { + struct ipvl_addr *addr; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + return ret; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); + if (addr) + ret = ipvlan_rcv_frame(addr, skb, false); + } + + return ret; +} + +rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); + + if (!port) + return RX_HANDLER_PASS; + + switch (port->mode) { + case IPVLAN_MODE_L2: + return ipvlan_handle_mode_l2(pskb, port); + case IPVLAN_MODE_L3: + return ipvlan_handle_mode_l3(pskb, port); + } + + /* Should not reach here */ + WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n", + port->mode); + kfree_skb(skb); + return NET_RX_DROP; +} diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c new file mode 100644 index 000000000000..c3df84bd2857 --- /dev/null +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -0,0 +1,789 @@ +/* Copyright (c) 2014 Mahesh Bandewar + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ + +#include "ipvlan.h" + +void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) +{ + ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; +} + +void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval) +{ + struct ipvl_dev *ipvlan; + + if (port->mode != nval) { + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + if (nval == IPVLAN_MODE_L3) + ipvlan->dev->flags |= IFF_NOARP; + else + ipvlan->dev->flags &= ~IFF_NOARP; + } + port->mode = nval; + } +} + +static int ipvlan_port_create(struct net_device *dev) +{ + struct ipvl_port *port; + int err, idx; + + if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) { + netdev_err(dev, "Master is either lo or non-ether device\n"); + return -EINVAL; + } + port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + port->dev = dev; + port->mode = IPVLAN_MODE_L3; + INIT_LIST_HEAD(&port->ipvlans); + for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) + INIT_HLIST_HEAD(&port->hlhead[idx]); + + err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); + if (err) + goto err; + + dev->priv_flags |= IFF_IPVLAN_MASTER; + return 0; + +err: + kfree_rcu(port, rcu); + return err; +} + +static void ipvlan_port_destroy(struct net_device *dev) +{ + struct ipvl_port *port = ipvlan_port_get_rtnl(dev); + + dev->priv_flags &= ~IFF_IPVLAN_MASTER; + netdev_rx_handler_unregister(dev); + kfree_rcu(port, rcu); +} + +/* ipvlan network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key ipvlan_netdev_xmit_lock_key; +static struct lock_class_key ipvlan_netdev_addr_lock_key; + +#define IPVLAN_FEATURES \ + (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ + NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \ + NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ + NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) + +#define IPVLAN_STATE_MASK \ + ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) + +static void ipvlan_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &ipvlan_netdev_xmit_lock_key); +} + +static void ipvlan_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &ipvlan_netdev_addr_lock_key); + netdev_for_each_tx_queue(dev, ipvlan_set_lockdep_class_one, NULL); +} + +static int ipvlan_init(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + const struct net_device *phy_dev = ipvlan->phy_dev; + + dev->state = (dev->state & ~IPVLAN_STATE_MASK) | + (phy_dev->state & IPVLAN_STATE_MASK); + dev->features = phy_dev->features & IPVLAN_FEATURES; + dev->features |= NETIF_F_LLTX; + dev->gso_max_size = phy_dev->gso_max_size; + dev->iflink = phy_dev->ifindex; + dev->hard_header_len = phy_dev->hard_header_len; + + ipvlan_set_lockdep_class(dev); + + ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats); + if (!ipvlan->pcpu_stats) + return -ENOMEM; + + return 0; +} + +static void ipvlan_uninit(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan->port; + + if (ipvlan->pcpu_stats) + free_percpu(ipvlan->pcpu_stats); + + port->count -= 1; + if (!port->count) + ipvlan_port_destroy(port->dev); +} + +static int ipvlan_open(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + struct ipvl_addr *addr; + + if (ipvlan->port->mode == IPVLAN_MODE_L3) + dev->flags |= IFF_NOARP; + else + dev->flags &= ~IFF_NOARP; + + if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) { + list_for_each_entry(addr, &ipvlan->addrs, anode) + ipvlan_ht_addr_add(ipvlan, addr); + } + return dev_uc_add(phy_dev, phy_dev->dev_addr); +} + +static int ipvlan_stop(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + struct ipvl_addr *addr; + + dev_uc_unsync(phy_dev, dev); + dev_mc_unsync(phy_dev, dev); + + dev_uc_del(phy_dev, phy_dev->dev_addr); + + if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) { + list_for_each_entry(addr, &ipvlan->addrs, anode) + ipvlan_ht_addr_del(addr, !dev->dismantle); + } + return 0; +} + +netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + int skblen = skb->len; + int ret; + + ret = ipvlan_queue_xmit(skb, dev); + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { + struct ipvl_pcpu_stats *pcptr; + + pcptr = this_cpu_ptr(ipvlan->pcpu_stats); + + u64_stats_update_begin(&pcptr->syncp); + pcptr->tx_pkts++; + pcptr->tx_bytes += skblen; + u64_stats_update_end(&pcptr->syncp); + } else { + this_cpu_inc(ipvlan->pcpu_stats->tx_drps); + } + return ret; +} + +static netdev_features_t ipvlan_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + return features & (ipvlan->sfeatures | ~IPVLAN_FEATURES); +} + +static void ipvlan_change_rx_flags(struct net_device *dev, int change) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + if (change & IFF_ALLMULTI) + dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); +} + +static void ipvlan_set_broadcast_mac_filter(struct ipvl_dev *ipvlan, bool set) +{ + struct net_device *dev = ipvlan->dev; + unsigned int hashbit = ipvlan_mac_hash(dev->broadcast); + + if (set && !test_bit(hashbit, ipvlan->mac_filters)) + __set_bit(hashbit, ipvlan->mac_filters); + else if (!set && test_bit(hashbit, ipvlan->mac_filters)) + __clear_bit(hashbit, ipvlan->mac_filters); +} + +static void ipvlan_set_multicast_mac_filter(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { + bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); + } else { + struct netdev_hw_addr *ha; + DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); + + bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); + netdev_for_each_mc_addr(ha, dev) + __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); + + bitmap_copy(ipvlan->mac_filters, mc_filters, + IPVLAN_MAC_FILTER_SIZE); + } + dev_uc_sync(ipvlan->phy_dev, dev); + dev_mc_sync(ipvlan->phy_dev, dev); +} + +static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (ipvlan->pcpu_stats) { + struct ipvl_pcpu_stats *pcptr; + u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; + u32 rx_errs = 0, tx_drps = 0; + u32 strt; + int idx; + + for_each_possible_cpu(idx) { + pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); + do { + strt= u64_stats_fetch_begin_irq(&pcptr->syncp); + rx_pkts = pcptr->rx_pkts; + rx_bytes = pcptr->rx_bytes; + rx_mcast = pcptr->rx_mcast; + tx_pkts = pcptr->tx_pkts; + tx_bytes = pcptr->tx_bytes; + } while (u64_stats_fetch_retry_irq(&pcptr->syncp, + strt)); + + s->rx_packets += rx_pkts; + s->rx_bytes += rx_bytes; + s->multicast += rx_mcast; + s->tx_packets += tx_pkts; + s->tx_bytes += tx_bytes; + + /* u32 values are updated without syncp protection. */ + rx_errs += pcptr->rx_errs; + tx_drps += pcptr->tx_drps; + } + s->rx_errors = rx_errs; + s->rx_dropped = rx_errs; + s->tx_dropped = tx_drps; + } + return s; +} + +static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + return vlan_vid_add(phy_dev, proto, vid); +} + +static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, + u16 vid) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + vlan_vid_del(phy_dev, proto, vid); + return 0; +} + +static const struct net_device_ops ipvlan_netdev_ops = { + .ndo_init = ipvlan_init, + .ndo_uninit = ipvlan_uninit, + .ndo_open = ipvlan_open, + .ndo_stop = ipvlan_stop, + .ndo_start_xmit = ipvlan_start_xmit, + .ndo_fix_features = ipvlan_fix_features, + .ndo_change_rx_flags = ipvlan_change_rx_flags, + .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, + .ndo_get_stats64 = ipvlan_get_stats64, + .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, +}; + +static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + /* TODO Probably use a different field than dev_addr so that the + * mac-address on the virtual device is portable and can be carried + * while the packets use the mac-addr on the physical device. + */ + return dev_hard_header(skb, phy_dev, type, daddr, + saddr ? : dev->dev_addr, len); +} + +static const struct header_ops ipvlan_header_ops = { + .create = ipvlan_hard_header, + .rebuild = eth_rebuild_header, + .parse = eth_header_parse, + .cache = eth_header_cache, + .cache_update = eth_header_cache_update, +}; + +static int ipvlan_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + + return __ethtool_get_settings(ipvlan->phy_dev, cmd); +} + +static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); +} + +static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + + return ipvlan->msg_enable; +} + +static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + ipvlan->msg_enable = value; +} + +static const struct ethtool_ops ipvlan_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_settings = ipvlan_ethtool_get_settings, + .get_drvinfo = ipvlan_ethtool_get_drvinfo, + .get_msglevel = ipvlan_ethtool_get_msglevel, + .set_msglevel = ipvlan_ethtool_set_msglevel, +}; + +static int ipvlan_nl_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + + if (data && data[IFLA_IPVLAN_MODE]) { + u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + ipvlan_set_port_mode(port, nmode); + } + return 0; +} + +static size_t ipvlan_nl_getsize(const struct net_device *dev) +{ + return (0 + + nla_total_size(2) /* IFLA_IPVLAN_MODE */ + ); +} + +static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (data && data[IFLA_IPVLAN_MODE]) { + u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX) + return -EINVAL; + } + return 0; +} + +static int ipvlan_nl_fillinfo(struct sk_buff *skb, + const struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + int ret = -EINVAL; + + if (!port) + goto err; + + ret = -EMSGSIZE; + if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) + goto err; + + return 0; + +err: + return ret; +} + +static int ipvlan_link_new(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port; + struct net_device *phy_dev; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!phy_dev) + return -ENODEV; + + if (ipvlan_dev_slave(phy_dev)) { + struct ipvl_dev *tmp = netdev_priv(phy_dev); + + phy_dev = tmp->phy_dev; + } else if (!ipvlan_dev_master(phy_dev)) { + err = ipvlan_port_create(phy_dev); + if (err < 0) + return err; + } + + port = ipvlan_port_get_rtnl(phy_dev); + if (data && data[IFLA_IPVLAN_MODE]) + port->mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + ipvlan->phy_dev = phy_dev; + ipvlan->dev = dev; + ipvlan->port = port; + ipvlan->sfeatures = IPVLAN_FEATURES; + INIT_LIST_HEAD(&ipvlan->addrs); + ipvlan->ipv4cnt = 0; + ipvlan->ipv6cnt = 0; + + /* TODO Probably put random address here to be presented to the + * world but keep using the physical-dev address for the outgoing + * packets. + */ + memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); + + dev->priv_flags |= IFF_IPVLAN_SLAVE; + + port->count += 1; + err = register_netdevice(dev); + if (err < 0) + goto ipvlan_destroy_port; + + err = netdev_upper_dev_link(phy_dev, dev); + if (err) + goto ipvlan_destroy_port; + + list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); + netif_stacked_transfer_operstate(phy_dev, dev); + return 0; + +ipvlan_destroy_port: + port->count -= 1; + if (!port->count) + ipvlan_port_destroy(phy_dev); + + return err; +} + +static void ipvlan_link_delete(struct net_device *dev, struct list_head *head) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_addr *addr, *next; + + if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) { + list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { + ipvlan_ht_addr_del(addr, !dev->dismantle); + list_del_rcu(&addr->anode); + } + } + list_del_rcu(&ipvlan->pnode); + unregister_netdevice_queue(dev, head); + netdev_upper_dev_unlink(ipvlan->phy_dev, dev); +} + +static void ipvlan_link_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags |= IFF_UNICAST_FLT; + dev->netdev_ops = &ipvlan_netdev_ops; + dev->destructor = free_netdev; + dev->header_ops = &ipvlan_header_ops; + dev->ethtool_ops = &ipvlan_ethtool_ops; + dev->tx_queue_len = 0; +} + +static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = +{ + [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, +}; + +static struct rtnl_link_ops ipvlan_link_ops = { + .kind = "ipvlan", + .priv_size = sizeof(struct ipvl_dev), + + .get_size = ipvlan_nl_getsize, + .policy = ipvlan_nl_policy, + .validate = ipvlan_nl_validate, + .fill_info = ipvlan_nl_fillinfo, + .changelink = ipvlan_nl_changelink, + .maxtype = IFLA_IPVLAN_MAX, + + .setup = ipvlan_link_setup, + .newlink = ipvlan_link_new, + .dellink = ipvlan_link_delete, +}; + +int ipvlan_link_register(struct rtnl_link_ops *ops) +{ + return rtnl_link_register(ops); +} + +static int ipvlan_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct ipvl_dev *ipvlan, *next; + struct ipvl_port *port; + LIST_HEAD(lst_kill); + + if (!ipvlan_dev_master(dev)) + return NOTIFY_DONE; + + port = ipvlan_port_get_rtnl(dev); + + switch (event) { + case NETDEV_CHANGE: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + netif_stacked_transfer_operstate(ipvlan->phy_dev, + ipvlan->dev); + break; + + case NETDEV_UNREGISTER: + if (dev->reg_state != NETREG_UNREGISTERING) + break; + + list_for_each_entry_safe(ipvlan, next, &port->ipvlans, + pnode) + ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, + &lst_kill); + unregister_netdevice_many(&lst_kill); + break; + + case NETDEV_FEAT_CHANGE: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + ipvlan->dev->features = dev->features & IPVLAN_FEATURES; + ipvlan->dev->gso_max_size = dev->gso_max_size; + netdev_features_change(ipvlan->dev); + } + break; + + case NETDEV_CHANGEMTU: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + ipvlan_adjust_mtu(ipvlan, dev); + break; + + case NETDEV_PRE_TYPE_CHANGE: + /* Forbid underlying device to change its type. */ + return NOTIFY_BAD; + } + return NOTIFY_DONE; +} + +static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) +{ + struct ipvl_addr *addr; + + if (ipvlan_addr_busy(ipvlan, ip6_addr, true)) { + netif_err(ipvlan, ifup, ipvlan->dev, + "Failed to add IPv6=%pI6c addr for %s intf\n", + ip6_addr, ipvlan->dev->name); + return -EINVAL; + } + addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); + if (!addr) + return -ENOMEM; + + addr->master = ipvlan; + memcpy(&addr->ip6addr, ip6_addr, sizeof(struct in6_addr)); + addr->atype = IPVL_IPV6; + list_add_tail_rcu(&addr->anode, &ipvlan->addrs); + ipvlan->ipv6cnt++; + ipvlan_ht_addr_add(ipvlan, addr); + + return 0; +} + +static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) +{ + struct ipvl_addr *addr; + + addr = ipvlan_ht_addr_lookup(ipvlan->port, ip6_addr, true); + if (!addr) + return; + + ipvlan_ht_addr_del(addr, true); + list_del_rcu(&addr->anode); + ipvlan->ipv6cnt--; + WARN_ON(ipvlan->ipv6cnt < 0); + kfree_rcu(addr, rcu); + + return; +} + +static int ipvlan_addr6_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; + struct net_device *dev = (struct net_device *)if6->idev->dev; + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (!ipvlan_dev_slave(dev)) + return NOTIFY_DONE; + + if (!ipvlan || !ipvlan->port) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + if (ipvlan_add_addr6(ipvlan, &if6->addr)) + return NOTIFY_BAD; + break; + + case NETDEV_DOWN: + ipvlan_del_addr6(ipvlan, &if6->addr); + break; + } + + return NOTIFY_OK; +} + +static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) +{ + struct ipvl_addr *addr; + + if (ipvlan_addr_busy(ipvlan, ip4_addr, false)) { + netif_err(ipvlan, ifup, ipvlan->dev, + "Failed to add IPv4=%pI4 on %s intf.\n", + ip4_addr, ipvlan->dev->name); + return -EINVAL; + } + addr = kzalloc(sizeof(struct ipvl_addr), GFP_KERNEL); + if (!addr) + return -ENOMEM; + + addr->master = ipvlan; + memcpy(&addr->ip4addr, ip4_addr, sizeof(struct in_addr)); + addr->atype = IPVL_IPV4; + list_add_tail_rcu(&addr->anode, &ipvlan->addrs); + ipvlan->ipv4cnt++; + ipvlan_ht_addr_add(ipvlan, addr); + ipvlan_set_broadcast_mac_filter(ipvlan, true); + + return 0; +} + +static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) +{ + struct ipvl_addr *addr; + + addr = ipvlan_ht_addr_lookup(ipvlan->port, ip4_addr, false); + if (!addr) + return; + + ipvlan_ht_addr_del(addr, true); + list_del_rcu(&addr->anode); + ipvlan->ipv4cnt--; + WARN_ON(ipvlan->ipv4cnt < 0); + if (!ipvlan->ipv4cnt) + ipvlan_set_broadcast_mac_filter(ipvlan, false); + kfree_rcu(addr, rcu); + + return; +} + +static int ipvlan_addr4_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; + struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct in_addr ip4_addr; + + if (!ipvlan_dev_slave(dev)) + return NOTIFY_DONE; + + if (!ipvlan || !ipvlan->port) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + ip4_addr.s_addr = if4->ifa_address; + if (ipvlan_add_addr4(ipvlan, &ip4_addr)) + return NOTIFY_BAD; + break; + + case NETDEV_DOWN: + ip4_addr.s_addr = if4->ifa_address; + ipvlan_del_addr4(ipvlan, &ip4_addr); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { + .notifier_call = ipvlan_addr4_event, +}; + +static struct notifier_block ipvlan_notifier_block __read_mostly = { + .notifier_call = ipvlan_device_event, +}; + +static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { + .notifier_call = ipvlan_addr6_event, +}; + +static int __init ipvlan_init_module(void) +{ + int err; + + ipvlan_init_secret(); + register_netdevice_notifier(&ipvlan_notifier_block); + register_inet6addr_notifier(&ipvlan_addr6_notifier_block); + register_inetaddr_notifier(&ipvlan_addr4_notifier_block); + + err = ipvlan_link_register(&ipvlan_link_ops); + if (err < 0) + goto error; + + return 0; +error: + unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); + unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); + unregister_netdevice_notifier(&ipvlan_notifier_block); + return err; +} + +static void __exit ipvlan_cleanup_module(void) +{ + rtnl_link_unregister(&ipvlan_link_ops); + unregister_netdevice_notifier(&ipvlan_notifier_block); + unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); + unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); +} + +module_init(ipvlan_init_module); +module_exit(ipvlan_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mahesh Bandewar "); +MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); +MODULE_ALIAS_RTNL_LINK("ipvlan"); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5cd508787572..2cb772495f7a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1230,6 +1230,8 @@ enum netdev_priv_flags { IFF_LIVE_ADDR_CHANGE = 1<<20, IFF_MACVLAN = 1<<21, IFF_XMIT_DST_RELEASE_PERM = 1<<22, + IFF_IPVLAN_MASTER = 1<<23, + IFF_IPVLAN_SLAVE = 1<<24, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1255,6 +1257,8 @@ enum netdev_priv_flags { #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN #define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM +#define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER +#define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE /** * struct net_device - The DEVICE structure. diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7072d8325016..36bddc233633 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -330,6 +330,21 @@ enum macvlan_macaddr_mode { #define MACVLAN_FLAG_NOPROMISC 1 +/* IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_MAX +}; + /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, -- cgit v1.2.3 From 4f0372150b1fbf2167cfe21d2e6eac1933fb36ac Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Mon, 24 Nov 2014 14:24:54 +0100 Subject: tipc: add tipc_netlink.h to uapi Kbuild tipc_netlink.h is the user-space header for the new netlink api. It was accidentally left out of the uapi Kbuild list when the api was added. Signed-off-by: Richard Alpe Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 72298b6887ac..a1e8175cc488 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -389,6 +389,7 @@ header-y += times.h header-y += timex.h header-y += tiocl.h header-y += tipc_config.h +header-y += tipc_netlink.h header-y += tipc.h header-y += toshiba.h header-y += tty_flags.h -- cgit v1.2.3 From f895b252d4edf66b2895fb5a7b17a638665f3e1f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 17 Nov 2014 16:58:04 -0500 Subject: sunrpc: eliminate RPC_DEBUG It's always set to whatever CONFIG_SUNRPC_DEBUG is, so just use that. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- include/linux/sunrpc/auth.h | 2 +- include/linux/sunrpc/debug.h | 9 +++------ include/linux/sunrpc/sched.h | 8 ++++---- include/uapi/linux/nfsd/debug.h | 2 +- net/sunrpc/auth.c | 4 ++-- net/sunrpc/auth_generic.c | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/auth_gss/gss_generic_token.c | 2 +- net/sunrpc/auth_gss/gss_krb5_crypto.c | 2 +- net/sunrpc/auth_gss/gss_krb5_keys.c | 2 +- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 +- net/sunrpc/auth_gss/gss_krb5_seal.c | 2 +- net/sunrpc/auth_gss/gss_krb5_seqnum.c | 2 +- net/sunrpc/auth_gss/gss_krb5_unseal.c | 2 +- net/sunrpc/auth_gss/gss_krb5_wrap.c | 2 +- net/sunrpc/auth_gss/gss_mech_switch.c | 2 +- net/sunrpc/auth_gss/gss_rpc_xdr.h | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/auth_null.c | 4 ++-- net/sunrpc/auth_unix.c | 2 +- net/sunrpc/backchannel_rqst.c | 2 +- net/sunrpc/clnt.c | 6 +++--- net/sunrpc/rpcb_clnt.c | 2 +- net/sunrpc/sched.c | 4 ++-- net/sunrpc/sunrpc_syms.c | 4 ++-- net/sunrpc/svc.c | 2 +- net/sunrpc/sysctl.c | 2 +- net/sunrpc/xprt.c | 2 +- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/transport.c | 8 ++++---- net/sunrpc/xprtrdma/verbs.c | 8 ++++---- net/sunrpc/xprtsock.c | 8 ++++---- 32 files changed, 53 insertions(+), 56 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 8e030075fe79..a7cbb570cc5c 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -53,7 +53,7 @@ struct rpc_cred { struct rcu_head cr_rcu; struct rpc_auth * cr_auth; const struct rpc_credops *cr_ops; -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) unsigned long cr_magic; /* 0x0f4aa4f0 */ #endif unsigned long cr_expire; /* when to gc */ diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 9385bd74c860..51143757b8f7 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -14,9 +14,6 @@ /* * Enable RPC debugging/profiling. */ -#ifdef CONFIG_SUNRPC_DEBUG -#define RPC_DEBUG -#endif #ifdef CONFIG_TRACEPOINTS #define RPC_TRACEPOINTS #endif @@ -25,7 +22,7 @@ /* * Debugging macros etc */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) extern unsigned int rpc_debug; extern unsigned int nfs_debug; extern unsigned int nfsd_debug; @@ -36,7 +33,7 @@ extern unsigned int nlm_debug; #define dprintk_rcu(args...) dfprintk_rcu(FACILITY, ## args) #undef ifdebug -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) # define dfprintk(fac, args...) \ @@ -65,7 +62,7 @@ extern unsigned int nlm_debug; /* * Sysctl interface for RPC debugging */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) void rpc_register_sysctl(void); void rpc_unregister_sysctl(void); #endif diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 1a8959944c5f..fecdbf1b4797 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -79,7 +79,7 @@ struct rpc_task { unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ -#if defined(RPC_DEBUG) || defined(RPC_TRACEPOINTS) +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || defined(RPC_TRACEPOINTS) unsigned short tk_pid; /* debugging aid */ #endif unsigned char tk_priority : 2,/* Task priority */ @@ -187,7 +187,7 @@ struct rpc_wait_queue { unsigned char nr; /* # tasks remaining for cookie */ unsigned short qlen; /* total # tasks waiting in queue */ struct rpc_timer timer_list; -#if defined(RPC_DEBUG) || defined(RPC_TRACEPOINTS) +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || defined(RPC_TRACEPOINTS) const char * name; #endif }; @@ -237,7 +237,7 @@ void rpc_free(void *); int rpciod_up(void); void rpciod_down(void); int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct net; void rpc_show_tasks(struct net *); #endif @@ -251,7 +251,7 @@ static inline int rpc_wait_for_completion_task(struct rpc_task *task) return __rpc_wait_for_completion_task(task, NULL); } -#if defined(RPC_DEBUG) || defined (RPC_TRACEPOINTS) +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || defined (RPC_TRACEPOINTS) static inline const char * rpc_qname(const struct rpc_wait_queue *q) { return ((q && q->name) ? q->name : "unknown"); diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h index a6f453c740b8..1fdc95bb2375 100644 --- a/include/uapi/linux/nfsd/debug.h +++ b/include/uapi/linux/nfsd/debug.h @@ -15,7 +15,7 @@ * Enable debugging for nfsd. * Requires RPC_DEBUG. */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define NFSD_DEBUG 1 #endif diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 383eb919ac0b..47f38be4155f 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -16,7 +16,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif @@ -646,7 +646,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) cred->cr_magic = RPCAUTH_CRED_MAGIC; #endif cred->cr_uid = acred->uid; diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 6f6b829c9e8e..41248b1820c7 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -14,7 +14,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 53ed8d3f8897..dace13d7638e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -66,7 +66,7 @@ static unsigned int gss_expired_cred_retry_delay = GSS_RETRY_EXPIRED; #define GSS_KEY_EXPIRE_TIMEO 240 static unsigned int gss_key_expire_timeo = GSS_KEY_EXPIRE_TIMEO; -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c index c586e92bcf76..254defe446a7 100644 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -38,7 +38,7 @@ #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index f5ed9f6ece06..b5408e8a37f2 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -45,7 +45,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c index 24589bd2a4b6..234fa8d0fd9b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_keys.c +++ b/net/sunrpc/auth_gss/gss_krb5_keys.c @@ -61,7 +61,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 0d3c158ef8fa..28db442a0034 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -45,7 +45,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 42768e5c3994..1d74d653e6c0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -64,7 +64,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index 62ac90c62cb1..20d55c793eb6 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -35,7 +35,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 6c981ddc19f8..dcf9515d9aef 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -62,7 +62,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 4b614c604fe0..ca7e92a32f84 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -35,7 +35,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 92d5ab99fbf3..7063d856a598 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -46,7 +46,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.h b/net/sunrpc/auth_gss/gss_rpc_xdr.h index 685a688f3d8a..9d88c6239f01 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.h +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.h @@ -25,7 +25,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index c548ab213f76..de856ddf5fed 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -51,7 +51,7 @@ #include "gss_rpc_upcall.h" -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 712c123e04e9..c2a2b584a056 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -10,7 +10,7 @@ #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif @@ -138,7 +138,7 @@ struct rpc_cred null_cred = { .cr_ops = &null_credops, .cr_count = ATOMIC_INIT(1), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) .cr_magic = RPCAUTH_CRED_MAGIC, #endif }; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index d5d692366294..4feda2d0a833 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -25,7 +25,7 @@ struct unx_cred { #define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 9761a0da964d..651f49ab601f 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -27,7 +27,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) #define RPCDBG_FACILITY RPCDBG_TRANS #endif diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9acd6ce88db7..36c64ef460cf 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -42,7 +42,7 @@ #include "sunrpc.h" #include "netns.h" -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_CALL #endif @@ -1396,7 +1396,7 @@ rpc_restart_call(struct rpc_task *task) } EXPORT_SYMBOL_GPL(rpc_restart_call); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char *rpc_proc_name(const struct rpc_task *task) { const struct rpc_procinfo *proc = task->tk_msg.rpc_proc; @@ -2421,7 +2421,7 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int } EXPORT_SYMBOL_GPL(rpc_call_null); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static void rpc_show_header(void) { printk(KERN_INFO "-pid- flgs status -client- --rqstp- " diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 1891a1022c17..05202012bcfc 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -32,7 +32,7 @@ #include "netns.h" -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_BIND #endif diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index fe3441abdbe5..574b2977fc4b 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -24,7 +24,7 @@ #include "sunrpc.h" -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) #define RPCDBG_FACILITY RPCDBG_SCHED #endif @@ -258,7 +258,7 @@ static int rpc_wait_bit_killable(struct wait_bit_key *key) return 0; } -#if defined(RPC_DEBUG) || defined(RPC_TRACEPOINTS) +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || defined(RPC_TRACEPOINTS) static void rpc_task_set_debuginfo(struct rpc_task *task) { static atomic_t rpc_pid; diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index cd30120de9e4..f632e476ab6c 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -97,7 +97,7 @@ init_sunrpc(void) err = register_rpc_pipefs(); if (err) goto out4; -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif svc_init_xprt_sock(); /* svc sock transport */ @@ -123,7 +123,7 @@ cleanup_sunrpc(void) unregister_rpc_pipefs(); rpc_destroy_mempool(); unregister_pernet_subsys(&sunrpc_net_ops); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_unregister_sysctl(); #endif rcu_barrier(); /* Wait for completion of call_rcu()'s */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 371a8bbb43d6..2783fd80c229 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1042,7 +1042,7 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) /* * dprintk the given error with the address of the client that caused it. */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static __printf(2, 3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) { diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index c99c58e2ee66..887f0183b4c6 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -37,7 +37,7 @@ EXPORT_SYMBOL_GPL(nfsd_debug); unsigned int nlm_debug; EXPORT_SYMBOL_GPL(nlm_debug); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static struct ctl_table_header *sunrpc_table_header; static struct ctl_table sunrpc_table[]; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1b2e5e616cae..894d071426b2 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -57,7 +57,7 @@ * Local variables */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_XPRT #endif diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 6166c985fe24..df01d124936c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -49,11 +49,11 @@ #include -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS #endif -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ " read chunk", /* some argument via rdma read */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6a4615dd0261..ef58ebadb3ae 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -55,7 +55,7 @@ #include "xprt_rdma.h" -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS #endif @@ -75,7 +75,7 @@ static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; int xprt_rdma_pad_optimize = 0; -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; @@ -705,7 +705,7 @@ static void __exit xprt_rdma_cleanup(void) int rc; dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; @@ -736,7 +736,7 @@ static int __init xprt_rdma_init(void) dprintk("\tPadding %d\n\tMemreg %d\n", xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) sunrpc_table_header = register_sysctl_table(sunrpc_table); #endif diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 61c41298b4ea..b92b04083e40 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -57,7 +57,7 @@ * Globals/Macros */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_TRANS #endif @@ -313,7 +313,7 @@ rpcrdma_flush_cqs(struct rpcrdma_ep *ep) rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); } -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char * const conn[] = { "address resolved", "address error", @@ -344,7 +344,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_xprt *xprt = id->context; struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; #endif struct ib_qp_attr attr; @@ -408,7 +408,7 @@ connected: break; } -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (connstate == 1) { int ird = attr.max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 31c015196a29..87ce7e8bb8dc 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -75,7 +75,7 @@ static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; * someone else's file names! */ -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; @@ -186,7 +186,7 @@ static struct ctl_table sunrpc_table[] = { */ #define XS_IDLE_DISC_TO (5U * 60 * HZ) -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS #endif @@ -2991,7 +2991,7 @@ static struct xprt_class xs_bc_tcp_transport = { */ int init_socket_xprt(void) { -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) sunrpc_table_header = register_sysctl_table(sunrpc_table); #endif @@ -3010,7 +3010,7 @@ int init_socket_xprt(void) */ void cleanup_socket_xprt(void) { -#ifdef RPC_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; -- cgit v1.2.3 From 98f0334263f177dd22ca7c685cde04b47cc57b05 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 26 Nov 2014 12:42:02 +0100 Subject: cfg80211: clean up beacon loss CQM event Having it as a sub-event for RSSI thresholds is very ugly, but luckily no userspace actually uses the events yet. Move the event to its own function call internally and to its own event attribute in nl80211. Signed-off-by: Johannes Berg --- drivers/net/wireless/ti/wlcore/event.c | 5 +---- include/net/cfg80211.h | 9 +++++++++ include/net/mac80211.h | 8 ++++++++ include/uapi/linux/nl80211.h | 7 ++++--- net/mac80211/mlme.c | 14 +++++++++++--- net/mac80211/trace.h | 6 ++++++ net/wireless/nl80211.c | 23 +++++++++++++++++++++++ 7 files changed, 62 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/wireless/ti/wlcore/event.c b/drivers/net/wireless/ti/wlcore/event.c index 16d10281798d..5153640f4532 100644 --- a/drivers/net/wireless/ti/wlcore/event.c +++ b/drivers/net/wireless/ti/wlcore/event.c @@ -259,10 +259,7 @@ void wlcore_event_beacon_loss(struct wl1271 *wl, unsigned long roles_bitmap) &wlvif->connection_loss_work, msecs_to_jiffies(delay)); - ieee80211_cqm_rssi_notify( - vif, - NL80211_CQM_RSSI_BEACON_LOSS_EVENT, - GFP_KERNEL); + ieee80211_cqm_beacon_loss_notify(vif, GFP_KERNEL); } } EXPORT_SYMBOL_GPL(wlcore_event_beacon_loss); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1d15f1dfdaa7..4ebb816241fa 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4669,6 +4669,15 @@ void cfg80211_cqm_pktloss_notify(struct net_device *dev, void cfg80211_cqm_txe_notify(struct net_device *dev, const u8 *peer, u32 num_packets, u32 rate, u32 intvl, gfp_t gfp); +/** + * cfg80211_cqm_beacon_loss_notify - beacon loss event + * @dev: network device + * @gfp: context flags + * + * Notify userspace about beacon loss from the connected AP. + */ +void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp); + /** * cfg80211_radar_event - radar detection event * @wiphy: the wiphy diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cff3a26a9dae..66cbfe46428d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4671,6 +4671,14 @@ void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, enum nl80211_cqm_rssi_threshold_event rssi_event, gfp_t gfp); +/** + * ieee80211_cqm_beacon_loss_notify - inform CQM of beacon loss + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @gfp: context flags + */ +void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp); + /** * ieee80211_radar_detected - inform that a radar was detected * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d77524510435..b37bd5a1cb82 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3451,6 +3451,8 @@ enum nl80211_ps_state { * interval in which %NL80211_ATTR_CQM_TXE_PKTS and * %NL80211_ATTR_CQM_TXE_RATE must be satisfied before generating an * %NL80211_CMD_NOTIFY_CQM. Set to 0 to turn off TX error reporting. + * @NL80211_ATTR_CQM_BEACON_LOSS_EVENT: flag attribute that's set in a beacon + * loss event * @__NL80211_ATTR_CQM_AFTER_LAST: internal * @NL80211_ATTR_CQM_MAX: highest key attribute */ @@ -3463,6 +3465,7 @@ enum nl80211_attr_cqm { NL80211_ATTR_CQM_TXE_RATE, NL80211_ATTR_CQM_TXE_PKTS, NL80211_ATTR_CQM_TXE_INTVL, + NL80211_ATTR_CQM_BEACON_LOSS_EVENT, /* keep last */ __NL80211_ATTR_CQM_AFTER_LAST, @@ -3475,9 +3478,7 @@ enum nl80211_attr_cqm { * configured threshold * @NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH: The RSSI is higher than the * configured threshold - * @NL80211_CQM_RSSI_BEACON_LOSS_EVENT: The device experienced beacon loss. - * (Note that deauth/disassoc will still follow if the AP is not - * available. This event might get used as roaming event, etc.) + * @NL80211_CQM_RSSI_BEACON_LOSS_EVENT: (reserved, never sent) */ enum nl80211_cqm_rssi_threshold_event { NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 36e39ed4dfd9..8a23a64b9a61 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2266,9 +2266,7 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, "detected beacon loss from AP (missed %d beacons) - probing\n", beacon_loss_count); - ieee80211_cqm_rssi_notify(&sdata->vif, - NL80211_CQM_RSSI_BEACON_LOSS_EVENT, - GFP_KERNEL); + ieee80211_cqm_beacon_loss_notify(&sdata->vif, GFP_KERNEL); } /* @@ -4901,3 +4899,13 @@ void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, cfg80211_cqm_rssi_notify(sdata->dev, rssi_event, gfp); } EXPORT_SYMBOL(ieee80211_cqm_rssi_notify); + +void ieee80211_cqm_beacon_loss_notify(struct ieee80211_vif *vif, gfp_t gfp) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + trace_api_cqm_beacon_loss_notify(sdata->local, sdata); + + cfg80211_cqm_beacon_loss_notify(sdata->dev, gfp); +} +EXPORT_SYMBOL(ieee80211_cqm_beacon_loss_notify); diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 85ccfbe863db..8e461a02c6a8 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1829,6 +1829,12 @@ TRACE_EVENT(api_cqm_rssi_notify, ) ); +DEFINE_EVENT(local_sdata_evt, api_cqm_beacon_loss_notify, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata), + TP_ARGS(local, sdata) +); + TRACE_EVENT(api_scan_completed, TP_PROTO(struct ieee80211_local *local, bool aborted), diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9c4d0102d34d..e11980e74a04 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -11826,6 +11826,10 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, trace_cfg80211_cqm_rssi_notify(dev, rssi_event); + if (WARN_ON(rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW && + rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) + return; + msg = cfg80211_prepare_cqm(dev, NULL, gfp); if (!msg) return; @@ -11892,6 +11896,25 @@ void cfg80211_cqm_pktloss_notify(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_cqm_pktloss_notify); +void cfg80211_cqm_beacon_loss_notify(struct net_device *dev, gfp_t gfp) +{ + struct sk_buff *msg; + + msg = cfg80211_prepare_cqm(dev, NULL, gfp); + if (!msg) + return; + + if (nla_put_flag(msg, NL80211_ATTR_CQM_BEACON_LOSS_EVENT)) + goto nla_put_failure; + + cfg80211_send_cqm(msg, gfp); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_cqm_beacon_loss_notify); + static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, const u8 *replay_ctr, gfp_t gfp) -- cgit v1.2.3 From 766a85d7bc5d7f1ddd6de28bdb844eae45ec63b0 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 28 Nov 2014 05:26:34 +0000 Subject: arm64: ptrace: add NT_ARM_SYSTEM_CALL regset This regeset is intended to be used to get and set a system call number while tracing. There was some discussion about possible approaches to do so: (1) modify x8 register with ptrace(PTRACE_SETREGSET) indirectly, and update regs->syscallno later on in syscall_trace_enter(), or (2) define a dedicated regset for this purpose as on s390, or (3) support ptrace(PTRACE_SET_SYSCALL) as on arch/arm Thinking of the fact that user_pt_regs doesn't expose 'syscallno' to tracer as well as that secure_computing() expects a changed syscall number, especially case of -1, to be visible before this function returns in syscall_trace_enter(), (1) doesn't work well. We will take (2) since it looks much cleaner. Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 35 +++++++++++++++++++++++++++++++++++ include/uapi/linux/elf.h | 1 + 2 files changed, 36 insertions(+) (limited to 'include/uapi') diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 8a4ae8e73213..f576781d8d3b 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -551,6 +551,32 @@ static int tls_set(struct task_struct *target, const struct user_regset *regset, return ret; } +static int system_call_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int syscallno = task_pt_regs(target)->syscallno; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &syscallno, 0, -1); +} + +static int system_call_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int syscallno, ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &syscallno, 0, -1); + if (ret) + return ret; + + task_pt_regs(target)->syscallno = syscallno; + return ret; +} + enum aarch64_regset { REGSET_GPR, REGSET_FPR, @@ -559,6 +585,7 @@ enum aarch64_regset { REGSET_HW_BREAK, REGSET_HW_WATCH, #endif + REGSET_SYSTEM_CALL, }; static const struct user_regset aarch64_regsets[] = { @@ -608,6 +635,14 @@ static const struct user_regset aarch64_regsets[] = { .set = hw_break_set, }, #endif + [REGSET_SYSTEM_CALL] = { + .core_note_type = NT_ARM_SYSTEM_CALL, + .n = 1, + .size = sizeof(int), + .align = sizeof(int), + .get = system_call_get, + .set = system_call_set, + }, }; static const struct user_regset_view user_aarch64_view = { diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index ea9bf2561b9e..71e1d0ed92f7 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -397,6 +397,7 @@ typedef struct elf64_shdr { #define NT_ARM_TLS 0x401 /* ARM TLS register */ #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ #define NT_ARM_HW_WATCH 0x403 /* ARM hardware watchpoint registers */ +#define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_METAG_CBUF 0x500 /* Metag catch buffer registers */ #define NT_METAG_RPIPE 0x501 /* Metag read pipeline state */ #define NT_METAG_TLS 0x502 /* Metag TLS pointer */ -- cgit v1.2.3 From a99903ec4566eeeaaaf611499cae00abbe844938 Mon Sep 17 00:00:00 2001 From: Julien Lefrique Date: Tue, 21 Oct 2014 16:52:46 +0200 Subject: NFC: NCI: Handle Target mode activation Changes: * Extract the Listen mode activation parameters from RF_INTF_ACTIVATED_NTF. * Store the General Bytes of ATR_REQ. * Signal that Target mode is activated in case of an activation in NFC-DEP. * Update the NCI state accordingly. * Use the various constants defined in nfc.h. * Fix the ATR_REQ and ATR_RES maximum size. As per NCI 1.0 and NCI 1.1, the Activation Parameters for both Poll and Listen mode contain all the bytes of ATR_REQ/ATR_RES starting and including Byte 3 as defined in [DIGITAL]. In [DIGITAL], the maximum size of ATR_REQ/ATR_RES is 64 bytes and they are numbered starting from Byte 1. Signed-off-by: Julien Lefrique Signed-off-by: Samuel Ortiz --- include/net/nfc/nci.h | 26 +++++++-- include/net/nfc/nci_core.h | 3 + include/net/nfc/nfc.h | 2 + include/uapi/linux/nfc.h | 20 ++++--- net/nfc/nci/ntf.c | 133 +++++++++++++++++++++++++++++++++++++-------- 5 files changed, 146 insertions(+), 38 deletions(-) (limited to 'include/uapi') diff --git a/include/net/nfc/nci.h b/include/net/nfc/nci.h index 36cf65386b86..fffadc706e06 100644 --- a/include/net/nfc/nci.h +++ b/include/net/nfc/nci.h @@ -28,6 +28,8 @@ #ifndef __NCI_H #define __NCI_H +#include + /* NCI constants */ #define NCI_MAX_NUM_MAPPING_CONFIGS 10 #define NCI_MAX_NUM_RF_CONFIGS 10 @@ -73,6 +75,8 @@ #define NCI_NFC_A_ACTIVE_LISTEN_MODE 0x83 #define NCI_NFC_F_ACTIVE_LISTEN_MODE 0x85 +#define NCI_RF_TECH_MODE_LISTEN_MASK 0x80 + /* NCI RF Technologies */ #define NCI_NFC_RF_TECHNOLOGY_A 0x00 #define NCI_NFC_RF_TECHNOLOGY_B 0x01 @@ -324,26 +328,31 @@ struct nci_core_intf_error_ntf { struct rf_tech_specific_params_nfca_poll { __u16 sens_res; __u8 nfcid1_len; /* 0, 4, 7, or 10 Bytes */ - __u8 nfcid1[10]; + __u8 nfcid1[NFC_NFCID1_MAXSIZE]; __u8 sel_res_len; /* 0 or 1 Bytes */ __u8 sel_res; } __packed; struct rf_tech_specific_params_nfcb_poll { __u8 sensb_res_len; - __u8 sensb_res[12]; /* 11 or 12 Bytes */ + __u8 sensb_res[NFC_SENSB_RES_MAXSIZE]; /* 11 or 12 Bytes */ } __packed; struct rf_tech_specific_params_nfcf_poll { __u8 bit_rate; __u8 sensf_res_len; - __u8 sensf_res[18]; /* 16 or 18 Bytes */ + __u8 sensf_res[NFC_SENSF_RES_MAXSIZE]; /* 16 or 18 Bytes */ } __packed; struct rf_tech_specific_params_nfcv_poll { __u8 res_flags; __u8 dsfid; - __u8 uid[8]; /* 8 Bytes */ + __u8 uid[NFC_ISO15693_UID_MAXSIZE]; /* 8 Bytes */ +} __packed; + +struct rf_tech_specific_params_nfcf_listen { + __u8 local_nfcid2_len; + __u8 local_nfcid2[NFC_NFCID2_MAXSIZE]; /* 0 or 8 Bytes */ } __packed; struct nci_rf_discover_ntf { @@ -375,7 +384,12 @@ struct activation_params_nfcb_poll_iso_dep { struct activation_params_poll_nfc_dep { __u8 atr_res_len; - __u8 atr_res[63]; + __u8 atr_res[NFC_ATR_RES_MAXSIZE - 2]; /* ATR_RES from byte 3 */ +}; + +struct activation_params_listen_nfc_dep { + __u8 atr_req_len; + __u8 atr_req[NFC_ATR_REQ_MAXSIZE - 2]; /* ATR_REQ from byte 3 */ }; struct nci_rf_intf_activated_ntf { @@ -392,6 +406,7 @@ struct nci_rf_intf_activated_ntf { struct rf_tech_specific_params_nfcb_poll nfcb_poll; struct rf_tech_specific_params_nfcf_poll nfcf_poll; struct rf_tech_specific_params_nfcv_poll nfcv_poll; + struct rf_tech_specific_params_nfcf_listen nfcf_listen; } rf_tech_specific_params; __u8 data_exch_rf_tech_and_mode; @@ -403,6 +418,7 @@ struct nci_rf_intf_activated_ntf { struct activation_params_nfca_poll_iso_dep nfca_poll_iso_dep; struct activation_params_nfcb_poll_iso_dep nfcb_poll_iso_dep; struct activation_params_poll_nfc_dep poll_nfc_dep; + struct activation_params_listen_nfc_dep listen_nfc_dep; } activation_params; } __packed; diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 75d10e625c49..cfea60748a39 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -4,6 +4,7 @@ * * Copyright (C) 2011 Texas Instruments, Inc. * Copyright (C) 2013 Intel Corporation. All rights reserved. + * Copyright (C) 2014 Marvell International Ltd. * * Written by Ilan Elias * @@ -49,6 +50,8 @@ enum nci_state { NCI_W4_ALL_DISCOVERIES, NCI_W4_HOST_SELECT, NCI_POLL_ACTIVE, + NCI_LISTEN_ACTIVE, + NCI_LISTEN_SLEEP, }; /* NCI timeouts */ diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 6c583e244de2..12adb817c27a 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2011 Instituto Nokia de Tecnologia + * Copyright (C) 2014 Marvell International Ltd. * * Authors: * Lauro Ramos Venancio @@ -87,6 +88,7 @@ struct nfc_ops { #define NFC_TARGET_IDX_ANY -1 #define NFC_MAX_GT_LEN 48 #define NFC_ATR_RES_GT_OFFSET 15 +#define NFC_ATR_REQ_GT_OFFSET 14 /** * struct nfc_target - NFC target descriptiom diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 9b19b4461928..19a75daac14c 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -196,15 +196,17 @@ enum nfc_sdp_attr { }; #define NFC_SDP_ATTR_MAX (__NFC_SDP_ATTR_AFTER_LAST - 1) -#define NFC_DEVICE_NAME_MAXSIZE 8 -#define NFC_NFCID1_MAXSIZE 10 -#define NFC_NFCID2_MAXSIZE 8 -#define NFC_NFCID3_MAXSIZE 10 -#define NFC_SENSB_RES_MAXSIZE 12 -#define NFC_SENSF_RES_MAXSIZE 18 -#define NFC_GB_MAXSIZE 48 -#define NFC_FIRMWARE_NAME_MAXSIZE 32 -#define NFC_ISO15693_UID_MAXSIZE 8 +#define NFC_DEVICE_NAME_MAXSIZE 8 +#define NFC_NFCID1_MAXSIZE 10 +#define NFC_NFCID2_MAXSIZE 8 +#define NFC_NFCID3_MAXSIZE 10 +#define NFC_SENSB_RES_MAXSIZE 12 +#define NFC_SENSF_RES_MAXSIZE 18 +#define NFC_ATR_REQ_MAXSIZE 64 +#define NFC_ATR_RES_MAXSIZE 64 +#define NFC_GB_MAXSIZE 48 +#define NFC_FIRMWARE_NAME_MAXSIZE 32 +#define NFC_ISO15693_UID_MAXSIZE 8 /* NFC protocols */ #define NFC_PROTO_JEWEL 1 diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 205b35f666db..46b2a90ac55a 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -167,6 +167,18 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, return data; } +static __u8 *nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_listen *nfcf_listen, + __u8 *data) +{ + nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, + NFC_NFCID2_MAXSIZE); + memcpy(nfcf_listen->local_nfcid2, data, nfcf_listen->local_nfcid2_len); + data += nfcf_listen->local_nfcid2_len; + + return data; +} + __u32 nci_get_prop_rf_protocol(struct nci_dev *ndev, __u8 rf_protocol) { if (ndev->ops->get_rfprotocol) @@ -401,17 +413,29 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, struct nci_rf_intf_activated_ntf *ntf, __u8 *data) { struct activation_params_poll_nfc_dep *poll; + struct activation_params_listen_nfc_dep *listen; switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: case NCI_NFC_F_PASSIVE_POLL_MODE: poll = &ntf->activation_params.poll_nfc_dep; - poll->atr_res_len = min_t(__u8, *data++, 63); + poll->atr_res_len = min_t(__u8, *data++, + NFC_ATR_RES_MAXSIZE - 2); pr_debug("atr_res_len %d\n", poll->atr_res_len); if (poll->atr_res_len > 0) memcpy(poll->atr_res, data, poll->atr_res_len); break; + case NCI_NFC_A_PASSIVE_LISTEN_MODE: + case NCI_NFC_F_PASSIVE_LISTEN_MODE: + listen = &ntf->activation_params.listen_nfc_dep; + listen->atr_req_len = min_t(__u8, *data++, + NFC_ATR_REQ_MAXSIZE - 2); + pr_debug("atr_req_len %d\n", listen->atr_req_len); + if (listen->atr_req_len > 0) + memcpy(listen->atr_req, data, listen->atr_req_len); + break; + default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf->activation_rf_tech_and_mode); @@ -444,6 +468,50 @@ static void nci_target_auto_activated(struct nci_dev *ndev, nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets); } +static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, + struct nci_rf_intf_activated_ntf *ntf) +{ + ndev->remote_gb_len = 0; + + if (ntf->activation_params_len <= 0) + return NCI_STATUS_OK; + + switch (ntf->activation_rf_tech_and_mode) { + case NCI_NFC_A_PASSIVE_POLL_MODE: + case NCI_NFC_F_PASSIVE_POLL_MODE: + /* ATR_RES general bytes at offset 15 */ + ndev->remote_gb_len = min_t(__u8, + (ntf->activation_params.poll_nfc_dep.atr_res_len + - NFC_ATR_RES_GT_OFFSET), + NFC_MAX_GT_LEN); + memcpy(ndev->remote_gb, + (ntf->activation_params.poll_nfc_dep .atr_res + + NFC_ATR_RES_GT_OFFSET), + ndev->remote_gb_len); + break; + + case NCI_NFC_A_PASSIVE_LISTEN_MODE: + case NCI_NFC_F_PASSIVE_LISTEN_MODE: + /* ATR_REQ general bytes at offset 14 */ + ndev->remote_gb_len = min_t(__u8, + (ntf->activation_params.listen_nfc_dep.atr_req_len + - NFC_ATR_REQ_GT_OFFSET), + NFC_MAX_GT_LEN); + memcpy(ndev->remote_gb, + (ntf->activation_params.listen_nfc_dep.atr_req + + NFC_ATR_REQ_GT_OFFSET), + ndev->remote_gb_len); + break; + + default: + pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", + ntf->activation_rf_tech_and_mode); + return NCI_STATUS_RF_PROTOCOL_ERROR; + } + + return NCI_STATUS_OK; +} + static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { @@ -493,6 +561,16 @@ static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, &(ntf.rf_tech_specific_params.nfcv_poll), data); break; + case NCI_NFC_A_PASSIVE_LISTEN_MODE: + /* no RF technology specific parameters */ + break; + + case NCI_NFC_F_PASSIVE_LISTEN_MODE: + data = nci_extract_rf_params_nfcf_passive_listen(ndev, + &(ntf.rf_tech_specific_params.nfcf_listen), + data); + break; + default: pr_err("unsupported activation_rf_tech_and_mode 0x%x\n", ntf.activation_rf_tech_and_mode); @@ -546,32 +624,39 @@ exit: /* store general bytes to be reported later in dep_link_up */ if (ntf.rf_interface == NCI_RF_INTERFACE_NFC_DEP) { - ndev->remote_gb_len = 0; - - if (ntf.activation_params_len > 0) { - /* ATR_RES general bytes at offset 15 */ - ndev->remote_gb_len = min_t(__u8, - (ntf.activation_params - .poll_nfc_dep.atr_res_len - - NFC_ATR_RES_GT_OFFSET), - NFC_MAX_GT_LEN); - memcpy(ndev->remote_gb, - (ntf.activation_params.poll_nfc_dep - .atr_res + NFC_ATR_RES_GT_OFFSET), - ndev->remote_gb_len); - } + err = nci_store_general_bytes_nfc_dep(ndev, &ntf); + if (err != NCI_STATUS_OK) + pr_err("unable to store general bytes\n"); } } - if (atomic_read(&ndev->state) == NCI_DISCOVERY) { - /* A single target was found and activated automatically */ - atomic_set(&ndev->state, NCI_POLL_ACTIVE); - if (err == NCI_STATUS_OK) - nci_target_auto_activated(ndev, &ntf); - } else { /* ndev->state == NCI_W4_HOST_SELECT */ - /* A selected target was activated, so complete the request */ - atomic_set(&ndev->state, NCI_POLL_ACTIVE); - nci_req_complete(ndev, err); + if (!(ntf.activation_rf_tech_and_mode & NCI_RF_TECH_MODE_LISTEN_MASK)) { + /* Poll mode */ + if (atomic_read(&ndev->state) == NCI_DISCOVERY) { + /* A single target was found and activated + * automatically */ + atomic_set(&ndev->state, NCI_POLL_ACTIVE); + if (err == NCI_STATUS_OK) + nci_target_auto_activated(ndev, &ntf); + } else { /* ndev->state == NCI_W4_HOST_SELECT */ + /* A selected target was activated, so complete the + * request */ + atomic_set(&ndev->state, NCI_POLL_ACTIVE); + nci_req_complete(ndev, err); + } + } else { + /* Listen mode */ + atomic_set(&ndev->state, NCI_LISTEN_ACTIVE); + if (err == NCI_STATUS_OK && + ntf.rf_protocol == NCI_RF_PROTOCOL_NFC_DEP) { + err = nfc_tm_activated(ndev->nfc_dev, + NFC_PROTO_NFC_DEP_MASK, + NFC_COMM_PASSIVE, + ndev->remote_gb, + ndev->remote_gb_len); + if (err != NCI_STATUS_OK) + pr_err("error when signaling tm activation\n"); + } } } -- cgit v1.2.3 From d5735d64e1cebb3748e64f926fa7c35ffabd7afa Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 17 Nov 2014 09:06:42 -0300 Subject: [media] videodev2.h: improve colorspace support Add support for the new AdobeRGB and BT.2020 colorspaces as needed for HDMI 2.0. Add support to specify the Y'CbCr encoding and quantization range explicitly. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 99 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 1c2f84fd4d99..ced659e1b9f6 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -178,30 +178,103 @@ enum v4l2_memory { /* see also http://vektor.theorem.ca/graphics/ycbcr/ */ enum v4l2_colorspace { - /* ITU-R 601 -- broadcast NTSC/PAL */ + /* SMPTE 170M: used for broadcast NTSC/PAL SDTV */ V4L2_COLORSPACE_SMPTE170M = 1, - /* 1125-Line (US) HDTV */ + /* Obsolete pre-1998 SMPTE 240M HDTV standard, superseded by Rec 709 */ V4L2_COLORSPACE_SMPTE240M = 2, - /* HD and modern captures. */ + /* Rec.709: used for HDTV */ V4L2_COLORSPACE_REC709 = 3, - /* broken BT878 extents (601, luma range 16-253 instead of 16-235) */ + /* + * Deprecated, do not use. No driver will ever return this. This was + * based on a misunderstanding of the bt878 datasheet. + */ V4L2_COLORSPACE_BT878 = 4, - /* These should be useful. Assume 601 extents. */ + /* + * NTSC 1953 colorspace. This only makes sense when dealing with + * really, really old NTSC recordings. Superseded by SMPTE 170M. + */ V4L2_COLORSPACE_470_SYSTEM_M = 5, + + /* + * EBU Tech 3213 PAL/SECAM colorspace. This only makes sense when + * dealing with really old PAL/SECAM recordings. Superseded by + * SMPTE 170M. + */ V4L2_COLORSPACE_470_SYSTEM_BG = 6, - /* I know there will be cameras that send this. So, this is - * unspecified chromaticities and full 0-255 on each of the - * Y'CbCr components + /* + * Effectively shorthand for V4L2_COLORSPACE_SRGB, V4L2_YCBCR_ENC_601 + * and V4L2_QUANTIZATION_FULL_RANGE. To be used for (Motion-)JPEG. */ V4L2_COLORSPACE_JPEG = 7, - /* For RGB colourspaces, this is probably a good start. */ + /* For RGB colorspaces such as produces by most webcams. */ V4L2_COLORSPACE_SRGB = 8, + + /* AdobeRGB colorspace */ + V4L2_COLORSPACE_ADOBERGB = 9, + + /* BT.2020 colorspace, used for UHDTV. */ + V4L2_COLORSPACE_BT2020 = 10, +}; + +enum v4l2_ycbcr_encoding { + /* + * Mapping of V4L2_YCBCR_ENC_DEFAULT to actual encodings for the + * various colorspaces: + * + * V4L2_COLORSPACE_SMPTE170M, V4L2_COLORSPACE_470_SYSTEM_M, + * V4L2_COLORSPACE_470_SYSTEM_BG, V4L2_COLORSPACE_ADOBERGB and + * V4L2_COLORSPACE_JPEG: V4L2_YCBCR_ENC_601 + * + * V4L2_COLORSPACE_REC709: V4L2_YCBCR_ENC_709 + * + * V4L2_COLORSPACE_SRGB: V4L2_YCBCR_ENC_SYCC + * + * V4L2_COLORSPACE_BT2020: V4L2_YCBCR_ENC_BT2020 + * + * V4L2_COLORSPACE_SMPTE240M: V4L2_YCBCR_ENC_SMPTE240M + */ + V4L2_YCBCR_ENC_DEFAULT = 0, + + /* ITU-R 601 -- SDTV */ + V4L2_YCBCR_ENC_601 = 1, + + /* Rec. 709 -- HDTV */ + V4L2_YCBCR_ENC_709 = 2, + + /* ITU-R 601/EN 61966-2-4 Extended Gamut -- SDTV */ + V4L2_YCBCR_ENC_XV601 = 3, + + /* Rec. 709/EN 61966-2-4 Extended Gamut -- HDTV */ + V4L2_YCBCR_ENC_XV709 = 4, + + /* sYCC (Y'CbCr encoding of sRGB) */ + V4L2_YCBCR_ENC_SYCC = 5, + + /* BT.2020 Non-constant Luminance Y'CbCr */ + V4L2_YCBCR_ENC_BT2020 = 6, + + /* BT.2020 Constant Luminance Y'CbcCrc */ + V4L2_YCBCR_ENC_BT2020_CONST_LUM = 7, + + /* SMPTE 240M -- Obsolete HDTV */ + V4L2_YCBCR_ENC_SMPTE240M = 8, +}; + +enum v4l2_quantization { + /* + * The default for R'G'B' quantization is always full range. For + * Y'CbCr the quantization is always limited range, except for + * SYCC, XV601, XV709 or JPEG: those are full range. + */ + V4L2_QUANTIZATION_DEFAULT = 0, + V4L2_QUANTIZATION_FULL_RANGE = 1, + V4L2_QUANTIZATION_LIM_RANGE = 2, }; enum v4l2_priority { @@ -294,6 +367,8 @@ struct v4l2_pix_format { __u32 colorspace; /* enum v4l2_colorspace */ __u32 priv; /* private data, depends on pixelformat */ __u32 flags; /* format flags (V4L2_PIX_FMT_FLAG_*) */ + __u32 ycbcr_enc; /* enum v4l2_ycbcr_encoding */ + __u32 quantization; /* enum v4l2_quantization */ }; /* Pixel format FOURCC depth Description */ @@ -1777,6 +1852,8 @@ struct v4l2_plane_pix_format { * @plane_fmt: per-plane information * @num_planes: number of planes for this format * @flags: format flags (V4L2_PIX_FMT_FLAG_*) + * @ycbcr_enc: enum v4l2_ycbcr_encoding, Y'CbCr encoding + * @quantization: enum v4l2_quantization, colorspace quantization */ struct v4l2_pix_format_mplane { __u32 width; @@ -1788,7 +1865,9 @@ struct v4l2_pix_format_mplane { struct v4l2_plane_pix_format plane_fmt[VIDEO_MAX_PLANES]; __u8 num_planes; __u8 flags; - __u8 reserved[10]; + __u8 ycbcr_enc; + __u8 quantization; + __u8 reserved[8]; } __attribute__ ((packed)); /** -- cgit v1.2.3 From 11ff030c73655ecd7b153c6bd00d26d27e6cc199 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 17 Nov 2014 09:10:33 -0300 Subject: [media] v4l2-mediabus: improve colorspace support Add and copy the new ycbcr_enc and quantization fields. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/media/v4l2-mediabus.h | 4 ++++ include/uapi/linux/v4l2-mediabus.h | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/media/v4l2-mediabus.h b/include/media/v4l2-mediabus.h index 59d7397fb0ea..38d960d8dccd 100644 --- a/include/media/v4l2-mediabus.h +++ b/include/media/v4l2-mediabus.h @@ -94,6 +94,8 @@ static inline void v4l2_fill_pix_format(struct v4l2_pix_format *pix_fmt, pix_fmt->height = mbus_fmt->height; pix_fmt->field = mbus_fmt->field; pix_fmt->colorspace = mbus_fmt->colorspace; + pix_fmt->ycbcr_enc = mbus_fmt->ycbcr_enc; + pix_fmt->quantization = mbus_fmt->quantization; } static inline void v4l2_fill_mbus_format(struct v4l2_mbus_framefmt *mbus_fmt, @@ -104,6 +106,8 @@ static inline void v4l2_fill_mbus_format(struct v4l2_mbus_framefmt *mbus_fmt, mbus_fmt->height = pix_fmt->height; mbus_fmt->field = pix_fmt->field; mbus_fmt->colorspace = pix_fmt->colorspace; + mbus_fmt->ycbcr_enc = pix_fmt->ycbcr_enc; + mbus_fmt->quantization = pix_fmt->quantization; mbus_fmt->code = code; } diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h index b1934a3961e4..5a86d8ede09c 100644 --- a/include/uapi/linux/v4l2-mediabus.h +++ b/include/uapi/linux/v4l2-mediabus.h @@ -22,6 +22,8 @@ * @code: data format code (from enum v4l2_mbus_pixelcode) * @field: used interlacing type (from enum v4l2_field) * @colorspace: colorspace of the data (from enum v4l2_colorspace) + * @ycbcr_enc: YCbCr encoding of the data (from enum v4l2_ycbcr_encoding) + * @quantization: quantization of the data (from enum v4l2_quantization) */ struct v4l2_mbus_framefmt { __u32 width; @@ -29,7 +31,9 @@ struct v4l2_mbus_framefmt { __u32 code; __u32 field; __u32 colorspace; - __u32 reserved[7]; + __u32 ycbcr_enc; + __u32 quantization; + __u32 reserved[5]; }; #ifndef __KERNEL__ -- cgit v1.2.3 From 3f4994cfc15f38a3159c6e3a4b3ab2e1481a6b02 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 2 Dec 2014 16:52:06 +1100 Subject: kcmp: Move kcmp.h into uapi kcmp.h appears to be part of the API, it's documented in kcmp(2), and the selftests/kcmp code uses it. So move it to uapi so it's actually exported. Signed-off-by: Michael Ellerman Acked-by: Cyrill Gorcunov Signed-off-by: Shuah Khan --- include/linux/kcmp.h | 17 ----------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/kcmp.h | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 17 deletions(-) delete mode 100644 include/linux/kcmp.h create mode 100644 include/uapi/linux/kcmp.h (limited to 'include/uapi') diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h deleted file mode 100644 index 2dcd1b3aafc8..000000000000 --- a/include/linux/kcmp.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _LINUX_KCMP_H -#define _LINUX_KCMP_H - -/* Comparison type */ -enum kcmp_type { - KCMP_FILE, - KCMP_VM, - KCMP_FILES, - KCMP_FS, - KCMP_SIGHAND, - KCMP_IO, - KCMP_SYSVSEM, - - KCMP_TYPES, -}; - -#endif /* _LINUX_KCMP_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index b70237e8bc37..1cf50d682dbf 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -209,6 +209,7 @@ header-y += ivtvfb.h header-y += ixjuser.h header-y += jffs2.h header-y += joystick.h +header-y += kcmp.h header-y += kd.h header-y += kdev_t.h header-y += kernel-page-flags.h diff --git a/include/uapi/linux/kcmp.h b/include/uapi/linux/kcmp.h new file mode 100644 index 000000000000..84df14b37360 --- /dev/null +++ b/include/uapi/linux/kcmp.h @@ -0,0 +1,17 @@ +#ifndef _UAPI_LINUX_KCMP_H +#define _UAPI_LINUX_KCMP_H + +/* Comparison type */ +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + + KCMP_TYPES, +}; + +#endif /* _UAPI_LINUX_KCMP_H */ -- cgit v1.2.3 From 3682f49f32051765ed6eb77fc882f0458f7d44c3 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 2 Dec 2014 21:27:50 +0100 Subject: NFC: netlink: Add new netlink command NFC_CMD_ACTIVATE_TARGET Some tag might get deactivated after some read or write tentative. This may happen for example with Mifare Ultralight C tag when trying to read the last 4 blocks (starting block 0x2c) configured as write only. NFC_CMD_ACTIVATE_TARGET will try to reselect the tag in order to detect if it got remove from the field or if it is still present. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 1 + net/nfc/netlink.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 19a75daac14c..3c5efb1bc393 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -116,6 +116,7 @@ enum nfc_commands { NFC_EVENT_SE_TRANSACTION, NFC_CMD_GET_SE, NFC_CMD_SE_IO, + NFC_CMD_ACTIVATE_TARGET, /* private: internal use only */ __NFC_CMD_AFTER_LAST }; diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 43cb1c17e267..95818314aea6 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -810,6 +810,31 @@ out: return rc; } +static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) +{ + struct nfc_dev *dev; + u32 device_idx, target_idx, protocol; + int rc; + + if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + return -EINVAL; + + device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); + + dev = nfc_get_device(device_idx); + if (!dev) + return -ENODEV; + + target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); + protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); + + nfc_deactivate_target(dev, target_idx); + rc = nfc_activate_target(dev, target_idx, protocol); + + nfc_put_device(dev); + return 0; +} + static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; @@ -1455,6 +1480,11 @@ static const struct genl_ops nfc_genl_ops[] = { .doit = nfc_genl_se_io, .policy = nfc_genl_policy, }, + { + .cmd = NFC_CMD_ACTIVATE_TARGET, + .doit = nfc_genl_activate_target, + .policy = nfc_genl_policy, + }, }; -- cgit v1.2.3 From e479ce479743984a5d4581749f9aaa9c3bfd65e4 Mon Sep 17 00:00:00 2001 From: Julien Lefrique Date: Tue, 2 Dec 2014 16:25:01 +0100 Subject: NFC: NCI: Fix max length of General Bytes in ATR_RES The maximum size of ATR_REQ and ATR_RES is 64 bytes. The maximum number of General Bytes is calculated by the maximum number of data bytes in the ATR_REQ/ATR_RES, substracted by the number of mandatory data bytes. ATR_REQ: 16 mandatory data bytes, giving a maximum of 48 General Bytes. ATR_RES: 17 mandatory data bytes, giving a maximum of 47 General Bytes. Regression introduced in commit a99903ec. Signed-off-by: Julien Lefrique Signed-off-by: Samuel Ortiz --- include/uapi/linux/nfc.h | 2 ++ net/nfc/nci/ntf.c | 8 +++----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 3c5efb1bc393..8119255feae4 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -205,6 +205,8 @@ enum nfc_sdp_attr { #define NFC_SENSF_RES_MAXSIZE 18 #define NFC_ATR_REQ_MAXSIZE 64 #define NFC_ATR_RES_MAXSIZE 64 +#define NFC_ATR_REQ_GB_MAXSIZE 48 +#define NFC_ATR_RES_GB_MAXSIZE 47 #define NFC_GB_MAXSIZE 48 #define NFC_FIRMWARE_NAME_MAXSIZE 32 #define NFC_ISO15693_UID_MAXSIZE 8 diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 8dee73d0c4e1..22e453cb787d 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -479,24 +479,22 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, switch (ntf->activation_rf_tech_and_mode) { case NCI_NFC_A_PASSIVE_POLL_MODE: case NCI_NFC_F_PASSIVE_POLL_MODE: - /* ATR_RES general bytes at offset 15 */ ndev->remote_gb_len = min_t(__u8, (ntf->activation_params.poll_nfc_dep.atr_res_len - NFC_ATR_RES_GT_OFFSET), - NFC_MAX_GT_LEN); + NFC_ATR_RES_GB_MAXSIZE); memcpy(ndev->remote_gb, - (ntf->activation_params.poll_nfc_dep .atr_res + (ntf->activation_params.poll_nfc_dep.atr_res + NFC_ATR_RES_GT_OFFSET), ndev->remote_gb_len); break; case NCI_NFC_A_PASSIVE_LISTEN_MODE: case NCI_NFC_F_PASSIVE_LISTEN_MODE: - /* ATR_REQ general bytes at offset 14 */ ndev->remote_gb_len = min_t(__u8, (ntf->activation_params.listen_nfc_dep.atr_req_len - NFC_ATR_REQ_GT_OFFSET), - NFC_MAX_GT_LEN); + NFC_ATR_REQ_GB_MAXSIZE); memcpy(ndev->remote_gb, (ntf->activation_params.listen_nfc_dep.atr_req + NFC_ATR_REQ_GT_OFFSET), -- cgit v1.2.3 From b7485f6b035a87685ce35e0e52deee6467811eb0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:13 +0100 Subject: neigh: sort Neighbor Cache Entry Flags Suggested-by: Florian Fainelli Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 4a1d7e96dfe3..2f043af7fcd6 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -35,11 +35,10 @@ enum { */ #define NTF_USE 0x01 -#define NTF_PROXY 0x08 /* == ATF_PUBL */ -#define NTF_ROUTER 0x80 - #define NTF_SELF 0x02 #define NTF_MASTER 0x04 +#define NTF_PROXY 0x08 /* == ATF_PUBL */ +#define NTF_ROUTER 0x80 /* * Neighbor Cache Entry States. -- cgit v1.2.3 From 82f2841291cfaf4d225aa1766424280254d3e3b2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 28 Nov 2014 14:34:18 +0100 Subject: rtnl: expose physical switch id for particular device The netdevice represents a port in a switch, it will expose IFLA_PHYS_SWITCH_ID value via rtnl. Two netdevices with the same value belong to one physical switch. Signed-off-by: Jiri Pirko Reviewed-by: Thomas Graf Acked-by: John Fastabend Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 36bddc233633..623f1a7c7627 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -145,6 +145,7 @@ enum { IFLA_CARRIER, IFLA_PHYS_PORT_ID, IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0640f1fbe9dd..aa5cd2c53a56 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -869,7 +870,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -968,6 +970,24 @@ static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) return 0; } +static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) +{ + int err; + struct netdev_phys_item_id psid; + + err = netdev_switch_parent_id_get(dev, &psid); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + return -EMSGSIZE; + + return 0; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask) @@ -1040,6 +1060,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; + if (rtnl_phys_switch_id_fill(skb, dev)) + goto nla_put_failure; + attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (attr == NULL) @@ -1199,6 +1222,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ + [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From cf6b8e1eedffd9ef9a22c0c9453d752b07daf89a Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 28 Nov 2014 14:34:21 +0100 Subject: bridge: add API to notify bridge driver of learned FBD on offloaded device When the swdev device learns a new mac/vlan on a port, it sends some async notification to the driver and the driver installs an FDB in the device. To give a holistic system view, the learned mac/vlan should be reflected in the bridge's FBD table, so the user, using normal iproute2 cmds, can view what is currently learned by the device. This API on the bridge driver gives a way for the swdev driver to install an FBD entry in the bridge FBD table. (And remove one). This is equivalent to the device running these cmds: bridge fdb [add|del] dev vid master This patch needs some extra eyeballs for review, in paricular around the locking and contexts. Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 18 +++++++++ include/uapi/linux/neighbour.h | 1 + net/bridge/br_fdb.c | 91 +++++++++++++++++++++++++++++++++++++++++- net/bridge/br_private.h | 3 +- 4 files changed, 111 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 808dcb8cc04f..fa2eca625129 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -37,6 +37,24 @@ extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __use typedef int br_should_route_hook_t(struct sk_buff *skb); extern br_should_route_hook_t __rcu *br_should_route_hook; +#if IS_ENABLED(CONFIG_BRIDGE) +int br_fdb_external_learn_add(struct net_device *dev, + const unsigned char *addr, u16 vid); +int br_fdb_external_learn_del(struct net_device *dev, + const unsigned char *addr, u16 vid); +#else +static inline int br_fdb_external_learn_add(struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + return 0; +} +static inline int br_fdb_external_learn_del(struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + return 0; +} +#endif + #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 2f043af7fcd6..f3d77f9f1e0b 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -38,6 +38,7 @@ enum { #define NTF_SELF 0x02 #define NTF_MASTER 0x04 #define NTF_PROXY 0x08 /* == ATF_PUBL */ +#define NTF_EXT_LEARNED 0x10 #define NTF_ROUTER 0x80 /* diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b1be971eb06c..cc36e59db7d7 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -481,6 +481,7 @@ static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, fdb->is_local = 0; fdb->is_static = 0; fdb->added_by_user = 0; + fdb->added_by_external_learn = 0; fdb->updated = fdb->used = jiffies; hlist_add_head_rcu(&fdb->hlist, head); } @@ -613,7 +614,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = 0; + ndm->ndm_flags = fdb->added_by_external_learn ? NTF_EXT_LEARNED : 0; ndm->ndm_type = 0; ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex; ndm->ndm_state = fdb_to_nud(fdb); @@ -983,3 +984,91 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } } } + +int br_fdb_external_learn_add(struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct net_bridge_port *p; + struct net_bridge *br; + struct hlist_head *head; + struct net_bridge_fdb_entry *fdb; + int err = 0; + + rtnl_lock(); + + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: %s not a bridge port\n", dev->name); + err = -EINVAL; + goto err_rtnl_unlock; + } + + br = p->br; + + spin_lock_bh(&br->hash_lock); + + head = &br->hash[br_mac_hash(addr, vid)]; + fdb = fdb_find(head, addr, vid); + if (!fdb) { + fdb = fdb_create(head, p, addr, vid); + if (!fdb) { + err = -ENOMEM; + goto err_unlock; + } + fdb->added_by_external_learn = 1; + fdb_notify(br, fdb, RTM_NEWNEIGH); + } else if (fdb->added_by_external_learn) { + /* Refresh entry */ + fdb->updated = fdb->used = jiffies; + } else if (!fdb->added_by_user) { + /* Take over SW learned entry */ + fdb->added_by_external_learn = 1; + fdb->updated = jiffies; + fdb_notify(br, fdb, RTM_NEWNEIGH); + } + +err_unlock: + spin_unlock_bh(&br->hash_lock); +err_rtnl_unlock: + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL(br_fdb_external_learn_add); + +int br_fdb_external_learn_del(struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct net_bridge_port *p; + struct net_bridge *br; + struct hlist_head *head; + struct net_bridge_fdb_entry *fdb; + int err = 0; + + rtnl_lock(); + + p = br_port_get_rtnl(dev); + if (!p) { + pr_info("bridge: %s not a bridge port\n", dev->name); + err = -EINVAL; + goto err_rtnl_unlock; + } + + br = p->br; + + spin_lock_bh(&br->hash_lock); + + head = &br->hash[br_mac_hash(addr, vid)]; + fdb = fdb_find(head, addr, vid); + if (fdb && fdb->added_by_external_learn) + fdb_delete(br, fdb); + else + err = -ENOENT; + + spin_unlock_bh(&br->hash_lock); +err_rtnl_unlock: + rtnl_unlock(); + + return err; +} +EXPORT_SYMBOL(br_fdb_external_learn_del); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1b529da8234d..cc36fb3efbdd 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -100,7 +100,8 @@ struct net_bridge_fdb_entry mac_addr addr; unsigned char is_local:1, is_static:1, - added_by_user:1; + added_by_user:1, + added_by_external_learn:1; __u16 vlan_id; }; -- cgit v1.2.3 From efacacdaf7cb5a0592ed772e3731636b2742e34a Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 28 Nov 2014 14:34:23 +0100 Subject: bridge: add new brport flag LEARNING_SYNC This policy flag controls syncing of learned FDB entries to bridge's FDB. If on, FDB entries learned on bridge port device will be synced. If off, device may still learn new FDB entries but they will not be synced with bridge's FDB. Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Acked-by: Roopa Prabhu Acked-by: Jamal Hadi Salim Acked-by: Andy Gospodarek Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 2c81a8efd24d..0a8ce762a47f 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -43,6 +43,7 @@ struct br_ip_list { #define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING) #define BR_PROMISC BIT(7) #define BR_PROXYARP BIT(8) +#define BR_LEARNING_SYNC BIT(9) extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 623f1a7c7627..f7d0d2d7173a 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -245,6 +245,7 @@ enum { IFLA_BRPORT_LEARNING, /* mac learning */ IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 345cd494a324c149ef7293f4bedf2d7131a6de7c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Fri, 28 Nov 2014 14:34:24 +0100 Subject: bridge: add new hwmode swdev Current hwmode settings are "vepa" or "veb". These are for NIC interfaces with basic bridging function offloaded to HW. Add new "swdev" for full switch device offloads. Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index da17e456908d..296a556454e3 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -105,6 +105,7 @@ struct __fdb_entry { #define BRIDGE_MODE_VEB 0 /* Default loopback mode */ #define BRIDGE_MODE_VEPA 1 /* 802.1Qbg defined VEPA mode */ +#define BRIDGE_MODE_SWDEV 2 /* Full switch device offload */ /* Bridge management nested attributes * [IFLA_AF_SPEC] = { -- cgit v1.2.3 From a51b9199b1e092da5ee4a89852e84b4c52ae6044 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Sun, 30 Nov 2014 19:56:53 +0100 Subject: netfilter: ipset: Alignment problem between 64bit kernel 32bit userspace Sven-Haegar Koch reported the issue: sims:~# iptables -A OUTPUT -m set --match-set testset src -j ACCEPT iptables: Invalid argument. Run `dmesg' for more information. In syslog: x_tables: ip_tables: set.3 match: invalid size 48 (kernel) != (user) 32 which was introduced by the counter extension in ipset. The patch fixes the alignment issue with introducing a new set match revision with the fixed underlying 'struct ip_set_counter_match' structure. Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/ipset/ip_set.h | 8 +++- include/uapi/linux/netfilter/xt_set.h | 13 ++++- net/netfilter/xt_set.c | 73 +++++++++++++++++++++++++++-- 3 files changed, 88 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index ca03119111a2..5ab4e60894cf 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -256,11 +256,17 @@ enum { IPSET_COUNTER_GT, }; -struct ip_set_counter_match { +/* Backward compatibility for set match v3 */ +struct ip_set_counter_match0 { __u8 op; __u64 value; }; +struct ip_set_counter_match { + __aligned_u64 value; + __u8 op; +}; + /* Interface to iptables/ip6tables */ #define SO_IP_SET 83 diff --git a/include/uapi/linux/netfilter/xt_set.h b/include/uapi/linux/netfilter/xt_set.h index d6a1df1f2947..d4e02348384c 100644 --- a/include/uapi/linux/netfilter/xt_set.h +++ b/include/uapi/linux/netfilter/xt_set.h @@ -66,8 +66,8 @@ struct xt_set_info_target_v2 { struct xt_set_info_match_v3 { struct xt_set_info match_set; - struct ip_set_counter_match packets; - struct ip_set_counter_match bytes; + struct ip_set_counter_match0 packets; + struct ip_set_counter_match0 bytes; __u32 flags; }; @@ -81,4 +81,13 @@ struct xt_set_info_target_v3 { __u32 timeout; }; +/* Revision 4 match */ + +struct xt_set_info_match_v4 { + struct xt_set_info match_set; + struct ip_set_counter_match packets; + struct ip_set_counter_match bytes; + __u32 flags; +}; + #endif /*_XT_SET_H*/ diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 5732cd64acc0..0d47afea9682 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -157,7 +157,7 @@ set_match_v1_destroy(const struct xt_mtdtor_param *par) /* Revision 3 match */ static bool -match_counter(u64 counter, const struct ip_set_counter_match *info) +match_counter0(u64 counter, const struct ip_set_counter_match0 *info) { switch (info->op) { case IPSET_COUNTER_NONE: @@ -192,14 +192,60 @@ set_match_v3(const struct sk_buff *skb, struct xt_action_param *par) if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) return ret; - if (!match_counter(opt.ext.packets, &info->packets)) + if (!match_counter0(opt.ext.packets, &info->packets)) return 0; - return match_counter(opt.ext.bytes, &info->bytes); + return match_counter0(opt.ext.bytes, &info->bytes); } #define set_match_v3_checkentry set_match_v1_checkentry #define set_match_v3_destroy set_match_v1_destroy +/* Revision 4 match */ + +static bool +match_counter(u64 counter, const struct ip_set_counter_match *info) +{ + switch (info->op) { + case IPSET_COUNTER_NONE: + return true; + case IPSET_COUNTER_EQ: + return counter == info->value; + case IPSET_COUNTER_NE: + return counter != info->value; + case IPSET_COUNTER_LT: + return counter < info->value; + case IPSET_COUNTER_GT: + return counter > info->value; + } + return false; +} + +static bool +set_match_v4(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_set_info_match_v4 *info = par->matchinfo; + ADT_OPT(opt, par->family, info->match_set.dim, + info->match_set.flags, info->flags, UINT_MAX); + int ret; + + if (info->packets.op != IPSET_COUNTER_NONE || + info->bytes.op != IPSET_COUNTER_NONE) + opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS; + + ret = match_set(info->match_set.index, skb, par, &opt, + info->match_set.flags & IPSET_INV_MATCH); + + if (!(ret && opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS)) + return ret; + + if (!match_counter(opt.ext.packets, &info->packets)) + return 0; + return match_counter(opt.ext.bytes, &info->bytes); +} + +#define set_match_v4_checkentry set_match_v1_checkentry +#define set_match_v4_destroy set_match_v1_destroy + /* Revision 0 interface: backward compatible with netfilter/iptables */ static unsigned int @@ -573,6 +619,27 @@ static struct xt_match set_matches[] __read_mostly = { .destroy = set_match_v3_destroy, .me = THIS_MODULE }, + /* new revision for counters support: update, match */ + { + .name = "set", + .family = NFPROTO_IPV4, + .revision = 4, + .match = set_match_v4, + .matchsize = sizeof(struct xt_set_info_match_v4), + .checkentry = set_match_v4_checkentry, + .destroy = set_match_v4_destroy, + .me = THIS_MODULE + }, + { + .name = "set", + .family = NFPROTO_IPV6, + .revision = 4, + .match = set_match_v4, + .matchsize = sizeof(struct xt_set_info_match_v4), + .checkentry = set_match_v4_checkentry, + .destroy = set_match_v4_destroy, + .me = THIS_MODULE + }, }; static struct xt_target set_targets[] __read_mostly = { -- cgit v1.2.3 From 0b4bc768dc289731983ba5223555ffabe08eefdd Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 15 Oct 2014 13:38:15 -0300 Subject: [media] v4l: Add V4L2_SEL_TGT_NATIVE_SIZE selection target The V4L2_SEL_TGT_NATIVE_SIZE target is used to denote e.g. the size of a sensor's pixel array. Signed-off-by: Sakari Ailus Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/selections-common.xml | 16 ++++++++++++++++ include/uapi/linux/v4l2-common.h | 2 ++ 2 files changed, 18 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/selections-common.xml b/Documentation/DocBook/media/v4l/selections-common.xml index 7502f784b8cc..d6d56fb6f9c0 100644 --- a/Documentation/DocBook/media/v4l/selections-common.xml +++ b/Documentation/DocBook/media/v4l/selections-common.xml @@ -62,6 +62,22 @@ Yes Yes + + V4L2_SEL_TGT_NATIVE_SIZE + 0x0003 + The native size of the device, e.g. a sensor's + pixel array. left and + top fields are zero for this + target. Setting the native size will generally only make + sense for memory to memory devices where the software can + create a canvas of a given size in which for example a + video frame can be composed. In that case + V4L2_SEL_TGT_NATIVE_SIZE can be used to configure the size + of that canvas. + + Yes + Yes + V4L2_SEL_TGT_COMPOSE 0x0100 diff --git a/include/uapi/linux/v4l2-common.h b/include/uapi/linux/v4l2-common.h index 2f6f8cafe773..15273987093e 100644 --- a/include/uapi/linux/v4l2-common.h +++ b/include/uapi/linux/v4l2-common.h @@ -43,6 +43,8 @@ #define V4L2_SEL_TGT_CROP_DEFAULT 0x0001 /* Cropping bounds */ #define V4L2_SEL_TGT_CROP_BOUNDS 0x0002 +/* Native frame size */ +#define V4L2_SEL_TGT_NATIVE_SIZE 0x0003 /* Current composing area */ #define V4L2_SEL_TGT_COMPOSE 0x0100 /* Default composing area */ -- cgit v1.2.3 From 8af0345926c8cdbbf80102aa3eab8a6519c23272 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 18 Nov 2014 02:27:00 -0300 Subject: [media] v4l: Add input and output capability flags for native size setting Add input and output capability flags for setting native size of the device, and document them. Signed-off-by: Sakari Ailus Acked-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/DocBook/media/v4l/vidioc-enuminput.xml | 8 ++++++++ Documentation/DocBook/media/v4l/vidioc-enumoutput.xml | 8 ++++++++ include/uapi/linux/videodev2.h | 2 ++ 3 files changed, 18 insertions(+) (limited to 'include/uapi') diff --git a/Documentation/DocBook/media/v4l/vidioc-enuminput.xml b/Documentation/DocBook/media/v4l/vidioc-enuminput.xml index 493a39a8ef21..603fecef9083 100644 --- a/Documentation/DocBook/media/v4l/vidioc-enuminput.xml +++ b/Documentation/DocBook/media/v4l/vidioc-enuminput.xml @@ -287,6 +287,14 @@ input/output interface to linux-media@vger.kernel.org on 19 Oct 2009. 0x00000004 This input supports setting the TV standard by using VIDIOC_S_STD. + + V4L2_IN_CAP_NATIVE_SIZE + 0x00000008 + This input supports setting the native size using + the V4L2_SEL_TGT_NATIVE_SIZE + selection target, see . + diff --git a/Documentation/DocBook/media/v4l/vidioc-enumoutput.xml b/Documentation/DocBook/media/v4l/vidioc-enumoutput.xml index 2654e097df39..773fb1258c24 100644 --- a/Documentation/DocBook/media/v4l/vidioc-enumoutput.xml +++ b/Documentation/DocBook/media/v4l/vidioc-enumoutput.xml @@ -172,6 +172,14 @@ input/output interface to linux-media@vger.kernel.org on 19 Oct 2009. 0x00000004 This output supports setting the TV standard by using VIDIOC_S_STD. + + V4L2_OUT_CAP_NATIVE_SIZE + 0x00000008 + This output supports setting the native size using + the V4L2_SEL_TGT_NATIVE_SIZE + selection target, see . + diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index ced659e1b9f6..d279c1b75cf7 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -1324,6 +1324,7 @@ struct v4l2_input { #define V4L2_IN_CAP_DV_TIMINGS 0x00000002 /* Supports S_DV_TIMINGS */ #define V4L2_IN_CAP_CUSTOM_TIMINGS V4L2_IN_CAP_DV_TIMINGS /* For compatibility */ #define V4L2_IN_CAP_STD 0x00000004 /* Supports S_STD */ +#define V4L2_IN_CAP_NATIVE_SIZE 0x00000008 /* Supports setting native size */ /* * V I D E O O U T P U T S @@ -1347,6 +1348,7 @@ struct v4l2_output { #define V4L2_OUT_CAP_DV_TIMINGS 0x00000002 /* Supports S_DV_TIMINGS */ #define V4L2_OUT_CAP_CUSTOM_TIMINGS V4L2_OUT_CAP_DV_TIMINGS /* For compatibility */ #define V4L2_OUT_CAP_STD 0x00000004 /* Supports S_STD */ +#define V4L2_OUT_CAP_NATIVE_SIZE 0x00000008 /* Supports setting native size */ /* * C O N T R O L S -- cgit v1.2.3 From d0747f10ed5fec3d1f40c4b350dc9673011fc8e2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 4 Dec 2014 14:42:25 -0800 Subject: uapi: fix to export linux/vm_sockets.h A typo "header=y" was introduced by commit 7071cf7fc435 ("uapi: add missing network related headers to kbuild"). Signed-off-by: Masahiro Yamada Cc: Stephen Hemminger Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 4c94f31a8c99..8523f9bb72f2 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -427,7 +427,7 @@ header-y += virtio_net.h header-y += virtio_pci.h header-y += virtio_ring.h header-y += virtio_rng.h -header=y += vm_sockets.h +header-y += vm_sockets.h header-y += vt.h header-y += wait.h header-y += wanrouter.h -- cgit v1.2.3 From af8e80731a94ff9de9508b01d9e5d931d538dc6b Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Wed, 3 Dec 2014 20:55:42 +0100 Subject: crypto: af_alg - add user space interface for AEAD AEAD requires the caller to specify the following information separate from the data stream. This information allows the AEAD interface handler to identify the AAD, ciphertext/plaintext and the authentication tag: * Associated authentication data of arbitrary length and length * Length of authentication tag for encryption Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu --- crypto/af_alg.c | 6 ++++++ include/crypto/if_alg.h | 1 + include/uapi/linux/if_alg.h | 2 ++ 3 files changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 6a3ad8011585..68ff1137dd70 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -421,6 +421,12 @@ int af_alg_cmsg_send(struct msghdr *msg, struct af_alg_control *con) con->op = *(u32 *)CMSG_DATA(cmsg); break; + case ALG_SET_AEAD_ASSOCLEN: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(u32))) + return -EINVAL; + con->aead_assoclen = *(u32 *)CMSG_DATA(cmsg); + break; + default: return -EINVAL; } diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index d61c11170213..cd62bf4289e9 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -42,6 +42,7 @@ struct af_alg_completion { struct af_alg_control { struct af_alg_iv *iv; int op; + unsigned int aead_assoclen; }; struct af_alg_type { diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index 0f9acce5b1ff..f2acd2fde1f3 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -32,6 +32,8 @@ struct af_alg_iv { #define ALG_SET_KEY 1 #define ALG_SET_IV 2 #define ALG_SET_OP 3 +#define ALG_SET_AEAD_ASSOCLEN 4 +#define ALG_SET_AEAD_AUTHSIZE 5 /* Operations */ #define ALG_OP_DECRYPT 0 -- cgit v1.2.3 From 4fc11e61f46dd66730a18e8136607386007ee8c9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Dec 2014 10:16:17 +0900 Subject: uapi: fix to export linux/vm_sockets.h A typo "header=y" was introduced by commit 7071cf7fc435 (uapi: add missing network related headers to kbuild). Signed-off-by: Masahiro Yamada Cc: Stephen Hemminger Signed-off-by: David S. Miller --- include/uapi/linux/Kbuild | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 4c94f31a8c99..8523f9bb72f2 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -427,7 +427,7 @@ header-y += virtio_net.h header-y += virtio_pci.h header-y += virtio_ring.h header-y += virtio_rng.h -header=y += vm_sockets.h +header-y += vm_sockets.h header-y += vt.h header-y += wait.h header-y += wanrouter.h -- cgit v1.2.3 From ddd872bc3098f9d9abe1680a6b2013e59e3337f7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Dec 2014 15:06:34 -0800 Subject: bpf: verifier: add checks for BPF_ABS | BPF_IND instructions introduce program type BPF_PROG_TYPE_SOCKET_FILTER that is used for attaching programs to sockets where ctx == skb. add verifier checks for ABS/IND instructions which can only be seen in socket filters, therefore the check: if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a3d0f84f178..45da7ec7d274 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, }; /* flags for BPF_MAP_UPDATE_ELEM command */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6a1f7c14a67..a28e09c7825d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ +static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -1677,8 +1741,10 @@ process_bpf_exit: u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { - verbose("LD_ABS is not supported yet\n"); - return -EINVAL; + err = check_ld_abs(env, insn); + if (err) + return err; + } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) -- cgit v1.2.3 From 89aa075832b0da4402acebd698d0411dcc82d03e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Dec 2014 15:06:35 -0800 Subject: net: sock: allow eBPF programs to be attached to sockets introduce new setsockopt() command: setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) where prog_fd was received from syscall bpf(BPF_PROG_LOAD, attr, ...) and attr->prog_type == BPF_PROG_TYPE_SOCKET_FILTER setsockopt() calls bpf_prog_get() which increments refcnt of the program, so it doesn't get unloaded while socket is using the program. The same eBPF program can be attached to multiple sockets. User task exit automatically closes socket which calls sk_filter_uncharge() which decrements refcnt of eBPF program Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 3 ++ arch/avr32/include/uapi/asm/socket.h | 3 ++ arch/cris/include/uapi/asm/socket.h | 3 ++ arch/frv/include/uapi/asm/socket.h | 3 ++ arch/ia64/include/uapi/asm/socket.h | 3 ++ arch/m32r/include/uapi/asm/socket.h | 3 ++ arch/mips/include/uapi/asm/socket.h | 3 ++ arch/mn10300/include/uapi/asm/socket.h | 3 ++ arch/parisc/include/uapi/asm/socket.h | 3 ++ arch/powerpc/include/uapi/asm/socket.h | 3 ++ arch/s390/include/uapi/asm/socket.h | 3 ++ arch/sparc/include/uapi/asm/socket.h | 3 ++ arch/xtensa/include/uapi/asm/socket.h | 3 ++ include/linux/bpf.h | 4 ++ include/linux/filter.h | 1 + include/uapi/asm-generic/socket.h | 3 ++ net/core/filter.c | 97 +++++++++++++++++++++++++++++++++- net/core/sock.c | 13 +++++ 18 files changed, 155 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index e2fe0700b3b4..9a20821b111c 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -89,4 +89,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 92121b0f5b98..2b65ed6b277c 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -82,4 +82,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/cris/include/uapi/asm/socket.h b/arch/cris/include/uapi/asm/socket.h index 60f60f5b9b35..e2503d9f1869 100644 --- a/arch/cris/include/uapi/asm/socket.h +++ b/arch/cris/include/uapi/asm/socket.h @@ -84,6 +84,9 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index 2c6890209ea6..4823ad125578 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -82,5 +82,8 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 09a93fb566f6..59be3d87f86d 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -91,4 +91,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index e8589819c274..7bc4cb273856 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -82,4 +82,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 2e9ee8c55a10..dec3c850f36b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -100,4 +100,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index f3492e8c9f70..cab7d6d50051 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -82,4 +82,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 7984a1cab3da..a5cd40cd8ee1 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -81,4 +81,7 @@ #define SO_INCOMING_CPU 0x402A +#define SO_ATTACH_BPF 0x402B +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 3474e4ef166d..c046666038f8 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -89,4 +89,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 8457636c33e1..296942d56e6a 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -88,4 +88,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _ASM_SOCKET_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 4a8003a94163..e6a16c40be5f 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -78,6 +78,9 @@ #define SO_INCOMING_CPU 0x0033 +#define SO_ATTACH_BPF 0x0034 +#define SO_DETACH_BPF SO_DETACH_FILTER + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index c46f6a696849..4120af086160 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -93,4 +93,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* _XTENSA_SOCKET_H */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 75e94eaa228b..bbfceb756452 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -128,7 +128,11 @@ struct bpf_prog_aux { struct work_struct work; }; +#ifdef CONFIG_BPF_SYSCALL void bpf_prog_put(struct bpf_prog *prog); +#else +static inline void bpf_prog_put(struct bpf_prog *prog) {} +#endif struct bpf_prog *bpf_prog_get(u32 ufd); /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); diff --git a/include/linux/filter.h b/include/linux/filter.h index ca95abd2bed1..caac2087a4d5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -381,6 +381,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); +int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index f541ccefd4ac..5c15c2a5c123 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -84,4 +84,7 @@ #define SO_INCOMING_CPU 49 +#define SO_ATTACH_BPF 50 +#define SO_DETACH_BPF SO_DETACH_FILTER + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 647b12265e18..8cc3c03078b3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -44,6 +44,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -813,8 +814,12 @@ static void bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - bpf_release_orig_filter(prog); - bpf_prog_free(prog); + if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(prog); + } else { + bpf_release_orig_filter(prog); + bpf_prog_free(prog); + } } static void __sk_filter_release(struct sk_filter *fp) @@ -1088,6 +1093,94 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) } EXPORT_SYMBOL_GPL(sk_attach_filter); +#ifdef CONFIG_BPF_SYSCALL +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + struct sk_filter *fp, *old_fp; + struct bpf_prog *prog; + + if (sock_flag(sk, SOCK_FILTER_LOCKED)) + return -EPERM; + + prog = bpf_prog_get(ufd); + if (!prog) + return -EINVAL; + + if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + /* valid fd, but invalid program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + bpf_prog_put(prog); + return -ENOMEM; + } + fp->prog = prog; + + atomic_set(&fp->refcnt, 0); + + if (!sk_filter_charge(sk, fp)) { + __sk_filter_release(fp); + return -ENOMEM; + } + + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); + rcu_assign_pointer(sk->sk_filter, fp); + + if (old_fp) + sk_filter_uncharge(sk, old_fp); + + return 0; +} + +/* allow socket filters to call + * bpf_map_lookup_elem(), bpf_map_update_elem(), bpf_map_delete_elem() + */ +static const struct bpf_func_proto *sock_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + default: + return NULL; + } +} + +static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* skb fields cannot be accessed yet */ + return false; +} + +static struct bpf_verifier_ops sock_filter_ops = { + .get_func_proto = sock_filter_func_proto, + .is_valid_access = sock_filter_is_valid_access, +}; + +static struct bpf_prog_type_list tl = { + .ops = &sock_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, +}; + +static int __init register_sock_filter_ops(void) +{ + bpf_register_prog_type(&tl); + return 0; +} +late_initcall(register_sock_filter_ops); +#else +int sk_attach_bpf(u32 ufd, struct sock *sk) +{ + return -EOPNOTSUPP; +} +#endif int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/sock.c b/net/core/sock.c index 0725cf0cb685..9a56b2000c3f 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -888,6 +888,19 @@ set_rcvbuf: } break; + case SO_ATTACH_BPF: + ret = -EINVAL; + if (optlen == sizeof(u32)) { + u32 ufd; + + ret = -EFAULT; + if (copy_from_user(&ufd, optval, sizeof(ufd))) + break; + + ret = sk_attach_bpf(ufd, sk); + } + break; + case SO_DETACH_FILTER: ret = sk_detach_filter(sk); break; -- cgit v1.2.3 From bac78aabcfece0c493b2ad824c68fbdc20448cbc Mon Sep 17 00:00:00 2001 From: Andri Yngvason Date: Wed, 3 Dec 2014 17:54:13 +0000 Subject: can: dev: Consolidate and unify state change handling The handling of can error states is different between platforms. This is an attempt to correct that problem. I've moved this handling into a generic function for changing the error state. This ensures that error state changes are handled the same way everywhere (where this function is used). This new mechanism also adds reverse state transitioning in error frames, i.e. the user will be notified through the socket interface when the state goes down. Signed-off-by: Andri Yngvason Acked-by: Wolfgang Grandegger Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 78 ++++++++++++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 3 ++ include/uapi/linux/can/error.h | 1 + 3 files changed, 82 insertions(+) (limited to 'include/uapi') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 2cfe5012e4e5..3ec8f6f25e5f 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -273,6 +273,84 @@ static int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, return err; } +static void can_update_state_error_stats(struct net_device *dev, + enum can_state new_state) +{ + struct can_priv *priv = netdev_priv(dev); + + if (new_state <= priv->state) + return; + + switch (new_state) { + case CAN_STATE_ERROR_WARNING: + priv->can_stats.error_warning++; + break; + case CAN_STATE_ERROR_PASSIVE: + priv->can_stats.error_passive++; + break; + case CAN_STATE_BUS_OFF: + default: + break; + }; +} + +static int can_tx_state_to_frame(struct net_device *dev, enum can_state state) +{ + switch (state) { + case CAN_STATE_ERROR_ACTIVE: + return CAN_ERR_CRTL_ACTIVE; + case CAN_STATE_ERROR_WARNING: + return CAN_ERR_CRTL_TX_WARNING; + case CAN_STATE_ERROR_PASSIVE: + return CAN_ERR_CRTL_TX_PASSIVE; + default: + return 0; + } +} + +static int can_rx_state_to_frame(struct net_device *dev, enum can_state state) +{ + switch (state) { + case CAN_STATE_ERROR_ACTIVE: + return CAN_ERR_CRTL_ACTIVE; + case CAN_STATE_ERROR_WARNING: + return CAN_ERR_CRTL_RX_WARNING; + case CAN_STATE_ERROR_PASSIVE: + return CAN_ERR_CRTL_RX_PASSIVE; + default: + return 0; + } +} + +void can_change_state(struct net_device *dev, struct can_frame *cf, + enum can_state tx_state, enum can_state rx_state) +{ + struct can_priv *priv = netdev_priv(dev); + enum can_state new_state = max(tx_state, rx_state); + + if (unlikely(new_state == priv->state)) { + netdev_warn(dev, "%s: oops, state did not change", __func__); + return; + } + + netdev_dbg(dev, "New error state: %d\n", new_state); + + can_update_state_error_stats(dev, new_state); + priv->state = new_state; + + if (unlikely(new_state == CAN_STATE_BUS_OFF)) { + cf->can_id |= CAN_ERR_BUSOFF; + return; + } + + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] |= tx_state >= rx_state ? + can_tx_state_to_frame(dev, tx_state) : 0; + cf->data[1] |= tx_state <= rx_state ? + can_rx_state_to_frame(dev, rx_state) : 0; +} +EXPORT_SYMBOL_GPL(can_change_state); + /* * Local echo of CAN messages * diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index b37ea95bc348..c05ff0f9f9a5 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -127,6 +127,9 @@ void unregister_candev(struct net_device *dev); int can_restart_now(struct net_device *dev); void can_bus_off(struct net_device *dev); +void can_change_state(struct net_device *dev, struct can_frame *cf, + enum can_state tx_state, enum can_state rx_state); + void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); diff --git a/include/uapi/linux/can/error.h b/include/uapi/linux/can/error.h index c247446ab25a..1c508be9687f 100644 --- a/include/uapi/linux/can/error.h +++ b/include/uapi/linux/can/error.h @@ -71,6 +71,7 @@ #define CAN_ERR_CRTL_TX_PASSIVE 0x20 /* reached error passive status TX */ /* (at least one error counter exceeds */ /* the protocol-defined level of 127) */ +#define CAN_ERR_CRTL_ACTIVE 0x40 /* recovered to error active state */ /* error in CAN protocol (type) / data[2] */ #define CAN_ERR_PROT_UNSPEC 0x00 /* unspecified */ -- cgit v1.2.3 From 4fdace8d4f8bee9ba163646e8530394f9c36e67e Mon Sep 17 00:00:00 2001 From: Ley Foon Tan Date: Thu, 6 Nov 2014 15:19:54 +0800 Subject: Add ELF machine define for Nios2 Signed-off-by: Ley Foon Tan Acked-by: Arnd Bergmann --- include/uapi/linux/elf-em.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index aa90bc98b6e2..ae99f7743cf4 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -34,6 +34,7 @@ #define EM_MN10300 89 /* Panasonic/MEI MN10300, AM33 */ #define EM_OPENRISC 92 /* OpenRISC 32-bit embedded processor */ #define EM_BLACKFIN 106 /* ADI Blackfin Processor */ +#define EM_ALTERA_NIOS2 113 /* Altera Nios II soft-core processor */ #define EM_TI_C6000 140 /* TI C6X DSPs */ #define EM_AARCH64 183 /* ARM 64 bit */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ -- cgit v1.2.3 From 829ae9d611651467fe6cd7be834bd33ca6b28dfe Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sun, 30 Nov 2014 22:22:34 -0500 Subject: net-timestamp: allow reading recv cmsg on errqueue with origin tstamp Allow reading of timestamps and cmsg at the same time on all relevant socket families. One use is to correlate timestamps with egress device, by asking for cmsg IP_PKTINFO. on AF_INET sockets, call the relevant function (ip_cmsg_recv). To avoid changing legacy expectations, only do so if the caller sets a new timestamping flag SOF_TIMESTAMPING_OPT_CMSG. on AF_INET6 sockets, IPV6_PKTINFO and all other recv cmsg are already returned for all origins. only change is to set ifindex, which is not initialized for all error origins. In both cases, only generate the pktinfo message if an ifindex is known. This is not the case for ACK timestamps. The difference between the protocol families is probably a historical accident as a result of the different conditions for generating cmsg in the relevant ip(v6)_recv_error function: ipv4: if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { ipv6: if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { At one time, this was the same test bar for the ICMP/ICMP6 distinction. This is no longer true. Signed-off-by: Willem de Bruijn ---- Changes v1 -> v2 large rewrite - integrate with existing pktinfo cmsg generation code - on ipv4: only send with new flag, to maintain legacy behavior - on ipv6: send at most a single pktinfo cmsg - on ipv6: initialize fields if not yet initialized The recv cmsg interfaces are also relevant to the discussion of whether looping packet headers is problematic. For v6, cmsgs that identify many headers are already returned. This patch expands that to v4. If it sounds reasonable, I will follow with patches 1. request timestamps without payload with SOF_TIMESTAMPING_OPT_TSONLY (http://patchwork.ozlabs.org/patch/366967/) 2. sysctl to conditionally drop all timestamps that have payload or cmsg from users without CAP_NET_RAW. Signed-off-by: David S. Miller --- Documentation/networking/timestamping.txt | 12 +++++++++++- include/uapi/linux/net_tstamp.h | 3 ++- net/ipv4/ip_sockglue.c | 22 ++++++++++++++++++++-- net/ipv6/datagram.c | 21 +++++++++++++++++++-- 4 files changed, 52 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/networking/timestamping.txt b/Documentation/networking/timestamping.txt index 1d6d02d6ba52..b08e27261ff9 100644 --- a/Documentation/networking/timestamping.txt +++ b/Documentation/networking/timestamping.txt @@ -122,7 +122,7 @@ SOF_TIMESTAMPING_RAW_HARDWARE: 1.3.3 Timestamp Options -The interface supports one option +The interface supports the options SOF_TIMESTAMPING_OPT_ID: @@ -145,6 +145,16 @@ SOF_TIMESTAMPING_OPT_ID: stream sockets, it increments with every byte. +SOF_TIMESTAMPING_OPT_CMSG: + + Support recv() cmsg for all timestamped packets. Control messages + are already supported unconditionally on all packets with receive + timestamps and on IPv6 packets with transmit timestamp. This option + extends them to IPv4 packets with transmit timestamp. One use case + is to correlate packets with their egress device, by enabling socket + option IP_PKTINFO simultaneously. + + 1.4 Bytestream Timestamps The SO_TIMESTAMPING interface supports timestamping of bytes in a diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index ff354021bb69..edbc888ceb51 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -23,8 +23,9 @@ enum { SOF_TIMESTAMPING_OPT_ID = (1<<7), SOF_TIMESTAMPING_TX_SCHED = (1<<8), SOF_TIMESTAMPING_TX_ACK = (1<<9), + SOF_TIMESTAMPING_OPT_CMSG = (1<<10), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_CMSG, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 59eba6c7a512..640f26c6a9fe 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -399,6 +399,22 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } +static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, + const struct sk_buff *skb, + int ee_origin) +{ + struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + + if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || + (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + (!skb->dev)) + return false; + + info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; + info->ipi_ifindex = skb->dev->ifindex; + return true; +} + /* * Handle MSG_ERRQUEUE */ @@ -446,7 +462,9 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; sin->sin_family = AF_UNSPEC; - if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin)) { struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; @@ -1051,7 +1069,7 @@ e_inval: } /** - * ipv4_pktinfo_prepare - transfert some info from rtable to skb + * ipv4_pktinfo_prepare - transfer some info from rtable to skb * @sk: socket * @skb: buffer * diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index cc1139687fd7..2464a00e36ab 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,6 +325,16 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } +static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +{ + int ifindex = skb->dev ? skb->dev->ifindex : -1; + + if (skb->protocol == htons(ETH_P_IPV6)) + IP6CB(skb)->iif = ifindex; + else + PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; +} + /* * Handle MSG_ERRQUEUE */ @@ -388,8 +398,12 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; - if (np->rxopt.all) + if (np->rxopt.all) { + if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && + serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) + ip6_datagram_prepare_pktinfo_errqueue(skb); ip6_datagram_recv_common_ctl(sk, msg, skb); + } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) @@ -491,7 +505,10 @@ void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } - put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + + if (src_info.ipi6_ifindex >= 0) + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, + sizeof(src_info), &src_info); } } -- cgit v1.2.3 From 892311f66f2411b813ca631009356891a0c2b0a1 Mon Sep 17 00:00:00 2001 From: Eyal Perry Date: Tue, 2 Dec 2014 18:12:10 +0200 Subject: ethtool: Support for configurable RSS hash function This patch extends the set/get_rxfh ethtool-options for getting or setting the RSS hash function. It modifies drivers implementation of set/get_rxfh accordingly. This change also delegates the responsibility of checking whether a modification to a certain RX flow hash parameter is supported to the driver implementation of set_rxfh. User-kernel API is done through the new hfunc bitmask field in the ethtool_rxfh struct. A bit set in the hfunc field is corresponding to an index in the new string-set ETH_SS_RSS_HASH_FUNCS. Got approval from most of the relevant driver maintainers that their driver is using Toeplitz, and for the few that didn't answered, also assumed it is Toeplitz. Cc: Tom Lendacky Cc: Ariel Elior Cc: Prashant Sreedharan Cc: Michael Chan Cc: Hariprasad S Cc: Sathya Perla Cc: Subbu Seetharaman Cc: Ajit Khaparde Cc: Jeff Kirsher Cc: Jesse Brandeburg Cc: Bruce Allan Cc: Carolyn Wyborny Cc: Don Skidmore Cc: Greg Rose Cc: Matthew Vick Cc: John Ronciak Cc: Mitch Williams Cc: Amir Vadai Cc: Solarflare linux maintainers Cc: Shradha Shah Cc: Shreyas Bhatewara Cc: "VMware, Inc." Cc: Ben Hutchings Signed-off-by: Eyal Perry Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 11 +++- .../net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 20 ++++++- drivers/net/ethernet/broadcom/tg3.c | 20 ++++++- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 18 +++++- drivers/net/ethernet/emulex/benet/be_ethtool.c | 12 +++- drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c | 12 +++- drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 17 +++++- drivers/net/ethernet/intel/igb/igb_ethtool.c | 16 ++++- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 13 +++- drivers/net/ethernet/sfc/ethtool.c | 18 ++++-- drivers/net/vmxnet3/vmxnet3_ethtool.c | 15 ++++- include/linux/ethtool.h | 42 +++++++++---- include/uapi/linux/ethtool.h | 10 +++- net/core/ethtool.c | 69 ++++++++++++---------- 14 files changed, 223 insertions(+), 70 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c index 95d44538357f..ebf489351555 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c @@ -511,7 +511,8 @@ static u32 xgbe_get_rxfh_indir_size(struct net_device *netdev) return ARRAY_SIZE(pdata->rss_table); } -static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) { struct xgbe_prv_data *pdata = netdev_priv(netdev); unsigned int i; @@ -525,16 +526,22 @@ static int xgbe_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) if (key) memcpy(key, pdata->rss_key, sizeof(pdata->rss_key)); + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + return 0; } static int xgbe_set_rxfh(struct net_device *netdev, const u32 *indir, - const u8 *key) + const u8 *key, const u8 hfunc) { struct xgbe_prv_data *pdata = netdev_priv(netdev); struct xgbe_hw_if *hw_if = &pdata->hw_if; unsigned int ret; + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + if (indir) { ret = hw_if->set_rss_lookup_table(pdata, indir); if (ret) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c index 1edc931b1458..ffe4e003e636 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c @@ -3358,12 +3358,18 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev) return T_ETH_INDIRECTION_TABLE_SIZE; } -static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key) +static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, + u8 *hfunc) { struct bnx2x *bp = netdev_priv(dev); u8 ind_table[T_ETH_INDIRECTION_TABLE_SIZE] = {0}; size_t i; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!indir) + return 0; + /* Get the current configuration of the RSS indirection table */ bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table); @@ -3383,11 +3389,21 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key) } static int bnx2x_set_rxfh(struct net_device *dev, const u32 *indir, - const u8 *key) + const u8 *key, const u8 hfunc) { struct bnx2x *bp = netdev_priv(dev); size_t i; + /* We require at least one supported parameter to be changed and no + * change in any of the unsupported parameters + */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + + if (!indir) + return 0; + for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++) { /* * The same as in bnx2x_get_rxfh: we can't use a memcpy() diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 43fd1b72c1ea..bb48a610b72a 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -12561,22 +12561,38 @@ static u32 tg3_get_rxfh_indir_size(struct net_device *dev) return size; } -static int tg3_get_rxfh(struct net_device *dev, u32 *indir, u8 *key) +static int tg3_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, u8 *hfunc) { struct tg3 *tp = netdev_priv(dev); int i; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!indir) + return 0; + for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++) indir[i] = tp->rss_ind_tbl[i]; return 0; } -static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key) +static int tg3_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, + const u8 hfunc) { struct tg3 *tp = netdev_priv(dev); size_t i; + /* We require at least one supported parameter to be changed and no + * change in any of the unsupported parameters + */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + + if (!indir) + return 0; + for (i = 0; i < TG3_RSS_INDIR_TBL_SIZE; i++) tp->rss_ind_tbl[i] = indir[i]; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 3aea82bb9039..e7342bc85026 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -2923,21 +2923,35 @@ static u32 get_rss_table_size(struct net_device *dev) return pi->rss_size; } -static int get_rss_table(struct net_device *dev, u32 *p, u8 *key) +static int get_rss_table(struct net_device *dev, u32 *p, u8 *key, u8 *hfunc) { const struct port_info *pi = netdev_priv(dev); unsigned int n = pi->rss_size; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!p) + return 0; while (n--) p[n] = pi->rss[n]; return 0; } -static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key) +static int set_rss_table(struct net_device *dev, const u32 *p, const u8 *key, + const u8 hfunc) { unsigned int i; struct port_info *pi = netdev_priv(dev); + /* We require at least one supported parameter to be changed and no + * change in any of the unsupported parameters + */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + if (!p) + return 0; + for (i = 0; i < pi->rss_size; i++) pi->rss[i] = p[i]; if (pi->adapter->flags & FULL_INIT_DONE) diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index e42a791c1835..73a500ccbf69 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -1171,7 +1171,8 @@ static u32 be_get_rxfh_key_size(struct net_device *netdev) return RSS_HASH_KEY_LEN; } -static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey) +static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey, + u8 *hfunc) { struct be_adapter *adapter = netdev_priv(netdev); int i; @@ -1185,16 +1186,23 @@ static int be_get_rxfh(struct net_device *netdev, u32 *indir, u8 *hkey) if (hkey) memcpy(hkey, rss->rss_hkey, RSS_HASH_KEY_LEN); + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + return 0; } static int be_set_rxfh(struct net_device *netdev, const u32 *indir, - const u8 *hkey) + const u8 *hkey, const u8 hfunc) { int rc = 0, i, j; struct be_adapter *adapter = netdev_priv(netdev); u8 rsstable[RSS_INDIR_TABLE_LEN]; + /* We do not allow change in unsupported parameters */ + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + if (indir) { struct be_rx_obj *rxo; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c index 2d04464e6aa3..651f53bc7376 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_ethtool.c @@ -916,11 +916,15 @@ static u32 fm10k_get_rssrk_size(struct net_device *netdev) return FM10K_RSSRK_SIZE * FM10K_RSSRK_ENTRIES_PER_REG; } -static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key) +static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) { struct fm10k_intfc *interface = netdev_priv(netdev); int i, err; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + err = fm10k_get_reta(netdev, indir); if (err || !key) return err; @@ -932,12 +936,16 @@ static int fm10k_get_rssh(struct net_device *netdev, u32 *indir, u8 *key) } static int fm10k_set_rssh(struct net_device *netdev, const u32 *indir, - const u8 *key) + const u8 *key, const u8 hfunc) { struct fm10k_intfc *interface = netdev_priv(netdev); struct fm10k_hw *hw = &interface->hw; int i, err; + /* We do not allow change in unsupported parameters */ + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + err = fm10k_set_reta(netdev, indir); if (err || !key) return err; diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index 69a269b23be6..69b97bac182c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c @@ -627,13 +627,19 @@ static u32 i40evf_get_rxfh_indir_size(struct net_device *netdev) * * Reads the indirection table directly from the hardware. Always returns 0. **/ -static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) { struct i40evf_adapter *adapter = netdev_priv(netdev); struct i40e_hw *hw = &adapter->hw; u32 hlut_val; int i, j; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!indir) + return 0; + for (i = 0, j = 0; i <= I40E_VFQF_HLUT_MAX_INDEX; i++) { hlut_val = rd32(hw, I40E_VFQF_HLUT(i)); indir[j++] = hlut_val & 0xff; @@ -654,13 +660,20 @@ static int i40evf_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) * returns 0 after programming the table. **/ static int i40evf_set_rxfh(struct net_device *netdev, const u32 *indir, - const u8 *key) + const u8 *key, const u8 hfunc) { struct i40evf_adapter *adapter = netdev_priv(netdev); struct i40e_hw *hw = &adapter->hw; u32 hlut_val; int i, j; + /* We do not allow change in unsupported parameters */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + if (!indir) + return 0; + for (i = 0, j = 0; i <= I40E_VFQF_HLUT_MAX_INDEX; i++) { hlut_val = indir[j++]; hlut_val |= indir[j++] << 8; diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 02cfd3b14762..d5673eb90c54 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -2842,11 +2842,16 @@ static u32 igb_get_rxfh_indir_size(struct net_device *netdev) return IGB_RETA_SIZE; } -static int igb_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key) +static int igb_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, + u8 *hfunc) { struct igb_adapter *adapter = netdev_priv(netdev); int i; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!indir) + return 0; for (i = 0; i < IGB_RETA_SIZE; i++) indir[i] = adapter->rss_indir_tbl[i]; @@ -2889,13 +2894,20 @@ void igb_write_rss_indir_tbl(struct igb_adapter *adapter) } static int igb_set_rxfh(struct net_device *netdev, const u32 *indir, - const u8 *key) + const u8 *key, const u8 hfunc) { struct igb_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; int i; u32 num_queues; + /* We do not allow change in unsupported parameters */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + if (!indir) + return 0; + num_queues = adapter->rss_queues; switch (hw->mac.type) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index c45e06abc073..28c3fc5a0791 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -978,7 +978,8 @@ static u32 mlx4_en_get_rxfh_key_size(struct net_device *netdev) return MLX4_EN_RSS_KEY_SIZE; } -static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key) +static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key, + u8 *hfunc) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_rss_map *rss_map = &priv->rss_map; @@ -990,16 +991,20 @@ static int mlx4_en_get_rxfh(struct net_device *dev, u32 *ring_index, u8 *key) rss_rings = 1 << ilog2(rss_rings); while (n--) { + if (!ring_index) + break; ring_index[n] = rss_map->qps[n % rss_rings].qpn - rss_map->base_qpn; } if (key) memcpy(key, priv->rss_key, MLX4_EN_RSS_KEY_SIZE); + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; return err; } static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, - const u8 *key) + const u8 *key, const u8 hfunc) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; @@ -1008,6 +1013,10 @@ static int mlx4_en_set_rxfh(struct net_device *dev, const u32 *ring_index, int i; int rss_rings = 0; + /* We do not allow change in unsupported parameters */ + if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) + return -EOPNOTSUPP; + /* Calculate RSS table size and make sure flows are spread evenly * between rings */ diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index cad258a78708..4835bc0d0de8 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -1086,19 +1086,29 @@ static u32 efx_ethtool_get_rxfh_indir_size(struct net_device *net_dev) 0 : ARRAY_SIZE(efx->rx_indir_table)); } -static int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key) +static int efx_ethtool_get_rxfh(struct net_device *net_dev, u32 *indir, u8 *key, + u8 *hfunc) { struct efx_nic *efx = netdev_priv(net_dev); - memcpy(indir, efx->rx_indir_table, sizeof(efx->rx_indir_table)); + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (indir) + memcpy(indir, efx->rx_indir_table, sizeof(efx->rx_indir_table)); return 0; } -static int efx_ethtool_set_rxfh(struct net_device *net_dev, - const u32 *indir, const u8 *key) +static int efx_ethtool_set_rxfh(struct net_device *net_dev, const u32 *indir, + const u8 *key, const u8 hfunc) { struct efx_nic *efx = netdev_priv(net_dev); + /* We do not allow change in unsupported parameters */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + if (!indir) + return 0; memcpy(efx->rx_indir_table, indir, sizeof(efx->rx_indir_table)); efx->type->rx_push_rss_config(efx); return 0; diff --git a/drivers/net/vmxnet3/vmxnet3_ethtool.c b/drivers/net/vmxnet3/vmxnet3_ethtool.c index b725fd9e7803..b7b53329d575 100644 --- a/drivers/net/vmxnet3/vmxnet3_ethtool.c +++ b/drivers/net/vmxnet3/vmxnet3_ethtool.c @@ -583,12 +583,16 @@ vmxnet3_get_rss_indir_size(struct net_device *netdev) } static int -vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key) +vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key, u8 *hfunc) { struct vmxnet3_adapter *adapter = netdev_priv(netdev); struct UPT1_RSSConf *rssConf = adapter->rss_conf; unsigned int n = rssConf->indTableSize; + if (hfunc) + *hfunc = ETH_RSS_HASH_TOP; + if (!p) + return 0; while (n--) p[n] = rssConf->indTable[n]; return 0; @@ -596,13 +600,20 @@ vmxnet3_get_rss(struct net_device *netdev, u32 *p, u8 *key) } static int -vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key) +vmxnet3_set_rss(struct net_device *netdev, const u32 *p, const u8 *key, + const u8 hfunc) { unsigned int i; unsigned long flags; struct vmxnet3_adapter *adapter = netdev_priv(netdev); struct UPT1_RSSConf *rssConf = adapter->rss_conf; + /* We do not allow change in unsupported parameters */ + if (key || + (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)) + return -EOPNOTSUPP; + if (!p) + return 0; for (i = 0; i < rssConf->indTableSize; i++) rssConf->indTable[i] = p[i]; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c1a2d60dfb82..653dc9c4ebac 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -59,6 +59,26 @@ enum ethtool_phys_id_state { ETHTOOL_ID_OFF }; +enum { + ETH_RSS_HASH_TOP_BIT, /* Configurable RSS hash function - Toeplitz */ + ETH_RSS_HASH_XOR_BIT, /* Configurable RSS hash function - Xor */ + + /* + * Add your fresh new hash function bits above and remember to update + * rss_hash_func_strings[] in ethtool.c + */ + ETH_RSS_HASH_FUNCS_COUNT +}; + +#define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) +#define __ETH_RSS_HASH(name) __ETH_RSS_HASH_BIT(ETH_RSS_HASH_##name##_BIT) + +#define ETH_RSS_HASH_TOP __ETH_RSS_HASH(TOP) +#define ETH_RSS_HASH_XOR __ETH_RSS_HASH(XOR) + +#define ETH_RSS_HASH_UNKNOWN 0 +#define ETH_RSS_HASH_NO_CHANGE 0 + struct net_device; /* Some generic methods drivers may use in their ethtool_ops */ @@ -158,17 +178,14 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. * Returns zero if not supported for this specific device. - * @get_rxfh: Get the contents of the RX flow hash indirection table and hash - * key. - * Will only be called if one or both of @get_rxfh_indir_size and - * @get_rxfh_key_size are implemented and return non-zero. - * Returns a negative error code or zero. - * @set_rxfh: Set the contents of the RX flow hash indirection table and/or - * hash key. In case only the indirection table or hash key is to be - * changed, the other argument will be %NULL. - * Will only be called if one or both of @get_rxfh_indir_size and - * @get_rxfh_key_size are implemented and return non-zero. + * @get_rxfh: Get the contents of the RX flow hash indirection table, hash key + * and/or hash function. * Returns a negative error code or zero. + * @set_rxfh: Set the contents of the RX flow hash indirection table, hash + * key, and/or hash function. Arguments which are set to %NULL or zero + * will remain unchanged. + * Returns a negative error code or zero. An error code must be returned + * if at least one unsupported change was requested. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. @@ -241,9 +258,10 @@ struct ethtool_ops { int (*reset)(struct net_device *, u32 *); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); - int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key); + int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key, + u8 *hfunc); int (*set_rxfh)(struct net_device *, const u32 *indir, - const u8 *key); + const u8 *key, const u8 hfunc); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index eb2095b42fbb..5f66d9c2889d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -534,6 +534,7 @@ struct ethtool_pauseparam { * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE; * now deprecated * @ETH_SS_FEATURES: Device feature names + * @ETH_SS_RSS_HASH_FUNCS: RSS hush function names */ enum ethtool_stringset { ETH_SS_TEST = 0, @@ -541,6 +542,7 @@ enum ethtool_stringset { ETH_SS_PRIV_FLAGS, ETH_SS_NTUPLE_FILTERS, ETH_SS_FEATURES, + ETH_SS_RSS_HASH_FUNCS, }; /** @@ -884,6 +886,8 @@ struct ethtool_rxfh_indir { * @key_size: On entry, the array size of the user buffer for the hash key, * which may be zero. On return from %ETHTOOL_GRSSH, the size of the * hardware hash key. + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. * @rsvd: Reserved for future extensions. * @rss_config: RX ring/queue index for each hash value i.e., indirection table * of @indir_size __u32 elements, followed by hash key of @key_size @@ -893,14 +897,16 @@ struct ethtool_rxfh_indir { * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested * and a @indir_size of zero means the indir table should be reset to default - * values. + * values. An hfunc of zero means that hash function setting is not requested. */ struct ethtool_rxfh { __u32 cmd; __u32 rss_context; __u32 indir_size; __u32 key_size; - __u32 rsvd[2]; + __u8 hfunc; + __u8 rsvd8[3]; + __u32 rsvd32; __u32 rss_config[0]; }; #define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 715f51f321e9..550892cd6b3f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -100,6 +100,12 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; +static const char +rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { + [ETH_RSS_HASH_TOP_BIT] = "toeplitz", + [ETH_RSS_HASH_XOR_BIT] = "xor", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -185,6 +191,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); + if (sset == ETH_SS_RSS_HASH_FUNCS) + return ARRAY_SIZE(rss_hash_func_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -199,6 +208,9 @@ static void __ethtool_get_strings(struct net_device *dev, if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); + else if (stringset == ETH_SS_RSS_HASH_FUNCS) + memcpy(data, rss_hash_func_strings, + sizeof(rss_hash_func_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); @@ -618,7 +630,7 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, if (!indir) return -ENOMEM; - ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL); + ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; @@ -679,7 +691,7 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, goto out; } - ret = ops->set_rxfh(dev, indir, NULL); + ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); out: kfree(indir); @@ -697,12 +709,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, u32 total_size; u32 indir_bytes; u32 *indir = NULL; + u8 dev_hfunc = 0; u8 *hkey = NULL; u8 *rss_config; - if (!(dev->ethtool_ops->get_rxfh_indir_size || - dev->ethtool_ops->get_rxfh_key_size) || - !dev->ethtool_ops->get_rxfh) + if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -710,16 +721,14 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; - if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; rxfh.indir_size = dev_indir_size; @@ -727,13 +736,6 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; - /* If the user buffer size is 0, this is just a query for the - * device table size and key size. Otherwise, if the User size is - * not equal to device table size or key size it's an error. - */ - if (!user_indir_size && !user_key_size) - return 0; - if ((user_indir_size && (user_indir_size != dev_indir_size)) || (user_key_size && (user_key_size != dev_key_size))) return -EINVAL; @@ -750,14 +752,19 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, if (user_key_size) hkey = rss_config + indir_bytes; - ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey); - if (!ret) { - if (copy_to_user(useraddr + - offsetof(struct ethtool_rxfh, rss_config[0]), - rss_config, total_size)) - ret = -EFAULT; - } + ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); + if (ret) + goto out; + if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), + &dev_hfunc, sizeof(rxfh.hfunc))) { + ret = -EFAULT; + } else if (copy_to_user(useraddr + + offsetof(struct ethtool_rxfh, rss_config[0]), + rss_config, total_size)) { + ret = -EFAULT; + } +out: kfree(rss_config); return ret; @@ -776,33 +783,31 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); - if (!(ops->get_rxfh_indir_size || ops->get_rxfh_key_size) || - !ops->get_rxnfc || !ops->set_rxfh) + if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = dev->ethtool_ops->get_rxfh_key_size(dev); - if ((dev_key_size + dev_indir_size) == 0) - return -EOPNOTSUPP; if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ - if (rxfh.rss_context || rxfh.rsvd[0] || rxfh.rsvd[1]) + if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] || + rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; - /* If either indir or hash key is valid, proceed further. - * It is not valid to request that both be unchanged. + /* If either indir, hash key or function is valid, proceed further. + * Must request at least one change: indir size, hash key or function. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && - rxfh.key_size == 0)) + rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) @@ -845,7 +850,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, } } - ret = ops->set_rxfh(dev, indir, hkey); + ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); out: kfree(rss_config); -- cgit v1.2.3 From af6c9f1657ca6d2ef2b2c0e31ad17c6fbf773baf Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 19 Nov 2014 18:11:00 -0800 Subject: thermal: provide an UAPI header file include/linux/thermal.h contains definitions for the Thermal generic netlink family, but none of the valuable information relevant to user-space such as the Genl family name, multicast group, version or command set and data types is exported to user-space. Export all the relevant generic netlink information to user-space to make this genl family usable by user-space, and while at it, export THERMAL_NAME_LENGTH since it limits name length for thermal_hwmon devices. Kbuild and MAINTAINERS are also updated accordingly to reflect this new file: include/uapi/linux/thermal.h. Signed-off-by: Florian Fainelli Signed-off-by: Zhang Rui --- MAINTAINERS | 1 + include/linux/thermal.h | 31 +------------------------------ include/uapi/linux/Kbuild | 1 + include/uapi/linux/thermal.h | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 30 deletions(-) create mode 100644 include/uapi/linux/thermal.h (limited to 'include/uapi') diff --git a/MAINTAINERS b/MAINTAINERS index 0ff630de8a6d..9c709f503c03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9295,6 +9295,7 @@ Q: https://patchwork.kernel.org/project/linux-pm/list/ S: Supported F: drivers/thermal/ F: include/linux/thermal.h +F: include/uapi/linux/thermal.h F: include/linux/cpu_cooling.h F: Documentation/devicetree/bindings/thermal/ diff --git a/include/linux/thermal.h b/include/linux/thermal.h index ef90838b36a0..70f87406bed9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -29,10 +29,10 @@ #include #include #include +#include #define THERMAL_TRIPS_NONE -1 #define THERMAL_MAX_TRIPS 12 -#define THERMAL_NAME_LENGTH 20 /* invalid cooling state */ #define THERMAL_CSTATE_INVALID -1UL @@ -49,11 +49,6 @@ #define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off)) #define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732) -/* Adding event notification support elements */ -#define THERMAL_GENL_FAMILY_NAME "thermal_event" -#define THERMAL_GENL_VERSION 0x01 -#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" - /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) #define DEFAULT_THERMAL_GOVERNOR "step_wise" @@ -86,30 +81,6 @@ enum thermal_trend { THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */ }; -/* Events supported by Thermal Netlink */ -enum events { - THERMAL_AUX0, - THERMAL_AUX1, - THERMAL_CRITICAL, - THERMAL_DEV_FAULT, -}; - -/* attributes of thermal_genl_family */ -enum { - THERMAL_GENL_ATTR_UNSPEC, - THERMAL_GENL_ATTR_EVENT, - __THERMAL_GENL_ATTR_MAX, -}; -#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) - -/* commands supported by the thermal_genl_family */ -enum { - THERMAL_GENL_CMD_UNSPEC, - THERMAL_GENL_CMD_EVENT, - __THERMAL_GENL_CMD_MAX, -}; -#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) - struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 4c94f31a8c99..a1943e2d1264 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -383,6 +383,7 @@ header-y += tcp.h header-y += tcp_metrics.h header-y += telephony.h header-y += termios.h +header-y += thermal.h header-y += time.h header-y += times.h header-y += timex.h diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h new file mode 100644 index 000000000000..ac5535855982 --- /dev/null +++ b/include/uapi/linux/thermal.h @@ -0,0 +1,35 @@ +#ifndef _UAPI_LINUX_THERMAL_H +#define _UAPI_LINUX_THERMAL_H + +#define THERMAL_NAME_LENGTH 20 + +/* Adding event notification support elements */ +#define THERMAL_GENL_FAMILY_NAME "thermal_event" +#define THERMAL_GENL_VERSION 0x01 +#define THERMAL_GENL_MCAST_GROUP_NAME "thermal_mc_grp" + +/* Events supported by Thermal Netlink */ +enum events { + THERMAL_AUX0, + THERMAL_AUX1, + THERMAL_CRITICAL, + THERMAL_DEV_FAULT, +}; + +/* attributes of thermal_genl_family */ +enum { + THERMAL_GENL_ATTR_UNSPEC, + THERMAL_GENL_ATTR_EVENT, + __THERMAL_GENL_ATTR_MAX, +}; +#define THERMAL_GENL_ATTR_MAX (__THERMAL_GENL_ATTR_MAX - 1) + +/* commands supported by the thermal_genl_family */ +enum { + THERMAL_GENL_CMD_UNSPEC, + THERMAL_GENL_CMD_EVENT, + __THERMAL_GENL_CMD_MAX, +}; +#define THERMAL_GENL_CMD_MAX (__THERMAL_GENL_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_THERMAL_H */ -- cgit v1.2.3 From 4ec22faeb2a165b6ce7009f91309046010b3f282 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Oct 2014 15:49:34 +0300 Subject: virtio: add virtio 1.0 feature bit Based on original patches by Rusty Russell, Thomas Huth and Cornelia Huck. Note: at this time, we do not negotiate this feature bit in core, drivers have to declare VERSION_1 support explicitly. For this reason we treat this bit as a device bit and not as a transport bit for now. After all drivers are converted, we will be able to move VERSION_1 to core and drop it from all drivers. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/uapi/linux/virtio_config.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 3ce768c6910d..80e7381e3f1f 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -54,4 +54,7 @@ /* Can the device handle any descriptor layout? */ #define VIRTIO_F_ANY_LAYOUT 27 +/* v1.0 compliant. */ +#define VIRTIO_F_VERSION_1 32 + #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */ -- cgit v1.2.3 From eef960a04354d13426c43a4e3750a5e2b383040c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Oct 2014 15:35:56 +0300 Subject: virtio: memory access APIs virtio 1.0 makes all memory structures LE, so we need APIs to conditionally do a byteswap on BE architectures. To make it easier to check code statically, add virtio specific types for multi-byte integers in memory. Add low level wrappers that do a byteswap conditionally, these will be useful e.g. for vhost. Add high level wrappers that query device endian-ness and act accordingly. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- include/linux/virtio_byteorder.h | 59 +++++++++++++++++++++++++++++++++++++++ include/linux/virtio_config.h | 32 +++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/virtio_ring.h | 45 ++++++++++++++--------------- include/uapi/linux/virtio_types.h | 46 ++++++++++++++++++++++++++++++ 5 files changed, 161 insertions(+), 22 deletions(-) create mode 100644 include/linux/virtio_byteorder.h create mode 100644 include/uapi/linux/virtio_types.h (limited to 'include/uapi') diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h new file mode 100644 index 000000000000..51865d05b267 --- /dev/null +++ b/include/linux/virtio_byteorder.h @@ -0,0 +1,59 @@ +#ifndef _LINUX_VIRTIO_BYTEORDER_H +#define _LINUX_VIRTIO_BYTEORDER_H +#include +#include + +/* + * Low-level memory accessors for handling virtio in modern little endian and in + * compatibility native endian format. + */ + +static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val) +{ + if (little_endian) + return le16_to_cpu((__force __le16)val); + else + return (__force u16)val; +} + +static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val) +{ + if (little_endian) + return (__force __virtio16)cpu_to_le16(val); + else + return (__force __virtio16)val; +} + +static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val) +{ + if (little_endian) + return le32_to_cpu((__force __le32)val); + else + return (__force u32)val; +} + +static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val) +{ + if (little_endian) + return (__force __virtio32)cpu_to_le32(val); + else + return (__force __virtio32)val; +} + +static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val) +{ + if (little_endian) + return le64_to_cpu((__force __le64)val); + else + return (__force u64)val; +} + +static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val) +{ + if (little_endian) + return (__force __virtio64)cpu_to_le64(val); + else + return (__force __virtio64)val; +} + +#endif /* _LINUX_VIRTIO_BYTEORDER */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index f51788439574..02f0acb549d6 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /** @@ -199,6 +200,37 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu) return 0; } +/* Memory accessors */ +static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val) +{ + return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); +} + +static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val) +{ + return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); +} + +static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val) +{ + return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); +} + +static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val) +{ + return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); +} + +static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val) +{ + return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); +} + +static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val) +{ + return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val); +} + /* Config space accessors. */ #define virtio_cread(vdev, structname, member, ptr) \ do { \ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 8523f9bb72f2..615f96a6022a 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -423,6 +423,7 @@ header-y += virtio_blk.h header-y += virtio_config.h header-y += virtio_console.h header-y += virtio_ids.h +header-y += virtio_types.h header-y += virtio_net.h header-y += virtio_pci.h header-y += virtio_ring.h diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index a99f9b7caa67..61c818a7fe70 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -32,6 +32,7 @@ * * Copyright Rusty Russell IBM Corporation 2007. */ #include +#include /* This marks a buffer as continuing via the next field. */ #define VRING_DESC_F_NEXT 1 @@ -61,32 +62,32 @@ /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { /* Address (guest-physical). */ - __u64 addr; + __virtio64 addr; /* Length. */ - __u32 len; + __virtio32 len; /* The flags as indicated above. */ - __u16 flags; + __virtio16 flags; /* We chain unused descriptors via this, too */ - __u16 next; + __virtio16 next; }; struct vring_avail { - __u16 flags; - __u16 idx; - __u16 ring[]; + __virtio16 flags; + __virtio16 idx; + __virtio16 ring[]; }; /* u32 is used here for ids for padding reasons. */ struct vring_used_elem { /* Index of start of used descriptor chain. */ - __u32 id; + __virtio32 id; /* Total length of the descriptor chain which was used (written to) */ - __u32 len; + __virtio32 len; }; struct vring_used { - __u16 flags; - __u16 idx; + __virtio16 flags; + __virtio16 idx; struct vring_used_elem ring[]; }; @@ -109,25 +110,25 @@ struct vring { * struct vring_desc desc[num]; * * // A ring of available descriptor heads with free-running index. - * __u16 avail_flags; - * __u16 avail_idx; - * __u16 available[num]; - * __u16 used_event_idx; + * __virtio16 avail_flags; + * __virtio16 avail_idx; + * __virtio16 available[num]; + * __virtio16 used_event_idx; * * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. - * __u16 used_flags; - * __u16 used_idx; + * __virtio16 used_flags; + * __virtio16 used_idx; * struct vring_used_elem used[num]; - * __u16 avail_event_idx; + * __virtio16 avail_event_idx; * }; */ /* We publish the used event index at the end of the available ring, and vice * versa. They are at the end for backwards compatibility. */ #define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) -#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num]) +#define vring_avail_event(vr) (*(__virtio16 *)&(vr)->used->ring[(vr)->num]) static inline void vring_init(struct vring *vr, unsigned int num, void *p, unsigned long align) @@ -135,15 +136,15 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, vr->num = num; vr->desc = p; vr->avail = p + num*sizeof(struct vring_desc); - vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__u16) + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } static inline unsigned vring_size(unsigned int num, unsigned long align) { - return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (3 + num) + return ((sizeof(struct vring_desc) * num + sizeof(__virtio16) * (3 + num) + align - 1) & ~(align - 1)) - + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num; + + sizeof(__virtio16) * 3 + sizeof(struct vring_used_elem) * num; } /* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ diff --git a/include/uapi/linux/virtio_types.h b/include/uapi/linux/virtio_types.h new file mode 100644 index 000000000000..e845e8c4cbee --- /dev/null +++ b/include/uapi/linux/virtio_types.h @@ -0,0 +1,46 @@ +#ifndef _UAPI_LINUX_VIRTIO_TYPES_H +#define _UAPI_LINUX_VIRTIO_TYPES_H +/* Type definitions for virtio implementations. + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (C) 2014 Red Hat, Inc. + * Author: Michael S. Tsirkin + */ +#include + +/* + * __virtio{16,32,64} have the following meaning: + * - __u{16,32,64} for virtio devices in legacy mode, accessed in native endian + * - __le{16,32,64} for standard-compliant virtio devices + */ + +typedef __u16 __bitwise__ __virtio16; +typedef __u32 __bitwise__ __virtio32; +typedef __u64 __bitwise__ __virtio64; + +#endif /* _UAPI_LINUX_VIRTIO_TYPES_H */ -- cgit v1.2.3 From cb3f6d9da49929797f806b4e91ec005e6fc2e9c2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 22 Oct 2014 17:41:38 +0300 Subject: virtio: set FEATURES_OK set FEATURES_OK as per virtio 1.0 spec Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- drivers/virtio/virtio.c | 29 ++++++++++++++++++++++------- include/uapi/linux/virtio_config.h | 2 ++ 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 746d350c8062..9248125d29c9 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -160,6 +160,7 @@ static int virtio_dev_probe(struct device *_d) struct virtio_device *dev = dev_to_virtio(_d); struct virtio_driver *drv = drv_to_virtio(dev->dev.driver); u64 device_features; + unsigned status; /* We have a driver! */ add_status(dev, VIRTIO_CONFIG_S_DRIVER); @@ -183,18 +184,32 @@ static int virtio_dev_probe(struct device *_d) dev->config->finalize_features(dev); + if (virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { + add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + status = dev->config->get_status(dev); + if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { + dev_err(_d, "virtio: device refuses features: %x\n", + status); + err = -ENODEV; + goto err; + } + } + err = drv->probe(dev); if (err) - add_status(dev, VIRTIO_CONFIG_S_FAILED); - else { - add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); - if (drv->scan) - drv->scan(dev); + goto err; - virtio_config_enable(dev); - } + add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); + if (drv->scan) + drv->scan(dev); + + virtio_config_enable(dev); + return 0; +err: + add_status(dev, VIRTIO_CONFIG_S_FAILED); return err; + } static int virtio_dev_remove(struct device *_d) diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 80e7381e3f1f..4d05671fbaec 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -38,6 +38,8 @@ #define VIRTIO_CONFIG_S_DRIVER 2 /* Driver has used its parts of the config, and is happy */ #define VIRTIO_CONFIG_S_DRIVER_OK 4 +/* Driver has finished configuring features */ +#define VIRTIO_CONFIG_S_FEATURES_OK 8 /* We've given up on this device. */ #define VIRTIO_CONFIG_S_FAILED 0x80 -- cgit v1.2.3 From fdd819b21576c361bf0dcdd9522df4ccabf7aaa8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Oct 2014 16:39:48 +0200 Subject: virtio_net: v1.0 endianness Based on patches by Rusty Russell, Cornelia Huck. Note: more code changes are needed for 1.0 support (due to different header size). So we don't advertize support for 1.0 yet. Signed-off-by: Rusty Russell Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 33 ++++++++++++++++++++------------- include/uapi/linux/virtio_net.h | 15 ++++++++------- 2 files changed, 28 insertions(+), 20 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b0bc8ead47de..c07e0302438e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -347,13 +347,14 @@ err: } static struct sk_buff *receive_mergeable(struct net_device *dev, + struct virtnet_info *vi, struct receive_queue *rq, unsigned long ctx, unsigned int len) { void *buf = mergeable_ctx_to_buf_address(ctx); struct skb_vnet_hdr *hdr = buf; - int num_buf = hdr->mhdr.num_buffers; + u16 num_buf = virtio16_to_cpu(rq->vq->vdev, hdr->mhdr.num_buffers); struct page *page = virt_to_head_page(buf); int offset = buf - page_address(page); unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx)); @@ -369,7 +370,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, ctx = (unsigned long)virtqueue_get_buf(rq->vq, &len); if (unlikely(!ctx)) { pr_debug("%s: rx error: %d buffers out of %d missing\n", - dev->name, num_buf, hdr->mhdr.num_buffers); + dev->name, num_buf, + virtio16_to_cpu(rq->vq->vdev, + hdr->mhdr.num_buffers)); dev->stats.rx_length_errors++; goto err_buf; } @@ -454,7 +457,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) } if (vi->mergeable_rx_bufs) - skb = receive_mergeable(dev, rq, (unsigned long)buf, len); + skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len); else if (vi->big_packets) skb = receive_big(dev, rq, buf, len); else @@ -473,8 +476,8 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { pr_debug("Needs csum!\n"); if (!skb_partial_csum_set(skb, - hdr->hdr.csum_start, - hdr->hdr.csum_offset)) + virtio16_to_cpu(vi->vdev, hdr->hdr.csum_start), + virtio16_to_cpu(vi->vdev, hdr->hdr.csum_offset))) goto frame_err; } else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) { skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -514,7 +517,8 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len) if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - skb_shinfo(skb)->gso_size = hdr->hdr.gso_size; + skb_shinfo(skb)->gso_size = virtio16_to_cpu(vi->vdev, + hdr->hdr.gso_size); if (skb_shinfo(skb)->gso_size == 0) { net_warn_ratelimited("%s: zero gso size.\n", dev->name); goto frame_err; @@ -876,16 +880,19 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) if (skb->ip_summed == CHECKSUM_PARTIAL) { hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - hdr->hdr.csum_start = skb_checksum_start_offset(skb); - hdr->hdr.csum_offset = skb->csum_offset; + hdr->hdr.csum_start = cpu_to_virtio16(vi->vdev, + skb_checksum_start_offset(skb)); + hdr->hdr.csum_offset = cpu_to_virtio16(vi->vdev, + skb->csum_offset); } else { hdr->hdr.flags = 0; hdr->hdr.csum_offset = hdr->hdr.csum_start = 0; } if (skb_is_gso(skb)) { - hdr->hdr.hdr_len = skb_headlen(skb); - hdr->hdr.gso_size = skb_shinfo(skb)->gso_size; + hdr->hdr.hdr_len = cpu_to_virtio16(vi->vdev, skb_headlen(skb)); + hdr->hdr.gso_size = cpu_to_virtio16(vi->vdev, + skb_shinfo(skb)->gso_size); if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) @@ -1112,7 +1119,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; - s.virtqueue_pairs = queue_pairs; + s.virtqueue_pairs = cpu_to_virtio16(vi->vdev, queue_pairs); sg_init_one(&sg, &s, sizeof(s)); if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ, @@ -1189,7 +1196,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) sg_init_table(sg, 2); /* Store the unicast list and count in the front of the buffer */ - mac_data->entries = uc_count; + mac_data->entries = cpu_to_virtio32(vi->vdev, uc_count); i = 0; netdev_for_each_uc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); @@ -1200,7 +1207,7 @@ static void virtnet_set_rx_mode(struct net_device *dev) /* multicast list and count fill the end */ mac_data = (void *)&mac_data->macs[uc_count][0]; - mac_data->entries = mc_count; + mac_data->entries = cpu_to_virtio32(vi->vdev, mc_count); i = 0; netdev_for_each_mc_addr(ha, dev) memcpy(&mac_data->macs[i++][0], ha->addr, ETH_ALEN); diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 172a7f00780c..b5f1677b291c 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -28,6 +28,7 @@ #include #include #include +#include #include /* The feature bitmap for virtio net */ @@ -84,17 +85,17 @@ struct virtio_net_hdr { #define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP #define VIRTIO_NET_HDR_GSO_ECN 0x80 // TCP has ECN set __u8 gso_type; - __u16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ - __u16 gso_size; /* Bytes to append to hdr_len per frame */ - __u16 csum_start; /* Position to start checksumming from */ - __u16 csum_offset; /* Offset after that to place checksum */ + __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ + __virtio16 gso_size; /* Bytes to append to hdr_len per frame */ + __virtio16 csum_start; /* Position to start checksumming from */ + __virtio16 csum_offset; /* Offset after that to place checksum */ }; /* This is the version of the header to use when the MRG_RXBUF * feature has been negotiated. */ struct virtio_net_hdr_mrg_rxbuf { struct virtio_net_hdr hdr; - __u16 num_buffers; /* Number of merged rx buffers */ + __virtio16 num_buffers; /* Number of merged rx buffers */ }; /* @@ -149,7 +150,7 @@ typedef __u8 virtio_net_ctrl_ack; * VIRTIO_NET_F_CTRL_MAC_ADDR feature is available. */ struct virtio_net_ctrl_mac { - __u32 entries; + __virtio32 entries; __u8 macs[][ETH_ALEN]; } __attribute__((packed)); @@ -193,7 +194,7 @@ struct virtio_net_ctrl_mac { * specified. */ struct virtio_net_ctrl_mq { - __u16 virtqueue_pairs; + __virtio16 virtqueue_pairs; }; #define VIRTIO_NET_CTRL_MQ 4 -- cgit v1.2.3 From 19c1c5a64c3b8eeb65b63abba248b880dd584589 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Oct 2014 16:39:49 +0200 Subject: virtio_blk: v1.0 support Based on patch by Cornelia Huck. Note: for consistency, and to avoid sparse errors, convert all fields, even those no longer in use for virtio v1.0. Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- drivers/block/virtio_blk.c | 70 ++++++++++++++++++++++++----------------- include/uapi/linux/virtio_blk.h | 15 ++++----- 2 files changed, 49 insertions(+), 36 deletions(-) (limited to 'include/uapi') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c6a27d54ad62..f601f16eabf4 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -80,7 +80,7 @@ static int __virtblk_add_req(struct virtqueue *vq, { struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; unsigned int num_out = 0, num_in = 0; - int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; + __virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT); sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &hdr; @@ -91,19 +91,19 @@ static int __virtblk_add_req(struct virtqueue *vq, * block, and before the normal inhdr we put the sense data and the * inhdr with additional status information. */ - if (type == VIRTIO_BLK_T_SCSI_CMD) { + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); sgs[num_out++] = &cmd; } if (have_data) { - if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) sgs[num_out++] = data_sg; else sgs[num_out + num_in++] = data_sg; } - if (type == VIRTIO_BLK_T_SCSI_CMD) { + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); sgs[num_out + num_in++] = &sense; sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); @@ -119,12 +119,13 @@ static int __virtblk_add_req(struct virtqueue *vq, static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + struct virtio_blk *vblk = req->q->queuedata; int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { - req->resid_len = vbr->in_hdr.residual; - req->sense_len = vbr->in_hdr.sense_len; - req->errors = vbr->in_hdr.errors; + req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); + req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); + req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); } else if (req->cmd_type == REQ_TYPE_SPECIAL) { req->errors = (error != 0); } @@ -173,25 +174,25 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, vbr->req = req; if (req->cmd_flags & REQ_FLUSH) { - vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); } else { switch (req->cmd_type) { case REQ_TYPE_FS: vbr->out_hdr.type = 0; - vbr->out_hdr.sector = blk_rq_pos(vbr->req); - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req)); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; case REQ_TYPE_BLOCK_PC: - vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; case REQ_TYPE_SPECIAL: - vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; default: /* We don't put anything else in the queue. */ @@ -204,9 +205,9 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); else - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); @@ -476,7 +477,8 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) struct virtio_blk_config, wce, &writeback); if (err) - writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) || + virtio_has_feature(vdev, VIRTIO_F_VERSION_1); return writeback; } @@ -821,25 +823,35 @@ static const struct virtio_device_id id_table[] = { { 0 }, }; -static unsigned int features[] = { +static unsigned int features_legacy[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, +} +; +static unsigned int features[] = { + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, + VIRTIO_BLK_F_TOPOLOGY, + VIRTIO_BLK_F_MQ, + VIRTIO_F_VERSION_1, }; static struct virtio_driver virtio_blk = { - .feature_table = features, - .feature_table_size = ARRAY_SIZE(features), - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = virtblk_probe, - .remove = virtblk_remove, - .config_changed = virtblk_config_changed, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .feature_table_legacy = features_legacy, + .feature_table_size_legacy = ARRAY_SIZE(features_legacy), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtblk_probe, + .remove = virtblk_remove, + .config_changed = virtblk_config_changed, #ifdef CONFIG_PM_SLEEP - .freeze = virtblk_freeze, - .restore = virtblk_restore, + .freeze = virtblk_freeze, + .restore = virtblk_restore, #endif }; diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h index 9ad67b267584..247c8ba8544a 100644 --- a/include/uapi/linux/virtio_blk.h +++ b/include/uapi/linux/virtio_blk.h @@ -28,6 +28,7 @@ #include #include #include +#include /* Feature bits */ #define VIRTIO_BLK_F_BARRIER 0 /* Does host support barriers? */ @@ -114,18 +115,18 @@ struct virtio_blk_config { /* This is the first element of the read scatter-gather list. */ struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ - __u32 type; + __virtio32 type; /* io priority. */ - __u32 ioprio; + __virtio32 ioprio; /* Sector (ie. 512 byte offset) */ - __u64 sector; + __virtio64 sector; }; struct virtio_scsi_inhdr { - __u32 errors; - __u32 data_len; - __u32 sense_len; - __u32 residual; + __virtio32 errors; + __virtio32 data_len; + __virtio32 sense_len; + __virtio32 residual; }; /* And this is the final byte of the write scatter-gather list. */ -- cgit v1.2.3 From 031f5e0338ef672e728c878129fa044e8830c31a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 19 Nov 2014 14:44:40 +0200 Subject: tun: move internal flag defines out of uapi TUN_ flags are internal and never exposed to userspace. Any application using it is almost certainly buggy. Move them out to tun.c. Note: we remove these completely in follow-up patches, this code movement is split out for ease of review. Signed-off-by: Michael S. Tsirkin --- drivers/net/tun.c | 72 +++++++++++++-------------------------------- include/uapi/linux/if_tun.h | 16 ++-------- 2 files changed, 24 insertions(+), 64 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746994a4..b4e4ca01facc 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -103,6 +103,21 @@ do { \ } while (0) #endif +/* TUN device flags */ + +/* IFF_ATTACH_QUEUE is never stored in device flags, + * overload it to mean fasync when stored there. + */ +#define TUN_FASYNC IFF_ATTACH_QUEUE +#define TUN_NO_PI IFF_NO_PI +/* This flag has no real effect */ +#define TUN_ONE_QUEUE IFF_ONE_QUEUE +#define TUN_PERSIST IFF_PERSIST +#define TUN_VNET_HDR IFF_VNET_HDR +#define TUN_TAP_MQ IFF_MULTI_QUEUE + +#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ + IFF_MULTI_QUEUE) #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -1521,32 +1536,7 @@ static struct proto tun_proto = { static int tun_flags(struct tun_struct *tun) { - int flags = 0; - - if (tun->flags & TUN_TUN_DEV) - flags |= IFF_TUN; - else - flags |= IFF_TAP; - - if (tun->flags & TUN_NO_PI) - flags |= IFF_NO_PI; - - /* This flag has no real effect. We track the value for backwards - * compatibility. - */ - if (tun->flags & TUN_ONE_QUEUE) - flags |= IFF_ONE_QUEUE; - - if (tun->flags & TUN_VNET_HDR) - flags |= IFF_VNET_HDR; - - if (tun->flags & TUN_TAP_MQ) - flags |= IFF_MULTI_QUEUE; - - if (tun->flags & TUN_PERSIST) - flags |= IFF_PERSIST; - - return flags; + return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP); } static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr, @@ -1706,28 +1696,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun_debug(KERN_INFO, tun, "tun_set_iff\n"); - if (ifr->ifr_flags & IFF_NO_PI) - tun->flags |= TUN_NO_PI; - else - tun->flags &= ~TUN_NO_PI; - - /* This flag has no real effect. We track the value for backwards - * compatibility. - */ - if (ifr->ifr_flags & IFF_ONE_QUEUE) - tun->flags |= TUN_ONE_QUEUE; - else - tun->flags &= ~TUN_ONE_QUEUE; - - if (ifr->ifr_flags & IFF_VNET_HDR) - tun->flags |= TUN_VNET_HDR; - else - tun->flags &= ~TUN_VNET_HDR; - - if (ifr->ifr_flags & IFF_MULTI_QUEUE) - tun->flags |= TUN_TAP_MQ; - else - tun->flags &= ~TUN_TAP_MQ; + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); /* Make sure persistent devices do not get stuck in * xoff state. @@ -1890,9 +1860,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, if (cmd == TUNGETFEATURES) { /* Currently this just means: "what IFF flags are valid?". * This is needed because we never checked for invalid flags on - * TUNSETIFF. */ - return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | - IFF_VNET_HDR | IFF_MULTI_QUEUE, + * TUNSETIFF. + */ + return put_user(IFF_TUN | IFF_TAP | TUN_FEATURES, (unsigned int __user*)argp); } else if (cmd == TUNSETQUEUE) return tun_set_queue(file, &ifr); diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index e9502dd1ee2c..277a2607d166 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -22,21 +22,11 @@ /* Read queue size */ #define TUN_READQ_SIZE 500 - -/* TUN device flags */ -#define TUN_TUN_DEV 0x0001 -#define TUN_TAP_DEV 0x0002 +/* TUN device type flags: deprecated. Use IFF_TUN/IFF_TAP instead. */ +#define TUN_TUN_DEV IFF_TUN +#define TUN_TAP_DEV IFF_TAP #define TUN_TYPE_MASK 0x000f -#define TUN_FASYNC 0x0010 -#define TUN_NOCHECKSUM 0x0020 -#define TUN_NO_PI 0x0040 -/* This flag has no real effect */ -#define TUN_ONE_QUEUE 0x0080 -#define TUN_PERSIST 0x0100 -#define TUN_VNET_HDR 0x0200 -#define TUN_TAP_MQ 0x0400 - /* Ioctl defines */ #define TUNSETNOCSUM _IOW('T', 200, int) #define TUNSETDEBUG _IOW('T', 201, int) -- cgit v1.2.3 From e999d6ea2a4f313a5bba514b08f6f01b0c0072a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 18 Nov 2014 17:38:08 +0200 Subject: tun: add VNET_LE flag virtio 1.0 modified virtio net header format, making all fields little endian. Users can tweak header format before submitting it to tun, but this means more data copies where none were necessary. And if the iovec is in RO memory, this means we might need to split iovec also means we might in theory overflow iovec max size. This patch adds a simpler way for applications to handle this, using new "little endian" flag in tun. As a result, tun simply byte-swaps header fields as appropriate. This is a NOP on LE architectures. Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/if_tun.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 277a2607d166..18b2403982f9 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -57,6 +57,7 @@ #define IFF_ONE_QUEUE 0x2000 #define IFF_VNET_HDR 0x4000 #define IFF_TUN_EXCL 0x8000 +#define IFF_VNET_LE 0x10000 #define IFF_MULTI_QUEUE 0x0100 #define IFF_ATTACH_QUEUE 0x0200 #define IFF_DETACH_QUEUE 0x0400 -- cgit v1.2.3 From 106d81f58a5e7f41e857b8805e9d899e87d7a874 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 23 Nov 2014 17:45:53 +0200 Subject: virtio_scsi: move to uapi Guests need to use virtio scsi API, so export it to uapi, nice to e.g. qemu and will help us remember this file affects ABI. Signed-off-by: Michael S. Tsirkin Acked-by: Paolo Bonzini --- include/linux/virtio_scsi.h | 164 --------------------------------------- include/uapi/linux/virtio_scsi.h | 164 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 164 deletions(-) delete mode 100644 include/linux/virtio_scsi.h create mode 100644 include/uapi/linux/virtio_scsi.h (limited to 'include/uapi') diff --git a/include/linux/virtio_scsi.h b/include/linux/virtio_scsi.h deleted file mode 100644 index af448649a975..000000000000 --- a/include/linux/virtio_scsi.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifndef _LINUX_VIRTIO_SCSI_H -#define _LINUX_VIRTIO_SCSI_H - -#include - -#define VIRTIO_SCSI_CDB_SIZE 32 -#define VIRTIO_SCSI_SENSE_SIZE 96 - -/* SCSI command request, followed by data-out */ -struct virtio_scsi_cmd_req { - u8 lun[8]; /* Logical Unit Number */ - __virtio64 tag; /* Command identifier */ - u8 task_attr; /* Task attribute */ - u8 prio; /* SAM command priority field */ - u8 crn; - u8 cdb[VIRTIO_SCSI_CDB_SIZE]; -} __packed; - -/* SCSI command request, followed by protection information */ -struct virtio_scsi_cmd_req_pi { - u8 lun[8]; /* Logical Unit Number */ - __virtio64 tag; /* Command identifier */ - u8 task_attr; /* Task attribute */ - u8 prio; /* SAM command priority field */ - u8 crn; - __virtio32 pi_bytesout; /* DataOUT PI Number of bytes */ - __virtio32 pi_bytesin; /* DataIN PI Number of bytes */ - u8 cdb[VIRTIO_SCSI_CDB_SIZE]; -} __packed; - -/* Response, followed by sense data and data-in */ -struct virtio_scsi_cmd_resp { - __virtio32 sense_len; /* Sense data length */ - __virtio32 resid; /* Residual bytes in data buffer */ - __virtio16 status_qualifier; /* Status qualifier */ - u8 status; /* Command completion status */ - u8 response; /* Response values */ - u8 sense[VIRTIO_SCSI_SENSE_SIZE]; -} __packed; - -/* Task Management Request */ -struct virtio_scsi_ctrl_tmf_req { - __virtio32 type; - __virtio32 subtype; - u8 lun[8]; - __virtio64 tag; -} __packed; - -struct virtio_scsi_ctrl_tmf_resp { - u8 response; -} __packed; - -/* Asynchronous notification query/subscription */ -struct virtio_scsi_ctrl_an_req { - __virtio32 type; - u8 lun[8]; - __virtio32 event_requested; -} __packed; - -struct virtio_scsi_ctrl_an_resp { - __virtio32 event_actual; - u8 response; -} __packed; - -struct virtio_scsi_event { - __virtio32 event; - u8 lun[8]; - __virtio32 reason; -} __packed; - -struct virtio_scsi_config { - u32 num_queues; - u32 seg_max; - u32 max_sectors; - u32 cmd_per_lun; - u32 event_info_size; - u32 sense_size; - u32 cdb_size; - u16 max_channel; - u16 max_target; - u32 max_lun; -} __packed; - -/* Feature Bits */ -#define VIRTIO_SCSI_F_INOUT 0 -#define VIRTIO_SCSI_F_HOTPLUG 1 -#define VIRTIO_SCSI_F_CHANGE 2 -#define VIRTIO_SCSI_F_T10_PI 3 - -/* Response codes */ -#define VIRTIO_SCSI_S_OK 0 -#define VIRTIO_SCSI_S_OVERRUN 1 -#define VIRTIO_SCSI_S_ABORTED 2 -#define VIRTIO_SCSI_S_BAD_TARGET 3 -#define VIRTIO_SCSI_S_RESET 4 -#define VIRTIO_SCSI_S_BUSY 5 -#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 -#define VIRTIO_SCSI_S_TARGET_FAILURE 7 -#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 -#define VIRTIO_SCSI_S_FAILURE 9 -#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 -#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 -#define VIRTIO_SCSI_S_INCORRECT_LUN 12 - -/* Controlq type codes. */ -#define VIRTIO_SCSI_T_TMF 0 -#define VIRTIO_SCSI_T_AN_QUERY 1 -#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 - -/* Valid TMF subtypes. */ -#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 -#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 -#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 -#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 -#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 -#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 -#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 -#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 - -/* Events. */ -#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 -#define VIRTIO_SCSI_T_NO_EVENT 0 -#define VIRTIO_SCSI_T_TRANSPORT_RESET 1 -#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 -#define VIRTIO_SCSI_T_PARAM_CHANGE 3 - -/* Reasons of transport reset event */ -#define VIRTIO_SCSI_EVT_RESET_HARD 0 -#define VIRTIO_SCSI_EVT_RESET_RESCAN 1 -#define VIRTIO_SCSI_EVT_RESET_REMOVED 2 - -#define VIRTIO_SCSI_S_SIMPLE 0 -#define VIRTIO_SCSI_S_ORDERED 1 -#define VIRTIO_SCSI_S_HEAD 2 -#define VIRTIO_SCSI_S_ACA 3 - - -#endif /* _LINUX_VIRTIO_SCSI_H */ diff --git a/include/uapi/linux/virtio_scsi.h b/include/uapi/linux/virtio_scsi.h new file mode 100644 index 000000000000..af448649a975 --- /dev/null +++ b/include/uapi/linux/virtio_scsi.h @@ -0,0 +1,164 @@ +/* + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _LINUX_VIRTIO_SCSI_H +#define _LINUX_VIRTIO_SCSI_H + +#include + +#define VIRTIO_SCSI_CDB_SIZE 32 +#define VIRTIO_SCSI_SENSE_SIZE 96 + +/* SCSI command request, followed by data-out */ +struct virtio_scsi_cmd_req { + u8 lun[8]; /* Logical Unit Number */ + __virtio64 tag; /* Command identifier */ + u8 task_attr; /* Task attribute */ + u8 prio; /* SAM command priority field */ + u8 crn; + u8 cdb[VIRTIO_SCSI_CDB_SIZE]; +} __packed; + +/* SCSI command request, followed by protection information */ +struct virtio_scsi_cmd_req_pi { + u8 lun[8]; /* Logical Unit Number */ + __virtio64 tag; /* Command identifier */ + u8 task_attr; /* Task attribute */ + u8 prio; /* SAM command priority field */ + u8 crn; + __virtio32 pi_bytesout; /* DataOUT PI Number of bytes */ + __virtio32 pi_bytesin; /* DataIN PI Number of bytes */ + u8 cdb[VIRTIO_SCSI_CDB_SIZE]; +} __packed; + +/* Response, followed by sense data and data-in */ +struct virtio_scsi_cmd_resp { + __virtio32 sense_len; /* Sense data length */ + __virtio32 resid; /* Residual bytes in data buffer */ + __virtio16 status_qualifier; /* Status qualifier */ + u8 status; /* Command completion status */ + u8 response; /* Response values */ + u8 sense[VIRTIO_SCSI_SENSE_SIZE]; +} __packed; + +/* Task Management Request */ +struct virtio_scsi_ctrl_tmf_req { + __virtio32 type; + __virtio32 subtype; + u8 lun[8]; + __virtio64 tag; +} __packed; + +struct virtio_scsi_ctrl_tmf_resp { + u8 response; +} __packed; + +/* Asynchronous notification query/subscription */ +struct virtio_scsi_ctrl_an_req { + __virtio32 type; + u8 lun[8]; + __virtio32 event_requested; +} __packed; + +struct virtio_scsi_ctrl_an_resp { + __virtio32 event_actual; + u8 response; +} __packed; + +struct virtio_scsi_event { + __virtio32 event; + u8 lun[8]; + __virtio32 reason; +} __packed; + +struct virtio_scsi_config { + u32 num_queues; + u32 seg_max; + u32 max_sectors; + u32 cmd_per_lun; + u32 event_info_size; + u32 sense_size; + u32 cdb_size; + u16 max_channel; + u16 max_target; + u32 max_lun; +} __packed; + +/* Feature Bits */ +#define VIRTIO_SCSI_F_INOUT 0 +#define VIRTIO_SCSI_F_HOTPLUG 1 +#define VIRTIO_SCSI_F_CHANGE 2 +#define VIRTIO_SCSI_F_T10_PI 3 + +/* Response codes */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* Controlq type codes. */ +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2 + +/* Valid TMF subtypes. */ +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* Events. */ +#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000 +#define VIRTIO_SCSI_T_NO_EVENT 0 +#define VIRTIO_SCSI_T_TRANSPORT_RESET 1 +#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2 +#define VIRTIO_SCSI_T_PARAM_CHANGE 3 + +/* Reasons of transport reset event */ +#define VIRTIO_SCSI_EVT_RESET_HARD 0 +#define VIRTIO_SCSI_EVT_RESET_RESCAN 1 +#define VIRTIO_SCSI_EVT_RESET_REMOVED 2 + +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + + +#endif /* _LINUX_VIRTIO_SCSI_H */ -- cgit v1.2.3 From fba7f020e80d59a90d6135b0965cd488cc86f695 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 23 Nov 2014 17:49:12 +0200 Subject: virtio_scsi: export to userspace Replace uXX by __uXX and _packed by __attribute((packed)) as seems to be the norm for userspace headers. Signed-off-by: Michael S. Tsirkin Acked-by: Paolo Bonzini --- include/uapi/linux/Kbuild | 1 + include/uapi/linux/virtio_scsi.h | 74 ++++++++++++++++++++-------------------- 2 files changed, 38 insertions(+), 37 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 615f96a6022a..2e8fd10f0299 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -428,6 +428,7 @@ header-y += virtio_net.h header-y += virtio_pci.h header-y += virtio_ring.h header-y += virtio_rng.h +header-y += virtio_scsi.h header-y += vm_sockets.h header-y += vt.h header-y += wait.h diff --git a/include/uapi/linux/virtio_scsi.h b/include/uapi/linux/virtio_scsi.h index af448649a975..42b9370771b0 100644 --- a/include/uapi/linux/virtio_scsi.h +++ b/include/uapi/linux/virtio_scsi.h @@ -34,78 +34,78 @@ /* SCSI command request, followed by data-out */ struct virtio_scsi_cmd_req { - u8 lun[8]; /* Logical Unit Number */ + __u8 lun[8]; /* Logical Unit Number */ __virtio64 tag; /* Command identifier */ - u8 task_attr; /* Task attribute */ - u8 prio; /* SAM command priority field */ - u8 crn; - u8 cdb[VIRTIO_SCSI_CDB_SIZE]; -} __packed; + __u8 task_attr; /* Task attribute */ + __u8 prio; /* SAM command priority field */ + __u8 crn; + __u8 cdb[VIRTIO_SCSI_CDB_SIZE]; +} __attribute__((packed)); /* SCSI command request, followed by protection information */ struct virtio_scsi_cmd_req_pi { - u8 lun[8]; /* Logical Unit Number */ + __u8 lun[8]; /* Logical Unit Number */ __virtio64 tag; /* Command identifier */ - u8 task_attr; /* Task attribute */ - u8 prio; /* SAM command priority field */ - u8 crn; + __u8 task_attr; /* Task attribute */ + __u8 prio; /* SAM command priority field */ + __u8 crn; __virtio32 pi_bytesout; /* DataOUT PI Number of bytes */ __virtio32 pi_bytesin; /* DataIN PI Number of bytes */ - u8 cdb[VIRTIO_SCSI_CDB_SIZE]; -} __packed; + __u8 cdb[VIRTIO_SCSI_CDB_SIZE]; +} __attribute__((packed)); /* Response, followed by sense data and data-in */ struct virtio_scsi_cmd_resp { __virtio32 sense_len; /* Sense data length */ __virtio32 resid; /* Residual bytes in data buffer */ __virtio16 status_qualifier; /* Status qualifier */ - u8 status; /* Command completion status */ - u8 response; /* Response values */ - u8 sense[VIRTIO_SCSI_SENSE_SIZE]; -} __packed; + __u8 status; /* Command completion status */ + __u8 response; /* Response values */ + __u8 sense[VIRTIO_SCSI_SENSE_SIZE]; +} __attribute__((packed)); /* Task Management Request */ struct virtio_scsi_ctrl_tmf_req { __virtio32 type; __virtio32 subtype; - u8 lun[8]; + __u8 lun[8]; __virtio64 tag; -} __packed; +} __attribute__((packed)); struct virtio_scsi_ctrl_tmf_resp { - u8 response; -} __packed; + __u8 response; +} __attribute__((packed)); /* Asynchronous notification query/subscription */ struct virtio_scsi_ctrl_an_req { __virtio32 type; - u8 lun[8]; + __u8 lun[8]; __virtio32 event_requested; -} __packed; +} __attribute__((packed)); struct virtio_scsi_ctrl_an_resp { __virtio32 event_actual; - u8 response; -} __packed; + __u8 response; +} __attribute__((packed)); struct virtio_scsi_event { __virtio32 event; - u8 lun[8]; + __u8 lun[8]; __virtio32 reason; -} __packed; +} __attribute__((packed)); struct virtio_scsi_config { - u32 num_queues; - u32 seg_max; - u32 max_sectors; - u32 cmd_per_lun; - u32 event_info_size; - u32 sense_size; - u32 cdb_size; - u16 max_channel; - u16 max_target; - u32 max_lun; -} __packed; + __u32 num_queues; + __u32 seg_max; + __u32 max_sectors; + __u32 cmd_per_lun; + __u32 event_info_size; + __u32 sense_size; + __u32 cdb_size; + __u16 max_channel; + __u16 max_target; + __u32 max_lun; +} __attribute__((packed)); /* Feature Bits */ #define VIRTIO_SCSI_F_INOUT 0 -- cgit v1.2.3 From 1f0f9106f92c7d49ec92baa6ac757aa8b0590eff Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 1 Dec 2014 13:17:40 +0200 Subject: virtio_console: virtio 1.0 support Pretty straight-forward, just use accessors for all fields. Signed-off-by: Michael S. Tsirkin --- drivers/char/virtio_console.c | 29 +++++++++++++++++------------ include/uapi/linux/virtio_console.h | 7 ++++--- 2 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 8d00aa7f60f0..775c89821364 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -566,9 +566,9 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, if (!use_multiport(portdev)) return 0; - cpkt.id = port_id; - cpkt.event = event; - cpkt.value = value; + cpkt.id = cpu_to_virtio32(portdev->vdev, port_id); + cpkt.event = cpu_to_virtio16(portdev->vdev, event); + cpkt.value = cpu_to_virtio16(portdev->vdev, value); vq = portdev->c_ovq; @@ -1602,7 +1602,8 @@ static void unplug_port(struct port *port) } /* Any private messages that the Host and Guest want to share */ -static void handle_control_message(struct ports_device *portdev, +static void handle_control_message(struct virtio_device *vdev, + struct ports_device *portdev, struct port_buffer *buf) { struct virtio_console_control *cpkt; @@ -1612,15 +1613,16 @@ static void handle_control_message(struct ports_device *portdev, cpkt = (struct virtio_console_control *)(buf->buf + buf->offset); - port = find_port_by_id(portdev, cpkt->id); - if (!port && cpkt->event != VIRTIO_CONSOLE_PORT_ADD) { + port = find_port_by_id(portdev, virtio32_to_cpu(vdev, cpkt->id)); + if (!port && + cpkt->event != cpu_to_virtio16(vdev, VIRTIO_CONSOLE_PORT_ADD)) { /* No valid header at start of buffer. Drop it. */ dev_dbg(&portdev->vdev->dev, "Invalid index %u in control packet\n", cpkt->id); return; } - switch (cpkt->event) { + switch (virtio16_to_cpu(vdev, cpkt->event)) { case VIRTIO_CONSOLE_PORT_ADD: if (port) { dev_dbg(&portdev->vdev->dev, @@ -1628,13 +1630,15 @@ static void handle_control_message(struct ports_device *portdev, send_control_msg(port, VIRTIO_CONSOLE_PORT_READY, 1); break; } - if (cpkt->id >= portdev->config.max_nr_ports) { + if (virtio32_to_cpu(vdev, cpkt->id) >= + portdev->config.max_nr_ports) { dev_warn(&portdev->vdev->dev, - "Request for adding port with out-of-bound id %u, max. supported id: %u\n", + "Request for adding port with " + "out-of-bound id %u, max. supported id: %u\n", cpkt->id, portdev->config.max_nr_ports - 1); break; } - add_port(portdev, cpkt->id); + add_port(portdev, virtio32_to_cpu(vdev, cpkt->id)); break; case VIRTIO_CONSOLE_PORT_REMOVE: unplug_port(port); @@ -1670,7 +1674,7 @@ static void handle_control_message(struct ports_device *portdev, break; } case VIRTIO_CONSOLE_PORT_OPEN: - port->host_connected = cpkt->value; + port->host_connected = virtio16_to_cpu(vdev, cpkt->value); wake_up_interruptible(&port->waitqueue); /* * If the host port got closed and the host had any @@ -1752,7 +1756,7 @@ static void control_work_handler(struct work_struct *work) buf->len = len; buf->offset = 0; - handle_control_message(portdev, buf); + handle_control_message(vq->vdev, portdev, buf); spin_lock(&portdev->c_ivq_lock); if (add_inbuf(portdev->c_ivq, buf) < 0) { @@ -2126,6 +2130,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_CONSOLE_F_SIZE, VIRTIO_CONSOLE_F_MULTIPORT, + VIRTIO_F_VERSION_1, }; static struct virtio_device_id rproc_serial_id_table[] = { diff --git a/include/uapi/linux/virtio_console.h b/include/uapi/linux/virtio_console.h index ba260dd0b33a..b7fb108c9310 100644 --- a/include/uapi/linux/virtio_console.h +++ b/include/uapi/linux/virtio_console.h @@ -32,6 +32,7 @@ #ifndef _UAPI_LINUX_VIRTIO_CONSOLE_H #define _UAPI_LINUX_VIRTIO_CONSOLE_H #include +#include #include #include @@ -58,9 +59,9 @@ struct virtio_console_config { * particular port. */ struct virtio_console_control { - __u32 id; /* Port number */ - __u16 event; /* The kind of control event (see below) */ - __u16 value; /* Extra information for the key */ + __virtio32 id; /* Port number */ + __virtio16 event; /* The kind of control event (see below) */ + __virtio16 value; /* Extra information for the key */ }; /* Some events for control messages */ -- cgit v1.2.3 From 747ae34a6ef681fbd993be214d8c0a30bd4a2fda Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 1 Dec 2014 15:52:40 +0200 Subject: virtio: make VIRTIO_F_VERSION_1 a transport bit Activate VIRTIO_F_VERSION_1 automatically unless legacy_only is set. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 6 +++++- drivers/virtio/virtio_ring.c | 2 ++ include/uapi/linux/virtio_config.h | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index f9ad99c8b352..fa6b75db5f1f 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -197,7 +197,11 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } - if (driver_features & device_features & (1ULL << VIRTIO_F_VERSION_1)) + /* Detect legacy-only drivers and disable VIRTIO_F_VERSION_1. */ + if (drv->legacy_only) + device_features &= ~(1ULL << VIRTIO_F_VERSION_1); + + if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else dev->features = driver_features_legacy & device_features; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 55532a43ead3..00ec6b3f96b2 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -780,6 +780,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_RING_F_EVENT_IDX: break; + case VIRTIO_F_VERSION_1: + break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h index 4d05671fbaec..a6d0cdeaacd4 100644 --- a/include/uapi/linux/virtio_config.h +++ b/include/uapi/linux/virtio_config.h @@ -43,11 +43,11 @@ /* We've given up on this device. */ #define VIRTIO_CONFIG_S_FAILED 0x80 -/* Some virtio feature bits (currently bits 28 through 31) are reserved for the +/* Some virtio feature bits (currently bits 28 through 32) are reserved for the * transport being used (eg. virtio_ring), the rest are per-device feature * bits. */ #define VIRTIO_TRANSPORT_F_START 28 -#define VIRTIO_TRANSPORT_F_END 32 +#define VIRTIO_TRANSPORT_F_END 33 /* Do we get callbacks when the ring is completely used, even if we've * suppressed them? */ -- cgit v1.2.3 From 5f4d8d97f5d979c6a62dcc45619b06d34311937c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 3 Dec 2014 09:33:17 -0800 Subject: tc_act: export uapi header file This file is used by iproute2 and should be exported. Signed-off-by: Stephen Hemminger Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild index 56f121605c99..b057da2b87a4 100644 --- a/include/uapi/linux/tc_act/Kbuild +++ b/include/uapi/linux/tc_act/Kbuild @@ -7,3 +7,4 @@ header-y += tc_mirred.h header-y += tc_nat.h header-y += tc_pedit.h header-y += tc_skbedit.h +header-y += tc_vlan.h -- cgit v1.2.3 From 6e3a8a937c2f86ee0b2d354808fc026a143b4518 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Dec 2014 16:13:23 -0800 Subject: tcp_cubic: add SNMP counters to track how effective is Hystart When deploying FQ pacing, one thing we noticed is that CUBIC Hystart triggers too soon. Having SNMP counters to have an idea of how often the various Hystart methods trigger is useful prior to any modifications. This patch adds SNMP counters tracking, how many time "ack train" or "Delay" based Hystart triggers, and cumulative sum of cwnd at the time Hystart decided to end SS (Slow Start) myhost:~# nstat -a | grep Hystart TcpExtTCPHystartTrainDetect 9 0.0 TcpExtTCPHystartTrainCwnd 20650 0.0 TcpExtTCPHystartDelayDetect 10 0.0 TcpExtTCPHystartDelayCwnd 360 0.0 -> Train detection was triggered 9 times, and average cwnd was 20650/9=2294, Delay detection was triggered 10 times and average cwnd was 36 Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 4 ++++ net/ipv4/proc.c | 4 ++++ net/ipv4/tcp_cubic.c | 31 ++++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 30f541b32895..b22224100011 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -266,6 +266,10 @@ enum LINUX_MIB_TCPWANTZEROWINDOWADV, /* TCPWantZeroWindowAdv */ LINUX_MIB_TCPSYNRETRANS, /* TCPSynRetrans */ LINUX_MIB_TCPORIGDATASENT, /* TCPOrigDataSent */ + LINUX_MIB_TCPHYSTARTTRAINDETECT, /* TCPHystartTrainDetect */ + LINUX_MIB_TCPHYSTARTTRAINCWND, /* TCPHystartTrainCwnd */ + LINUX_MIB_TCPHYSTARTDELAYDETECT, /* TCPHystartDelayDetect */ + LINUX_MIB_TCPHYSTARTDELAYCWND, /* TCPHystartDelayCwnd */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 6513ade8d6dc..8f9cd200ce20 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -288,6 +288,10 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV), SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS), SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT), + SNMP_MIB_ITEM("TCPHystartTrainDetect", LINUX_MIB_TCPHYSTARTTRAINDETECT), + SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), + SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), + SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 20de0118c98e..c1d07c7ed03d 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -363,16 +363,28 @@ static void hystart_update(struct sock *sk, u32 delay) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (!(ca->found & hystart_detect)) { + if (ca->found & hystart_detect) + return; + + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } + } + if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { if (ca->curr_rtt == 0 || ca->curr_rtt > delay) @@ -381,15 +393,16 @@ static void hystart_update(struct sock *sk, u32 delay) ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + - HYSTART_DELAY_THRESH(ca->delay_min>>4)) + HYSTART_DELAY_THRESH(ca->delay_min>>4)) { ca->found |= HYSTART_DELAY; + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS_BH(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); + tp->snd_ssthresh = tp->snd_cwnd; + } } - /* - * Either one of two conditions are met, - * we exit from slow start immediately. - */ - if (ca->found & hystart_detect) - tp->snd_ssthresh = tp->snd_cwnd; } } -- cgit v1.2.3 From fc0bdbbc67c9f7307d41f415abb1165778fe803f Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 8 Dec 2014 14:04:19 -0800 Subject: bridge: new mode flag to indicate mode 'undefined' This patch adds mode BRIDGE_MODE_UNDEF for cases where mode is not needed. Signed-off-by: Roopa Prabhu Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 296a556454e3..05f08695c6c9 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -106,6 +106,7 @@ struct __fdb_entry { #define BRIDGE_MODE_VEB 0 /* Default loopback mode */ #define BRIDGE_MODE_VEPA 1 /* 802.1Qbg defined VEPA mode */ #define BRIDGE_MODE_SWDEV 2 /* Full switch device offload */ +#define BRIDGE_MODE_UNDEF 0xFFFF /* mode undefined */ /* Bridge management nested attributes * [IFLA_AF_SPEC] = { -- cgit v1.2.3 From 4a5fdfe8b3669070112105573f2d7c487ec7ad0d Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 8 Dec 2014 14:04:21 -0800 Subject: bridge: remove mode BRIDGE_MODE_SWDEV This patch removes bridge mode swdev. Users can use BRIDGE_FLAGS_SELF to indicate swdev offload if needed. Signed-off-by: Roopa Prabhu Signed-off-by: Scott Feldman Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 05f08695c6c9..b03ee8f62d3c 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -105,7 +105,6 @@ struct __fdb_entry { #define BRIDGE_MODE_VEB 0 /* Default loopback mode */ #define BRIDGE_MODE_VEPA 1 /* 802.1Qbg defined VEPA mode */ -#define BRIDGE_MODE_SWDEV 2 /* Full switch device offload */ #define BRIDGE_MODE_UNDEF 0xFFFF /* mode undefined */ /* Bridge management nested attributes -- cgit v1.2.3 From 8985f4ac1c42bd25799f294f4e87fa73064673c7 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 9 Dec 2014 00:10:49 +0900 Subject: ALSA: oxfw: Add hwdep interface This interface is designed for mixer/control application. By using this interface, an application can get information about firewire node, can lock/unlock kernel streaming and can get notification at starting/stopping kernel streaming. Signed-off-by: Takashi Sakamoto Acked-by: Clemens Ladisch Signed-off-by: Takashi Iwai --- include/uapi/sound/asound.h | 3 +- include/uapi/sound/firewire.h | 3 +- sound/firewire/Kconfig | 1 + sound/firewire/oxfw/Makefile | 2 +- sound/firewire/oxfw/oxfw-hwdep.c | 190 ++++++++++++++++++++++++++++++++++++++ sound/firewire/oxfw/oxfw-midi.c | 16 ++++ sound/firewire/oxfw/oxfw-pcm.c | 12 ++- sound/firewire/oxfw/oxfw-stream.c | 39 ++++++++ sound/firewire/oxfw/oxfw.c | 5 + sound/firewire/oxfw/oxfw.h | 13 +++ 10 files changed, 280 insertions(+), 4 deletions(-) create mode 100644 sound/firewire/oxfw/oxfw-hwdep.c (limited to 'include/uapi') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 941d32f007dc..1f23cd635957 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -96,9 +96,10 @@ enum { SNDRV_HWDEP_IFACE_FW_DICE, /* TC DICE FireWire device */ SNDRV_HWDEP_IFACE_FW_FIREWORKS, /* Echo Audio Fireworks based device */ SNDRV_HWDEP_IFACE_FW_BEBOB, /* BridgeCo BeBoB based device */ + SNDRV_HWDEP_IFACE_FW_OXFW, /* Oxford OXFW970/971 based device */ /* Don't forget to change the following: */ - SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_BEBOB + SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_OXFW }; struct snd_hwdep_info { diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h index af4bd136c75d..49122df3b56b 100644 --- a/include/uapi/sound/firewire.h +++ b/include/uapi/sound/firewire.h @@ -55,7 +55,8 @@ union snd_firewire_event { #define SNDRV_FIREWIRE_TYPE_DICE 1 #define SNDRV_FIREWIRE_TYPE_FIREWORKS 2 #define SNDRV_FIREWIRE_TYPE_BEBOB 3 -/* AV/C, RME, MOTU, ... */ +#define SNDRV_FIREWIRE_TYPE_OXFW 4 +/* RME, MOTU, ... */ struct snd_firewire_get_info { unsigned int type; /* SNDRV_FIREWIRE_TYPE_xxx */ diff --git a/sound/firewire/Kconfig b/sound/firewire/Kconfig index 6364e5b90a00..ecec547782b2 100644 --- a/sound/firewire/Kconfig +++ b/sound/firewire/Kconfig @@ -26,6 +26,7 @@ config SND_DICE config SND_OXFW tristate "Oxford Semiconductor FW970/971 chipset support" select SND_FIREWIRE_LIB + select SND_HWDEP help Say Y here to include support for FireWire devices based on Oxford Semiconductor FW970/971 chipset. diff --git a/sound/firewire/oxfw/Makefile b/sound/firewire/oxfw/Makefile index 3904a302d48e..a926850864f6 100644 --- a/sound/firewire/oxfw/Makefile +++ b/sound/firewire/oxfw/Makefile @@ -1,3 +1,3 @@ snd-oxfw-objs := oxfw-command.o oxfw-stream.o oxfw-control.o oxfw-pcm.o \ - oxfw-proc.o oxfw-midi.o oxfw.o + oxfw-proc.o oxfw-midi.o oxfw-hwdep.o oxfw.o obj-m += snd-oxfw.o diff --git a/sound/firewire/oxfw/oxfw-hwdep.c b/sound/firewire/oxfw/oxfw-hwdep.c new file mode 100644 index 000000000000..ff2687ad0460 --- /dev/null +++ b/sound/firewire/oxfw/oxfw-hwdep.c @@ -0,0 +1,190 @@ +/* + * oxfw_hwdep.c - a part of driver for OXFW970/971 based devices + * + * Copyright (c) 2014 Takashi Sakamoto + * + * Licensed under the terms of the GNU General Public License, version 2. + */ + +/* + * This codes give three functionality. + * + * 1.get firewire node information + * 2.get notification about starting/stopping stream + * 3.lock/unlock stream + */ + +#include "oxfw.h" + +static long hwdep_read(struct snd_hwdep *hwdep, char __user *buf, long count, + loff_t *offset) +{ + struct snd_oxfw *oxfw = hwdep->private_data; + DEFINE_WAIT(wait); + union snd_firewire_event event; + + spin_lock_irq(&oxfw->lock); + + while (!oxfw->dev_lock_changed) { + prepare_to_wait(&oxfw->hwdep_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_irq(&oxfw->lock); + schedule(); + finish_wait(&oxfw->hwdep_wait, &wait); + if (signal_pending(current)) + return -ERESTARTSYS; + spin_lock_irq(&oxfw->lock); + } + + memset(&event, 0, sizeof(event)); + if (oxfw->dev_lock_changed) { + event.lock_status.type = SNDRV_FIREWIRE_EVENT_LOCK_STATUS; + event.lock_status.status = (oxfw->dev_lock_count > 0); + oxfw->dev_lock_changed = false; + + count = min_t(long, count, sizeof(event.lock_status)); + } + + spin_unlock_irq(&oxfw->lock); + + if (copy_to_user(buf, &event, count)) + return -EFAULT; + + return count; +} + +static unsigned int hwdep_poll(struct snd_hwdep *hwdep, struct file *file, + poll_table *wait) +{ + struct snd_oxfw *oxfw = hwdep->private_data; + unsigned int events; + + poll_wait(file, &oxfw->hwdep_wait, wait); + + spin_lock_irq(&oxfw->lock); + if (oxfw->dev_lock_changed) + events = POLLIN | POLLRDNORM; + else + events = 0; + spin_unlock_irq(&oxfw->lock); + + return events; +} + +static int hwdep_get_info(struct snd_oxfw *oxfw, void __user *arg) +{ + struct fw_device *dev = fw_parent_device(oxfw->unit); + struct snd_firewire_get_info info; + + memset(&info, 0, sizeof(info)); + info.type = SNDRV_FIREWIRE_TYPE_OXFW; + info.card = dev->card->index; + *(__be32 *)&info.guid[0] = cpu_to_be32(dev->config_rom[3]); + *(__be32 *)&info.guid[4] = cpu_to_be32(dev->config_rom[4]); + strlcpy(info.device_name, dev_name(&dev->device), + sizeof(info.device_name)); + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int hwdep_lock(struct snd_oxfw *oxfw) +{ + int err; + + spin_lock_irq(&oxfw->lock); + + if (oxfw->dev_lock_count == 0) { + oxfw->dev_lock_count = -1; + err = 0; + } else { + err = -EBUSY; + } + + spin_unlock_irq(&oxfw->lock); + + return err; +} + +static int hwdep_unlock(struct snd_oxfw *oxfw) +{ + int err; + + spin_lock_irq(&oxfw->lock); + + if (oxfw->dev_lock_count == -1) { + oxfw->dev_lock_count = 0; + err = 0; + } else { + err = -EBADFD; + } + + spin_unlock_irq(&oxfw->lock); + + return err; +} + +static int hwdep_release(struct snd_hwdep *hwdep, struct file *file) +{ + struct snd_oxfw *oxfw = hwdep->private_data; + + spin_lock_irq(&oxfw->lock); + if (oxfw->dev_lock_count == -1) + oxfw->dev_lock_count = 0; + spin_unlock_irq(&oxfw->lock); + + return 0; +} + +static int hwdep_ioctl(struct snd_hwdep *hwdep, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct snd_oxfw *oxfw = hwdep->private_data; + + switch (cmd) { + case SNDRV_FIREWIRE_IOCTL_GET_INFO: + return hwdep_get_info(oxfw, (void __user *)arg); + case SNDRV_FIREWIRE_IOCTL_LOCK: + return hwdep_lock(oxfw); + case SNDRV_FIREWIRE_IOCTL_UNLOCK: + return hwdep_unlock(oxfw); + default: + return -ENOIOCTLCMD; + } +} + +#ifdef CONFIG_COMPAT +static int hwdep_compat_ioctl(struct snd_hwdep *hwdep, struct file *file, + unsigned int cmd, unsigned long arg) +{ + return hwdep_ioctl(hwdep, file, cmd, + (unsigned long)compat_ptr(arg)); +} +#else +#define hwdep_compat_ioctl NULL +#endif + +int snd_oxfw_create_hwdep(struct snd_oxfw *oxfw) +{ + static const struct snd_hwdep_ops hwdep_ops = { + .read = hwdep_read, + .release = hwdep_release, + .poll = hwdep_poll, + .ioctl = hwdep_ioctl, + .ioctl_compat = hwdep_compat_ioctl, + }; + struct snd_hwdep *hwdep; + int err; + + err = snd_hwdep_new(oxfw->card, oxfw->card->driver, 0, &hwdep); + if (err < 0) + goto end; + strcpy(hwdep->name, oxfw->card->driver); + hwdep->iface = SNDRV_HWDEP_IFACE_FW_OXFW; + hwdep->ops = hwdep_ops; + hwdep->private_data = oxfw; + hwdep->exclusive = true; +end: + return err; +} diff --git a/sound/firewire/oxfw/oxfw-midi.c b/sound/firewire/oxfw/oxfw-midi.c index 334b11d1a422..540a30338516 100644 --- a/sound/firewire/oxfw/oxfw-midi.c +++ b/sound/firewire/oxfw/oxfw-midi.c @@ -13,6 +13,10 @@ static int midi_capture_open(struct snd_rawmidi_substream *substream) struct snd_oxfw *oxfw = substream->rmidi->private_data; int err; + err = snd_oxfw_stream_lock_try(oxfw); + if (err < 0) + return err; + mutex_lock(&oxfw->mutex); oxfw->capture_substreams++; @@ -20,6 +24,9 @@ static int midi_capture_open(struct snd_rawmidi_substream *substream) mutex_unlock(&oxfw->mutex); + if (err < 0) + snd_oxfw_stream_lock_release(oxfw); + return err; } @@ -28,6 +35,10 @@ static int midi_playback_open(struct snd_rawmidi_substream *substream) struct snd_oxfw *oxfw = substream->rmidi->private_data; int err; + err = snd_oxfw_stream_lock_try(oxfw); + if (err < 0) + return err; + mutex_lock(&oxfw->mutex); oxfw->playback_substreams++; @@ -35,6 +46,9 @@ static int midi_playback_open(struct snd_rawmidi_substream *substream) mutex_unlock(&oxfw->mutex); + if (err < 0) + snd_oxfw_stream_lock_release(oxfw); + return err; } @@ -49,6 +63,7 @@ static int midi_capture_close(struct snd_rawmidi_substream *substream) mutex_unlock(&oxfw->mutex); + snd_oxfw_stream_lock_release(oxfw); return 0; } @@ -63,6 +78,7 @@ static int midi_playback_close(struct snd_rawmidi_substream *substream) mutex_unlock(&oxfw->mutex); + snd_oxfw_stream_lock_release(oxfw); return 0; } diff --git a/sound/firewire/oxfw/oxfw-pcm.c b/sound/firewire/oxfw/oxfw-pcm.c index e84fc9c4bfd1..9bc556b15a92 100644 --- a/sound/firewire/oxfw/oxfw-pcm.c +++ b/sound/firewire/oxfw/oxfw-pcm.c @@ -192,10 +192,14 @@ static int pcm_open(struct snd_pcm_substream *substream) struct snd_oxfw *oxfw = substream->private_data; int err; - err = init_hw_params(oxfw, substream); + err = snd_oxfw_stream_lock_try(oxfw); if (err < 0) goto end; + err = init_hw_params(oxfw, substream); + if (err < 0) + goto err_locked; + /* * When any PCM streams are already running, the available sampling * rate is limited at current value. @@ -210,10 +214,16 @@ static int pcm_open(struct snd_pcm_substream *substream) snd_pcm_set_sync(substream); end: return err; +err_locked: + snd_oxfw_stream_lock_release(oxfw); + return err; } static int pcm_close(struct snd_pcm_substream *substream) { + struct snd_oxfw *oxfw = substream->private_data; + + snd_oxfw_stream_lock_release(oxfw); return 0; } diff --git a/sound/firewire/oxfw/oxfw-stream.c b/sound/firewire/oxfw/oxfw-stream.c index a38b3c36faca..b77cf80f1678 100644 --- a/sound/firewire/oxfw/oxfw-stream.c +++ b/sound/firewire/oxfw/oxfw-stream.c @@ -644,3 +644,42 @@ int snd_oxfw_stream_discover(struct snd_oxfw *oxfw) end: return err; } + +void snd_oxfw_stream_lock_changed(struct snd_oxfw *oxfw) +{ + oxfw->dev_lock_changed = true; + wake_up(&oxfw->hwdep_wait); +} + +int snd_oxfw_stream_lock_try(struct snd_oxfw *oxfw) +{ + int err; + + spin_lock_irq(&oxfw->lock); + + /* user land lock this */ + if (oxfw->dev_lock_count < 0) { + err = -EBUSY; + goto end; + } + + /* this is the first time */ + if (oxfw->dev_lock_count++ == 0) + snd_oxfw_stream_lock_changed(oxfw); + err = 0; +end: + spin_unlock_irq(&oxfw->lock); + return err; +} + +void snd_oxfw_stream_lock_release(struct snd_oxfw *oxfw) +{ + spin_lock_irq(&oxfw->lock); + + if (WARN_ON(oxfw->dev_lock_count <= 0)) + goto end; + if (--oxfw->dev_lock_count == 0) + snd_oxfw_stream_lock_changed(oxfw); +end: + spin_unlock_irq(&oxfw->lock); +} diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index 9cfbfb168dac..cf1d0b55e827 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -139,6 +139,7 @@ static int oxfw_probe(struct fw_unit *unit, oxfw->unit = unit; oxfw->device_info = (const struct device_info *)id->driver_data; spin_lock_init(&oxfw->lock); + init_waitqueue_head(&oxfw->hwdep_wait); err = snd_oxfw_stream_discover(oxfw); if (err < 0) @@ -164,6 +165,10 @@ static int oxfw_probe(struct fw_unit *unit, if (err < 0) goto error; + err = snd_oxfw_create_hwdep(oxfw); + if (err < 0) + goto error; + err = snd_oxfw_stream_init_simplex(oxfw, &oxfw->rx_stream); if (err < 0) goto error; diff --git a/sound/firewire/oxfw/oxfw.h b/sound/firewire/oxfw/oxfw.h index 83a54fc73a11..cace5ad4fe76 100644 --- a/sound/firewire/oxfw/oxfw.h +++ b/sound/firewire/oxfw/oxfw.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,8 @@ #include #include #include +#include +#include #include "../lib.h" #include "../fcp.h" @@ -64,6 +67,10 @@ struct snd_oxfw { s16 volume[6]; s16 volume_min; s16 volume_max; + + int dev_lock_count; + bool dev_lock_changed; + wait_queue_head_t hwdep_wait; }; /* @@ -124,6 +131,10 @@ int snd_oxfw_stream_get_current_formation(struct snd_oxfw *oxfw, int snd_oxfw_stream_discover(struct snd_oxfw *oxfw); +void snd_oxfw_stream_lock_changed(struct snd_oxfw *oxfw); +int snd_oxfw_stream_lock_try(struct snd_oxfw *oxfw); +void snd_oxfw_stream_lock_release(struct snd_oxfw *oxfw); + int snd_oxfw_create_pcm(struct snd_oxfw *oxfw); int snd_oxfw_create_mixer(struct snd_oxfw *oxfw); @@ -131,3 +142,5 @@ int snd_oxfw_create_mixer(struct snd_oxfw *oxfw); void snd_oxfw_proc_init(struct snd_oxfw *oxfw); int snd_oxfw_create_midi(struct snd_oxfw *oxfw); + +int snd_oxfw_create_hwdep(struct snd_oxfw *oxfw); -- cgit v1.2.3 From 9e3961a0979817c612b10b2da4f3045ec9faa779 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 10 Dec 2014 15:45:50 -0800 Subject: kernel: add panic_on_warn There have been several times where I have had to rebuild a kernel to cause a panic when hitting a WARN() in the code in order to get a crash dump from a system. Sometimes this is easy to do, other times (such as in the case of a remote admin) it is not trivial to send new images to the user. A much easier method would be a switch to change the WARN() over to a panic. This makes debugging easier in that I can now test the actual image the WARN() was seen on and I do not have to engage in remote debugging. This patch adds a panic_on_warn kernel parameter and /proc/sys/kernel/panic_on_warn calls panic() in the warn_slowpath_common() path. The function will still print out the location of the warning. An example of the panic_on_warn output: The first line below is from the WARN_ON() to output the WARN_ON()'s location. After that the panic() output is displayed. WARNING: CPU: 30 PID: 11698 at /home/prarit/dummy_module/dummy-module.c:25 init_dummy+0x1f/0x30 [dummy_module]() Kernel panic - not syncing: panic_on_warn set ... CPU: 30 PID: 11698 Comm: insmod Tainted: G W OE 3.17.0+ #57 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.00.29.D696.1311111329 11/11/2013 0000000000000000 000000008e3f87df ffff88080f093c38 ffffffff81665190 0000000000000000 ffffffff818aea3d ffff88080f093cb8 ffffffff8165e2ec ffffffff00000008 ffff88080f093cc8 ffff88080f093c68 000000008e3f87df Call Trace: [] dump_stack+0x46/0x58 [] panic+0xd0/0x204 [] ? init_dummy+0x1f/0x30 [dummy_module] [] warn_slowpath_common+0xd0/0xd0 [] ? dummy_greetings+0x40/0x40 [dummy_module] [] warn_slowpath_null+0x1a/0x20 [] init_dummy+0x1f/0x30 [dummy_module] [] do_one_initcall+0xd4/0x210 [] ? __vunmap+0xc2/0x110 [] load_module+0x16a9/0x1b30 [] ? store_uevent+0x70/0x70 [] ? copy_module_from_fd.isra.44+0x129/0x180 [] SyS_finit_module+0xa6/0xd0 [] system_call_fastpath+0x12/0x17 Successfully tested by me. hpa said: There is another very valid use for this: many operators would rather a machine shuts down than being potentially compromised either functionally or security-wise. Signed-off-by: Prarit Bhargava Cc: Jonathan Corbet Cc: Rusty Russell Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Masami Hiramatsu Acked-by: Yasuaki Ishimatsu Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kdump/kdump.txt | 7 +++++++ Documentation/kernel-parameters.txt | 3 +++ Documentation/sysctl/kernel.txt | 40 ++++++++++++++++++++++++------------- include/linux/kernel.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/panic.c | 13 ++++++++++++ kernel/sysctl.c | 9 +++++++++ kernel/sysctl_binary.c | 1 + 8 files changed, 61 insertions(+), 14 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 6c0b9f27e465..bc4bd5a44b88 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -471,6 +471,13 @@ format. Crash is available on Dave Anderson's site at the following URL: http://people.redhat.com/~anderson/ +Trigger Kdump on WARN() +======================= + +The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This +will cause a kdump to occur at the panic() call. In cases where a user wants +to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 +to achieve the same behaviour. Contact ======= diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 838f3776c924..d6eb3636fe5a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2509,6 +2509,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. timeout < 0: reboot immediately Format: + panic_on_warn panic() instead of WARN(). Useful to cause kdump + on a WARN(). + crash_kexec_post_notifiers Run kdump after running panic-notifiers and dumping kmsg. This only for the users who doubt kdump always diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 57baff5bdb80..b5d0c8501a18 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -54,8 +54,9 @@ show up in /proc/sys/kernel: - overflowuid - panic - panic_on_oops -- panic_on_unrecovered_nmi - panic_on_stackoverflow +- panic_on_unrecovered_nmi +- panic_on_warn - pid_max - powersave-nap [ PPC only ] - printk @@ -527,19 +528,6 @@ the recommended setting is 60. ============================================================== -panic_on_unrecovered_nmi: - -The default Linux behaviour on an NMI of either memory or unknown is -to continue operation. For many environments such as scientific -computing it is preferable that the box is taken out and the error -dealt with than an uncorrected parity/ECC error get propagated. - -A small number of systems do generate NMI's for bizarre random reasons -such as power management so the default is off. That sysctl works like -the existing panic controls already in that directory. - -============================================================== - panic_on_oops: Controls the kernel's behaviour when an oops or BUG is encountered. @@ -563,6 +551,30 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. ============================================================== +panic_on_unrecovered_nmi: + +The default Linux behaviour on an NMI of either memory or unknown is +to continue operation. For many environments such as scientific +computing it is preferable that the box is taken out and the error +dealt with than an uncorrected parity/ECC error get propagated. + +A small number of systems do generate NMI's for bizarre random reasons +such as power management so the default is off. That sysctl works like +the existing panic controls already in that directory. + +============================================================== + +panic_on_warn: + +Calls panic() in the WARN() path when set to 1. This is useful to avoid +a kernel rebuild when attempting to kdump at the location of a WARN(). + +0: only WARN(), default behaviour. + +1: call panic() after printing out WARN() location. + +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 446d76a87ba1..233ea8107038 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -427,6 +427,7 @@ extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; +extern int panic_on_warn; extern int sysctl_panic_on_stackoverflow; /* * Only to be used by arch init code. If the user over-wrote the default diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 43aaba1cc037..0956373b56db 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -153,6 +153,7 @@ enum KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */ KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */ }; diff --git a/kernel/panic.c b/kernel/panic.c index cf80672b7924..4d8d6f906dec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,6 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); static bool crash_kexec_post_notifiers; +int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, if (args) vprintk(args->fmt, args->args); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; + panic("panic_on_warn set ...\n"); + } + print_modules(); dump_stack(); print_oops_end_marker(); @@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +core_param(panic_on_warn, panic_on_warn, int, 0644); static int __init setup_crash_kexec_post_notifiers(char *s) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 15f2511a1b7c..7c54ff79afd7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1104,6 +1104,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 9a4f750a2963..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, {} }; -- cgit v1.2.3 From e149ed2b805fefdccf7ccdfc19eca22fdd4514ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Nov 2014 10:57:28 -0400 Subject: take the targets of /proc/*/ns/* symlinks to separate fs New pseudo-filesystem: nsfs. Targets of /proc/*/ns/* live there now. It's not mountable (not even registered, so it's not in /proc/filesystems, etc.). Files on it *are* bindable - we explicitly permit that in do_loopback(). This stuff lives in fs/nsfs.c now; proc_ns_fget() moved there as well. get_proc_ns() is a macro now (it's simply returning ->i_private; would have been an inline, if not for header ordering headache). proc_ns_inode() is an ex-parrot. The interface used in procfs is ns_get_path(path, task, ops) and ns_get_name(buf, size, task, ops). Dentries and inodes are never hashed; a non-counting reference to dentry is stashed in ns_common (removed by ->d_prune()) and reused by ns_get_path() if present. See ns_get_path()/ns_prune_dentry/nsfs_evict() for details of that mechanism. As the result, proc_ns_follow_link() has stopped poking in nd->path.mnt; it does nd_jump_link() on a consistent pair it gets from ns_get_path(). Signed-off-by: Al Viro --- fs/Makefile | 2 +- fs/internal.h | 5 ++ fs/namespace.c | 9 ++- fs/nsfs.c | 161 +++++++++++++++++++++++++++++++++++++++++++++ fs/proc/inode.c | 5 -- fs/proc/namespaces.c | 152 ++++-------------------------------------- include/linux/ns_common.h | 1 + include/linux/proc_ns.h | 31 +++++---- include/uapi/linux/magic.h | 1 + init/main.c | 2 + 10 files changed, 208 insertions(+), 161 deletions(-) create mode 100644 fs/nsfs.c (limited to 'include/uapi') diff --git a/fs/Makefile b/fs/Makefile index 34a1b9dea6dd..34393376eaa2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o fs_pin.o + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/internal.h b/fs/internal.h index 757ba2abf21e..e9a61fe67575 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -147,3 +147,8 @@ extern const struct file_operations pipefifo_fops; */ extern void sb_pin_kill(struct super_block *sb); extern void mnt_pin_kill(struct mount *m); + +/* + * fs/nsfs.c + */ +extern struct dentry_operations ns_dentry_operations; diff --git a/fs/namespace.c b/fs/namespace.c index 9dfb4cac0c41..30df6e7dd807 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1569,8 +1569,8 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { /* Is this a proxy for a mount namespace? */ - struct inode *inode = dentry->d_inode; - return proc_ns_inode(inode) && dentry->d_fsdata == &mntns_operations; + return dentry->d_op == &ns_dentry_operations && + dentry->d_fsdata == &mntns_operations; } struct mnt_namespace *to_mnt_ns(struct ns_common *ns) @@ -2016,7 +2016,10 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(parent) || !check_mnt(old)) + if (!check_mnt(parent)) + goto out2; + + if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) goto out2; if (!recurse && has_locked_children(old, old_path.dentry)) diff --git a/fs/nsfs.c b/fs/nsfs.c new file mode 100644 index 000000000000..af1b24fa899d --- /dev/null +++ b/fs/nsfs.c @@ -0,0 +1,161 @@ +#include +#include +#include +#include +#include +#include + +static struct vfsmount *nsfs_mnt; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +static void ns_prune_dentry(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + if (inode) { + struct ns_common *ns = inode->i_private; + atomic_long_set(&ns->stashed, 0); + } +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_prune = ns_prune_dentry, + .d_delete = always_delete_dentry, + .d_dname = ns_dname, +}; + +static void nsfs_evict(struct inode *inode) +{ + struct ns_common *ns = inode->i_private; + clear_inode(inode); + ns->ops->put(ns); +} + +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct vfsmount *mnt = mntget(nsfs_mnt); + struct qstr qname = { .name = "", }; + struct dentry *dentry; + struct inode *inode; + struct ns_common *ns; + unsigned long d; + +again: + ns = ns_ops->get(task); + if (!ns) { + mntput(mnt); + return ERR_PTR(-ENOENT); + } + rcu_read_lock(); + d = atomic_long_read(&ns->stashed); + if (!d) + goto slow; + dentry = (struct dentry *)d; + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto slow; + rcu_read_unlock(); + ns_ops->put(ns); +got_it: + path->mnt = mnt; + path->dentry = dentry; + return NULL; +slow: + rcu_read_unlock(); + inode = new_inode_pseudo(mnt->mnt_sb); + if (!inode) { + ns_ops->put(ns); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + inode->i_ino = ns->inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + inode->i_private = ns; + + dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); + if (!dentry) { + iput(inode); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + d_instantiate(dentry, inode); + dentry->d_fsdata = (void *)ns_ops; + d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); + if (d) { + d_delete(dentry); /* make sure ->d_prune() does nothing */ + dput(dentry); + cpu_relax(); + goto again; + } + goto got_it; +} + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + int res = -ENOENT; + ns = ns_ops->get(task); + if (ns) { + res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + ns_ops->put(ns); + } + return res; +} + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + +static const struct super_operations nsfs_ops = { + .statfs = simple_statfs, + .evict_inode = nsfs_evict, +}; +static struct dentry *nsfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, + &ns_dentry_operations, NSFS_MAGIC); +} +static struct file_system_type nsfs = { + .name = "nsfs", + .mount = nsfs_mount, + .kill_sb = kill_anon_super, +}; + +void __init nsfs_init(void) +{ + nsfs_mnt = kern_mount(&nsfs); + if (IS_ERR(nsfs_mnt)) + panic("can't set nsfs up\n"); + nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a212996e0987..57a9be9a6668 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -32,7 +32,6 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; - struct ns_common *ns; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -49,10 +48,6 @@ static void proc_evict_inode(struct inode *inode) RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } - /* Release any associated namespace */ - ns = PROC_I(inode)->ns.ns; - if (ns && ns->ops) - ns->ops->put(ns); } static struct kmem_cache * proc_inode_cachep; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 18fc1cf899de..aaaac77abad0 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -1,10 +1,6 @@ #include #include -#include #include -#include -#include -#include #include #include #include @@ -34,139 +30,45 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const struct file_operations ns_file_operations = { - .llseek = no_llseek, -}; - -static const struct inode_operations ns_inode_operations = { - .setattr = proc_setattr, -}; - -static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) -{ - struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = dentry->d_fsdata; - - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", - ns_ops->name, inode->i_ino); -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_delete = always_delete_dentry, - .d_dname = ns_dname, -}; - -static struct dentry *proc_ns_get_dentry(struct super_block *sb, - struct task_struct *task, const struct proc_ns_operations *ns_ops) -{ - struct dentry *dentry, *result; - struct inode *inode; - struct proc_inode *ei; - struct qstr qname = { .name = "", }; - struct ns_common *ns; - - ns = ns_ops->get(task); - if (!ns) - return ERR_PTR(-ENOENT); - - dentry = d_alloc_pseudo(sb, &qname); - if (!dentry) { - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - dentry->d_fsdata = (void *)ns_ops; - - inode = iget_locked(sb, ns->inum); - if (!inode) { - dput(dentry); - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - ei = PROC_I(inode); - if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns.ns_ops = ns_ops; - ei->ns.ns = ns; - unlock_new_inode(inode); - } else { - ns_ops->put(ns); - } - - d_set_d_op(dentry, &ns_dentry_operations); - result = d_instantiate_unique(dentry, inode); - if (result) { - dput(dentry); - dentry = result; - } - - return dentry; -} - static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; struct task_struct *task; struct path ns_path; void *error = ERR_PTR(-EACCES); task = get_proc_task(inode); if (!task) - goto out; + return error; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; - - ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); - if (IS_ERR(ns_path.dentry)) { - error = ERR_CAST(ns_path.dentry); - goto out_put_task; + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) + nd_jump_link(nd, &ns_path); } - - ns_path.mnt = mntget(nd->path.mnt); - nd_jump_link(nd, &ns_path); - error = NULL; - -out_put_task: put_task_struct(task); -out: return error; } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = dentry->d_inode; - struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; struct task_struct *task; - struct ns_common *ns; char name[50]; int res = -EACCES; task = get_proc_task(inode); if (!task) - goto out; - - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; + return res; - res = -ENOENT; - ns = ns_ops->get(task); - if (!ns) - goto out_put_task; - - snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns->inum); - res = readlink_copy(buffer, buflen, name); - ns_ops->put(ns); -out_put_task: + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + res = ns_get_name(name, sizeof(name), task, ns_ops); + if (res >= 0) + res = readlink_copy(buffer, buflen, name); + } put_task_struct(task); -out: return res; } @@ -268,31 +170,3 @@ const struct inode_operations proc_ns_dir_inode_operations = { .getattr = pid_getattr, .setattr = proc_setattr, }; - -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - -struct ns_common *get_proc_ns(struct inode *inode) -{ - return PROC_I(inode)->ns.ns; -} - -bool proc_ns_inode(struct inode *inode) -{ - return inode->i_fop == &ns_file_operations; -} diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index ce23cf4bbe69..85a5c8c16be9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,6 +4,7 @@ struct proc_ns_operations; struct ns_common { + atomic_long_t stashed; const struct proc_ns_operations *ops; unsigned int inum; }; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 2837ff41cfe3..42dfc615dbf8 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -4,9 +4,11 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H +#include + struct pid_namespace; struct nsproxy; -struct ns_common; +struct path; struct proc_ns_operations { const char *name; @@ -38,35 +40,38 @@ enum { extern int pid_ns_prepare_proc(struct pid_namespace *ns); extern void pid_ns_release_proc(struct pid_namespace *ns); -extern struct file *proc_ns_fget(int fd); -extern struct ns_common *get_proc_ns(struct inode *); extern int proc_alloc_inum(unsigned int *pino); extern void proc_free_inum(unsigned int inum); -extern bool proc_ns_inode(struct inode *inode); #else /* CONFIG_PROC_FS */ static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } static inline void pid_ns_release_proc(struct pid_namespace *ns) {} -static inline struct file *proc_ns_fget(int fd) -{ - return ERR_PTR(-EINVAL); -} - -static inline struct ns_common *get_proc_ns(struct inode *inode) { return NULL; } - static inline int proc_alloc_inum(unsigned int *inum) { *inum = 1; return 0; } static inline void proc_free_inum(unsigned int inum) {} -static inline bool proc_ns_inode(struct inode *inode) { return false; } #endif /* CONFIG_PROC_FS */ -#define ns_alloc_inum(ns) proc_alloc_inum(&(ns)->inum) +static inline int ns_alloc_inum(struct ns_common *ns) +{ + atomic_long_set(&ns->stashed, 0); + return proc_alloc_inum(&ns->inum); +} + #define ns_free_inum(ns) proc_free_inum((ns)->inum) +extern struct file *proc_ns_fget(int fd); +#define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) +extern void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); + +extern int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +extern void nsfs_init(void); + #endif /* _LINUX_PROC_NS_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 77c60311a6c6..7d664ea85ebd 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -72,5 +72,6 @@ #define MTD_INODE_FS_MAGIC 0x11307854 #define ANON_INODE_FS_MAGIC 0x09041934 #define BTRFS_TEST_MAGIC 0x73727279 +#define NSFS_MAGIC 0x6e736673 #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/main.c b/init/main.c index 800a0daede7e..bcc75057ea87 100644 --- a/init/main.c +++ b/init/main.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include @@ -660,6 +661,7 @@ asmlinkage __visible void __init start_kernel(void) /* rootfs populating might need page-writeback */ page_writeback_init(); proc_root_init(); + nsfs_init(); cgroup_init(); cpuset_init(); taskstats_init_early(); -- cgit v1.2.3 From 63f13448d81c910a284b096149411a719cbed501 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 9 Dec 2014 15:37:07 -0500 Subject: powerpc: add little endian flag to syscall_get_arch() Since both ppc and ppc64 have LE variants which are now reported by uname, add that flag (__AUDIT_ARCH_LE) to syscall_get_arch() and add AUDIT_ARCH_PPC64LE variant. Without this, perf trace and auditctl fail. Mainline kernel reports ppc64le (per a058801) but there is no matching AUDIT_ARCH_PPC64LE. Since 32-bit PPC LE is not supported by audit, don't advertise it in AUDIT_ARCH_PPC* variants. See: https://www.redhat.com/archives/linux-audit/2014-August/msg00082.html https://www.redhat.com/archives/linux-audit/2014-December/msg00004.html Signed-off-by: Richard Guy Briggs Acked-by: Paul Moore Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/syscall.h | 6 +++++- include/uapi/linux/audit.h | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 6240698fee9a..ff21b7a2f0cc 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,6 +90,10 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(void) { - return is_32bit_task() ? AUDIT_ARCH_PPC : AUDIT_ARCH_PPC64; + int arch = is_32bit_task() ? AUDIT_ARCH_PPC : AUDIT_ARCH_PPC64; +#ifdef __LITTLE_ENDIAN__ + arch |= __AUDIT_ARCH_LE; +#endif + return arch; } #endif /* _ASM_SYSCALL_H */ diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index d4dbef14d4df..584bb0113e25 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -365,7 +365,9 @@ enum { #define AUDIT_ARCH_PARISC (EM_PARISC) #define AUDIT_ARCH_PARISC64 (EM_PARISC|__AUDIT_ARCH_64BIT) #define AUDIT_ARCH_PPC (EM_PPC) +/* do not define AUDIT_ARCH_PPCLE since it is not supported by audit */ #define AUDIT_ARCH_PPC64 (EM_PPC64|__AUDIT_ARCH_64BIT) +#define AUDIT_ARCH_PPC64LE (EM_PPC64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) #define AUDIT_ARCH_S390 (EM_S390) #define AUDIT_ARCH_S390X (EM_S390|__AUDIT_ARCH_64BIT) #define AUDIT_ARCH_SH (EM_SH) -- cgit v1.2.3 From 51f39a1f0cea1cacf8c787f652f26dfee9611874 Mon Sep 17 00:00:00 2001 From: David Drysdale Date: Fri, 12 Dec 2014 16:57:29 -0800 Subject: syscalls: implement execveat() system call This patchset adds execveat(2) for x86, and is derived from Meredydd Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528). The primary aim of adding an execveat syscall is to allow an implementation of fexecve(3) that does not rely on the /proc filesystem, at least for executables (rather than scripts). The current glibc version of fexecve(3) is implemented via /proc, which causes problems in sandboxed or otherwise restricted environments. Given the desire for a /proc-free fexecve() implementation, HPA suggested (https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be an appropriate generalization. Also, having a new syscall means that it can take a flags argument without back-compatibility concerns. The current implementation just defines the AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be added in future -- for example, flags for new namespaces (as suggested at https://lkml.org/lkml/2006/7/11/474). Related history: - https://lkml.org/lkml/2006/12/27/123 is an example of someone realizing that fexecve() is likely to fail in a chroot environment. - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered documenting the /proc requirement of fexecve(3) in its manpage, to "prevent other people from wasting their time". - https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a problem where a process that did setuid() could not fexecve() because it no longer had access to /proc/self/fd; this has since been fixed. This patch (of 4): Add a new execveat(2) system call. execveat() is to execve() as openat() is to open(): it takes a file descriptor that refers to a directory, and resolves the filename relative to that. In addition, if the filename is empty and AT_EMPTY_PATH is specified, execveat() executes the file to which the file descriptor refers. This replicates the functionality of fexecve(), which is a system call in other UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/" (and so relies on /proc being mounted). The filename fed to the executed program as argv[0] (or the name of the script fed to a script interpreter) will be of the form "/dev/fd/" (for an empty filename) or "/dev/fd//", effectively reflecting how the executable was found. This does however mean that execution of a script in a /proc-less environment won't work; also, script execution via an O_CLOEXEC file descriptor fails (as the file will not be accessible after exec). Based on patches by Meredydd Luff. Signed-off-by: David Drysdale Cc: Meredydd Luff Cc: Shuah Khan Cc: "Eric W. Biederman" Cc: Andy Lutomirski Cc: Alexander Viro Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Kees Cook Cc: Arnd Bergmann Cc: Rich Felker Cc: Christoph Hellwig Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_em86.c | 4 ++ fs/binfmt_misc.c | 4 ++ fs/binfmt_script.c | 10 ++++ fs/exec.c | 113 +++++++++++++++++++++++++++++++++----- fs/namei.c | 2 +- include/linux/binfmts.h | 4 ++ include/linux/compat.h | 3 + include/linux/fs.h | 1 + include/linux/sched.h | 4 ++ include/linux/syscalls.h | 5 ++ include/uapi/asm-generic/unistd.h | 4 +- kernel/sys_ni.c | 3 + lib/audit.c | 3 + 13 files changed, 145 insertions(+), 15 deletions(-) (limited to 'include/uapi') diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f37b08cea1f7..490538536cb4 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm) return -ENOEXEC; } + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 70789e198dea..c04ef1d4f18a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -144,6 +144,10 @@ static int load_misc_binary(struct linux_binprm *bprm) if (!fmt) goto ret; + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); if (retval) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 5027a3e14922..afdf4e3cafc2 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm) if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; + + /* + * If the script filename will be inaccessible after exec, typically + * because it is a "/dev/fd//.." path against an O_CLOEXEC fd, give + * up now (on the assumption that the interpreter will want to load + * this file). + */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT diff --git a/fs/exec.c b/fs/exec.c index 01aebe300200..ad8798e26be9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -static struct file *do_open_exec(struct filename *name) +static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; - static const struct open_flags open_exec_flags = { + struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, name, &open_exec_flags); + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return ERR_PTR(-EINVAL); + if (flags & AT_SYMLINK_NOFOLLOW) + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + open_exec_flags.lookup_flags |= LOOKUP_EMPTY; + + file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -770,12 +777,13 @@ static struct file *do_open_exec(struct filename *name) if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - fsnotify_open(file); - err = deny_write_access(file); if (err) goto exit; + if (name->name[0] != '\0') + fsnotify_open(file); + out: return file; @@ -787,7 +795,7 @@ exit: struct file *open_exec(const char *name) { struct filename tmp = { .name = name }; - return do_open_exec(&tmp); + return do_open_execat(AT_FDCWD, &tmp, 0); } EXPORT_SYMBOL(open_exec); @@ -1428,10 +1436,12 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp) +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) { + char *pathbuf = NULL; struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; @@ -1472,7 +1482,7 @@ static int do_execve_common(struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_exec(filename); + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1480,7 +1490,28 @@ static int do_execve_common(struct filename *filename, sched_exec(); bprm->file = file; - bprm->filename = bprm->interp = filename->name; + if (fd == AT_FDCWD || filename->name[0] == '/') { + bprm->filename = filename->name; + } else { + if (filename->name[0] == '\0') + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + else + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + fd, filename->name); + if (!pathbuf) { + retval = -ENOMEM; + goto out_unmark; + } + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. Relies on having exclusive access to + * current->files (due to unshare_files above). + */ + if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = pathbuf; + } + bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) @@ -1521,6 +1552,7 @@ static int do_execve_common(struct filename *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + kfree(pathbuf); putname(filename); if (displaced) put_files_struct(displaced); @@ -1538,6 +1570,7 @@ out_unmark: out_free: free_bprm(bprm); + kfree(pathbuf); out_files: if (displaced) @@ -1553,7 +1586,18 @@ int do_execve(struct filename *filename, { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +int do_execveat(int fd, struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT @@ -1569,7 +1613,23 @@ static int compat_do_execve(struct filename *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int compat_do_execveat(int fd, struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(fd, filename, argv, envp, flags); } #endif @@ -1609,6 +1669,20 @@ SYSCALL_DEFINE3(execve, { return do_execve(getname(filename), argv, envp); } + +SYSCALL_DEFINE5(execveat, + int, fd, const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, @@ -1616,4 +1690,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, { return compat_do_execve(getname(filename), argv, envp); } + +COMPAT_SYSCALL_DEFINE5(execveat, int, fd, + const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return compat_do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} #endif diff --git a/fs/namei.c b/fs/namei.c index db5fe86319e6..ca814165d84c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -130,7 +130,7 @@ void final_putname(struct filename *name) #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) -static struct filename * +struct filename * getname_flags(const char __user *filename, int flags, int *empty) { struct filename *result, *err; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 61f29e5ea840..576e4639ca60 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -53,6 +53,10 @@ struct linux_binprm { #define BINPRM_FLAGS_EXECFD_BIT 1 #define BINPRM_FLAGS_EXECFD (1 << BINPRM_FLAGS_EXECFD_BIT) +/* filename of the binary will be inaccessible after exec */ +#define BINPRM_FLAGS_PATH_INACCESSIBLE_BIT 2 +#define BINPRM_FLAGS_PATH_INACCESSIBLE (1 << BINPRM_FLAGS_PATH_INACCESSIBLE_BIT) + /* Function parameter for binfmt->coredump */ struct coredump_params { const siginfo_t *siginfo; diff --git a/include/linux/compat.h b/include/linux/compat.h index e6494261eaff..7450ca2ac1fc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -357,6 +357,9 @@ asmlinkage long compat_sys_lseek(unsigned int, compat_off_t, unsigned int); asmlinkage long compat_sys_execve(const char __user *filename, const compat_uptr_t __user *argv, const compat_uptr_t __user *envp); +asmlinkage long compat_sys_execveat(int dfd, const char __user *filename, + const compat_uptr_t __user *argv, + const compat_uptr_t __user *envp, int flags); asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, diff --git a/include/linux/fs.h b/include/linux/fs.h index 1d1838de6882..4193a0bd99b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2096,6 +2096,7 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); +extern struct filename *getname_flags(const char __user *, int, int *); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cfdbcf8cf56..8db31ef98d2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2485,6 +2485,10 @@ extern void do_group_exit(int); extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); +extern int do_execveat(int, struct filename *, + const char __user * const __user *, + const char __user * const __user *, + int); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c9afdc7a7f84..85893d744901 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -877,4 +877,9 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); + +asmlinkage long sys_execveat(int dfd, const char __user *filename, + const char __user *const __user *argv, + const char __user *const __user *envp, int flags); + #endif diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 22749c134117..e016bd9b1a04 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -707,9 +707,11 @@ __SYSCALL(__NR_getrandom, sys_getrandom) __SYSCALL(__NR_memfd_create, sys_memfd_create) #define __NR_bpf 280 __SYSCALL(__NR_bpf, sys_bpf) +#define __NR_execveat 281 +__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat) #undef __NR_syscalls -#define __NR_syscalls 281 +#define __NR_syscalls 282 /* * All syscalls below here should go away really, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 61eea02b53f5..5adcb0ae3a58 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -226,3 +226,6 @@ cond_syscall(sys_seccomp); /* access BPF programs and maps */ cond_syscall(sys_bpf); + +/* execveat */ +cond_syscall(sys_execveat); diff --git a/lib/audit.c b/lib/audit.c index 1d726a22565b..b8fb5ee81e26 100644 --- a/lib/audit.c +++ b/lib/audit.c @@ -53,6 +53,9 @@ int audit_classify_syscall(int abi, unsigned syscall) #ifdef __NR_socketcall case __NR_socketcall: return 4; +#endif +#ifdef __NR_execveat + case __NR_execveat: #endif case __NR_execve: return 5; -- cgit v1.2.3 From e843e7d2c88b7db107a86bd2c7145dc715c058f4 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 12 Dec 2014 16:58:14 -0800 Subject: ipc/sem.c: increase SEMMSL, SEMMNI, SEMOPM a) SysV can be abused to allocate locked kernel memory. For most systems, a small limit doesn't make sense, see the discussion with regards to SHMMAX. Therefore: Increase the sysv sem limits so that all known applications will work with these defaults. b) With regards to the maximum supported: Some of the specified hard limits are not correct anymore, therefore the patch updates the documentation. - SEMMNI must stay below IPCMNI, which is 32768. As for SHMMAX: Stay a bit below this limit. - SEMMSL was limited to 8k, to ensure that the kmalloc for the kernel array was limited to 16 kB (order=2) This doesn't apply anymore: - the allocation size isn't sizeof(short)*nsems anymore. - ipc_alloc falls back to vmalloc - SEMOPM should stay below 1000, to limit the kmalloc in semtimedop() to an order=1 allocation. Therefore: Leave it at 500 (order=0 allocation). Note: If an administrator must limit the memory allocations, then he can set the values as necessary. Or he can disable sysv entirely (as e.g. done by Android). Signed-off-by: Manfred Spraul Cc: Davidlohr Bueso Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/sem.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sem.h b/include/uapi/linux/sem.h index 541fce03b50c..dd73b908b2f3 100644 --- a/include/uapi/linux/sem.h +++ b/include/uapi/linux/sem.h @@ -63,10 +63,22 @@ struct seminfo { int semaem; }; -#define SEMMNI 128 /* <= IPCMNI max # of semaphore identifiers */ -#define SEMMSL 250 /* <= 8 000 max num of semaphores per id */ +/* + * SEMMNI, SEMMSL and SEMMNS are default values which can be + * modified by sysctl. + * The values has been chosen to be larger than necessary for any + * known configuration. + * + * SEMOPM should not be increased beyond 1000, otherwise there is the + * risk that semop()/semtimedop() fails due to kernel memory fragmentation when + * allocating the sop array. + */ + + +#define SEMMNI 32000 /* <= IPCMNI max # of semaphore identifiers */ +#define SEMMSL 32000 /* <= INT_MAX max num of semaphores per id */ #define SEMMNS (SEMMNI*SEMMSL) /* <= INT_MAX max # of semaphores in system */ -#define SEMOPM 32 /* <= 1 000 max num of ops per semop call */ +#define SEMOPM 500 /* <= 1 000 max num of ops per semop call */ #define SEMVMX 32767 /* <= 32767 semaphore maximum value */ #define SEMAEM SEMVMX /* adjust on exit max value */ -- cgit v1.2.3 From 0050ee059f7fc86b1df2527aaa14ed5dc72f9973 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 12 Dec 2014 16:58:17 -0800 Subject: ipc/msg: increase MSGMNI, remove scaling SysV can be abused to allocate locked kernel memory. For most systems, a small limit doesn't make sense, see the discussion with regards to SHMMAX. Therefore: increase MSGMNI to the maximum supported. And: If we ignore the risk of locking too much memory, then an automatic scaling of MSGMNI doesn't make sense. Therefore the logic can be removed. The code preserves auto_msgmni to avoid breaking any user space applications that expect that the value exists. Notes: 1) If an administrator must limit the memory allocations, then he can set MSGMNI as necessary. Or he can disable sysv entirely (as e.g. done by Android). 2) MSGMAX and MSGMNB are intentionally not increased, as these values are used to control latency vs. throughput: If MSGMNB is large, then msgsnd() just returns and more messages can be queued before a task switch to a task that calls msgrcv() is forced. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Cc: Davidlohr Bueso Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 10 +++-- include/linux/ipc_namespace.h | 20 --------- include/uapi/linux/msg.h | 28 +++++++++---- ipc/Makefile | 2 +- ipc/ipc_sysctl.c | 93 ++++++++--------------------------------- ipc/ipcns_notifier.c | 92 ---------------------------------------- ipc/msg.c | 36 +--------------- ipc/namespace.c | 22 ---------- ipc/util.c | 40 ------------------ 9 files changed, 45 insertions(+), 298 deletions(-) delete mode 100644 ipc/ipcns_notifier.c (limited to 'include/uapi') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index b5d0c8501a18..75511efefc64 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -116,10 +116,12 @@ set during run time. auto_msgmni: -Enables/Disables automatic recomputing of msgmni upon memory add/remove -or upon ipc namespace creation/removal (see the msgmni description -above). Echoing "1" into this file enables msgmni automatic recomputing. -Echoing "0" turns it off. auto_msgmni default value is 1. +This variable has no effect and may be removed in future kernel +releases. Reading it always returns 0. +Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni +upon memory add/remove or upon ipc namespace creation/removal. +Echoing "1" into this file enabled msgmni automatic recomputing. +Echoing "0" turned it off. auto_msgmni default value was 1. ============================================================== diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 35e7eca4e33b..e365d5ec69cb 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -7,15 +7,6 @@ #include #include -/* - * ipc namespace events - */ -#define IPCNS_MEMCHANGED 0x00000001 /* Notify lowmem size changed */ -#define IPCNS_CREATED 0x00000002 /* Notify new ipc namespace created */ -#define IPCNS_REMOVED 0x00000003 /* Notify ipc namespace removed */ - -#define IPCNS_CALLBACK_PRI 0 - struct user_namespace; struct ipc_ids { @@ -38,7 +29,6 @@ struct ipc_namespace { unsigned int msg_ctlmni; atomic_t msg_bytes; atomic_t msg_hdrs; - int auto_msgmni; size_t shm_ctlmax; size_t shm_ctlall; @@ -77,18 +67,8 @@ extern atomic_t nr_ipc_ns; extern spinlock_t mq_lock; #ifdef CONFIG_SYSVIPC -extern int register_ipcns_notifier(struct ipc_namespace *); -extern int cond_register_ipcns_notifier(struct ipc_namespace *); -extern void unregister_ipcns_notifier(struct ipc_namespace *); -extern int ipcns_notify(unsigned long); extern void shm_destroy_orphaned(struct ipc_namespace *ns); #else /* CONFIG_SYSVIPC */ -static inline int register_ipcns_notifier(struct ipc_namespace *ns) -{ return 0; } -static inline int cond_register_ipcns_notifier(struct ipc_namespace *ns) -{ return 0; } -static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { } -static inline int ipcns_notify(unsigned long l) { return 0; } static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #endif /* CONFIG_SYSVIPC */ diff --git a/include/uapi/linux/msg.h b/include/uapi/linux/msg.h index a70375526578..f51c8001dbe5 100644 --- a/include/uapi/linux/msg.h +++ b/include/uapi/linux/msg.h @@ -51,16 +51,28 @@ struct msginfo { }; /* - * Scaling factor to compute msgmni: - * the memory dedicated to msg queues (msgmni * msgmnb) should occupy - * at most 1/MSG_MEM_SCALE of the lowmem (see the formula in ipc/msg.c): - * up to 8MB : msgmni = 16 (MSGMNI) - * 4 GB : msgmni = 8K - * more than 16 GB : msgmni = 32K (IPCMNI) + * MSGMNI, MSGMAX and MSGMNB are default values which can be + * modified by sysctl. + * + * MSGMNI is the upper limit for the number of messages queues per + * namespace. + * It has been chosen to be as large possible without facilitating + * scenarios where userspace causes overflows when adjusting the limits via + * operations of the form retrieve current limit; add X; update limit". + * + * MSGMNB is the default size of a new message queue. Non-root tasks can + * decrease the size with msgctl(IPC_SET), root tasks + * (actually: CAP_SYS_RESOURCE) can both increase and decrease the queue + * size. The optimal value is application dependent. + * 16384 is used because it was always used (since 0.99.10) + * + * MAXMAX is the maximum size of an individual message, it's a global + * (per-namespace) limit that applies for all message queues. + * It's set to 1/2 of MSGMNB, to ensure that at least two messages fit into + * the queue. This is also an arbitrary choice (since 2.6.0). */ -#define MSG_MEM_SCALE 32 -#define MSGMNI 16 /* <= IPCMNI */ /* max # of msg queue identifiers */ +#define MSGMNI 32000 /* <= IPCMNI */ /* max # of msg queue identifiers */ #define MSGMAX 8192 /* <= INT_MAX */ /* max size of message (bytes) */ #define MSGMNB 16384 /* <= INT_MAX */ /* default max size of a message queue */ diff --git a/ipc/Makefile b/ipc/Makefile index 9075e172e52c..86c7300ecdf5 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o -obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o +obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o syscall.o obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o obj_mq-$(CONFIG_COMPAT) += compat_mq.o obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index e8075b247497..8ad93c29f511 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -62,29 +62,6 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, return err; } -static int proc_ipc_callback_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - size_t lenp_bef = *lenp; - int rc; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); - - if (write && !rc && lenp_bef == *lenp) - /* - * Tunable has successfully been changed by hand. Disable its - * automatic adjustment. This simply requires unregistering - * the notifiers that trigger recalculation. - */ - unregister_ipcns_notifier(current->nsproxy->ipc_ns); - - return rc; -} - static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -96,54 +73,19 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, lenp, ppos); } -/* - * Routine that is called when the file "auto_msgmni" has successfully been - * written. - * Two values are allowed: - * 0: unregister msgmni's callback routine from the ipc namespace notifier - * chain. This means that msgmni won't be recomputed anymore upon memory - * add/remove or ipc namespace creation/removal. - * 1: register back the callback routine. - */ -static void ipc_auto_callback(int val) -{ - if (!val) - unregister_ipcns_notifier(current->nsproxy->ipc_ns); - else { - /* - * Re-enable automatic recomputing only if not already - * enabled. - */ - recompute_msgmni(current->nsproxy->ipc_ns); - cond_register_ipcns_notifier(current->nsproxy->ipc_ns); - } -} - -static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, +static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; - int oldval; - int rc; + int dummy = 0; memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - oldval = *((int *)(ipc_table.data)); + ipc_table.data = &dummy; - rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); + if (write) + pr_info_once("writing to auto_msgmni has no effect"); - if (write && !rc) { - int newval = *((int *)(ipc_table.data)); - /* - * The file "auto_msgmni" has correctly been set. - * React by (un)registering the corresponding tunable, if the - * value has changed. - */ - if (newval != oldval) - ipc_auto_callback(newval); - } - - return rc; + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } #else @@ -151,8 +93,7 @@ static int proc_ipcauto_dointvec_minmax(struct ctl_table *table, int write, #define proc_ipc_dointvec NULL #define proc_ipc_dointvec_minmax NULL #define proc_ipc_dointvec_minmax_orphans NULL -#define proc_ipc_callback_dointvec_minmax NULL -#define proc_ipcauto_dointvec_minmax NULL +#define proc_ipc_auto_msgmni NULL #endif static int zero; @@ -204,10 +145,19 @@ static struct ctl_table ipc_kern_table[] = { .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_callback_dointvec_minmax, + .proc_handler = proc_ipc_dointvec_minmax, .extra1 = &zero, .extra2 = &int_max, }, + { + .procname = "auto_msgmni", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_ipc_auto_msgmni, + .extra1 = &zero, + .extra2 = &one, + }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, @@ -224,15 +174,6 @@ static struct ctl_table ipc_kern_table[] = { .mode = 0644, .proc_handler = proc_ipc_dointvec, }, - { - .procname = "auto_msgmni", - .data = &init_ipc_ns.auto_msgmni, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_ipcauto_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #ifdef CONFIG_CHECKPOINT_RESTORE { .procname = "sem_next_id", diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c deleted file mode 100644 index b9b31a4f77e1..000000000000 --- a/ipc/ipcns_notifier.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * linux/ipc/ipcns_notifier.c - * Copyright (C) 2007 BULL SA. Nadia Derbey - * - * Notification mechanism for ipc namespaces: - * The callback routine registered in the memory chain invokes the ipcns - * notifier chain with the IPCNS_MEMCHANGED event. - * Each callback routine registered in the ipcns namespace recomputes msgmni - * for the owning namespace. - */ - -#include -#include -#include -#include -#include - -#include "util.h" - - - -static BLOCKING_NOTIFIER_HEAD(ipcns_chain); - - -static int ipcns_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct ipc_namespace *ns; - - switch (action) { - case IPCNS_MEMCHANGED: /* amount of lowmem has changed */ - case IPCNS_CREATED: - case IPCNS_REMOVED: - /* - * It's time to recompute msgmni - */ - ns = container_of(self, struct ipc_namespace, ipcns_nb); - /* - * No need to get a reference on the ns: the 1st job of - * free_ipc_ns() is to unregister the callback routine. - * blocking_notifier_chain_unregister takes the wr lock to do - * it. - * When this callback routine is called the rd lock is held by - * blocking_notifier_call_chain. - * So the ipc ns cannot be freed while we are here. - */ - recompute_msgmni(ns); - break; - default: - break; - } - - return NOTIFY_OK; -} - -int register_ipcns_notifier(struct ipc_namespace *ns) -{ - int rc; - - memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); - ns->ipcns_nb.notifier_call = ipcns_callback; - ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; - rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); - if (!rc) - ns->auto_msgmni = 1; - return rc; -} - -int cond_register_ipcns_notifier(struct ipc_namespace *ns) -{ - int rc; - - memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); - ns->ipcns_nb.notifier_call = ipcns_callback; - ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; - rc = blocking_notifier_chain_cond_register(&ipcns_chain, - &ns->ipcns_nb); - if (!rc) - ns->auto_msgmni = 1; - return rc; -} - -void unregister_ipcns_notifier(struct ipc_namespace *ns) -{ - blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb); - ns->auto_msgmni = 0; -} - -int ipcns_notify(unsigned long val) -{ - return blocking_notifier_call_chain(&ipcns_chain, val, NULL); -} diff --git a/ipc/msg.c b/ipc/msg.c index c5d8e3749985..a7261d5cbc89 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -989,43 +989,12 @@ SYSCALL_DEFINE5(msgrcv, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, return do_msgrcv(msqid, msgp, msgsz, msgtyp, msgflg, do_msg_fill); } -/* - * Scale msgmni with the available lowmem size: the memory dedicated to msg - * queues should occupy at most 1/MSG_MEM_SCALE of lowmem. - * Also take into account the number of nsproxies created so far. - * This should be done staying within the (MSGMNI , IPCMNI/nr_ipc_ns) range. - */ -void recompute_msgmni(struct ipc_namespace *ns) -{ - struct sysinfo i; - unsigned long allowed; - int nb_ns; - - si_meminfo(&i); - allowed = (((i.totalram - i.totalhigh) / MSG_MEM_SCALE) * i.mem_unit) - / MSGMNB; - nb_ns = atomic_read(&nr_ipc_ns); - allowed /= nb_ns; - - if (allowed < MSGMNI) { - ns->msg_ctlmni = MSGMNI; - return; - } - - if (allowed > IPCMNI / nb_ns) { - ns->msg_ctlmni = IPCMNI / nb_ns; - return; - } - - ns->msg_ctlmni = allowed; -} void msg_init_ns(struct ipc_namespace *ns) { ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; - - recompute_msgmni(ns); + ns->msg_ctlmni = MSGMNI; atomic_set(&ns->msg_bytes, 0); atomic_set(&ns->msg_hdrs, 0); @@ -1069,9 +1038,6 @@ void __init msg_init(void) { msg_init_ns(&init_ipc_ns); - printk(KERN_INFO "msgmni has been set to %d\n", - init_ipc_ns.msg_ctlmni); - ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); diff --git a/ipc/namespace.c b/ipc/namespace.c index b54468e48e32..1a3ffd40356e 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -45,14 +45,6 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, msg_init_ns(ns); shm_init_ns(ns); - /* - * msgmni has already been computed for the new ipc ns. - * Thus, do the ipcns creation notification before registering that - * new ipcns in the chain. - */ - ipcns_notify(IPCNS_CREATED); - register_ipcns_notifier(ns); - ns->user_ns = get_user_ns(user_ns); return ns; @@ -99,25 +91,11 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { - /* - * Unregistering the hotplug notifier at the beginning guarantees - * that the ipc namespace won't be freed while we are inside the - * callback routine. Since the blocking_notifier_chain_XXX routines - * hold a rw lock on the notifier list, unregister_ipcns_notifier() - * won't take the rw lock before blocking_notifier_call_chain() has - * released the rd lock. - */ - unregister_ipcns_notifier(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); atomic_dec(&nr_ipc_ns); - /* - * Do the ipcns removal notification after decrementing nr_ipc_ns in - * order to have a correct value when recomputing msgmni. - */ - ipcns_notify(IPCNS_REMOVED); put_user_ns(ns->user_ns); proc_free_inum(ns->proc_inum); kfree(ns); diff --git a/ipc/util.c b/ipc/util.c index 88adc329888c..106bed0378ab 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -71,44 +71,6 @@ struct ipc_proc_iface { int (*show)(struct seq_file *, void *); }; -static void ipc_memory_notifier(struct work_struct *work) -{ - ipcns_notify(IPCNS_MEMCHANGED); -} - -static int ipc_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - static DECLARE_WORK(ipc_memory_wq, ipc_memory_notifier); - - switch (action) { - case MEM_ONLINE: /* memory successfully brought online */ - case MEM_OFFLINE: /* or offline: it's time to recompute msgmni */ - /* - * This is done by invoking the ipcns notifier chain with the - * IPC_MEMCHANGED event. - * In order not to keep the lock on the hotplug memory chain - * for too long, queue a work item that will, when waken up, - * activate the ipcns notification chain. - */ - schedule_work(&ipc_memory_wq); - break; - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block ipc_memory_nb = { - .notifier_call = ipc_memory_callback, - .priority = IPC_CALLBACK_PRI, -}; - /** * ipc_init - initialise ipc subsystem * @@ -124,8 +86,6 @@ static int __init ipc_init(void) sem_init(); msg_init(); shm_init(); - register_hotmemory_notifier(&ipc_memory_nb); - register_ipcns_notifier(&init_ipc_ns); return 0; } device_initcall(ipc_init); -- cgit v1.2.3 From 0dce3771fdd1db03c6f498594a34bba5a6d1f870 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 11 Dec 2014 13:46:52 +0200 Subject: virtio_pci: add VIRTIO_PCI_NO_LEGACY Add macro to disable all legacy register defines. Helpful to make sure legacy macros don't leak through into modern code. Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_pci.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index e5ec1caab82a..35b552c7f330 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -41,6 +41,8 @@ #include +#ifndef VIRTIO_PCI_NO_LEGACY + /* A 32-bit r/o bitmask of the features supported by the host */ #define VIRTIO_PCI_HOST_FEATURES 0 @@ -67,16 +69,11 @@ * a read-and-acknowledge. */ #define VIRTIO_PCI_ISR 19 -/* The bit of the ISR which indicates a device configuration change. */ -#define VIRTIO_PCI_ISR_CONFIG 0x2 - /* MSI-X registers: only enabled if MSI-X is enabled. */ /* A 16-bit vector for configuration changes. */ #define VIRTIO_MSI_CONFIG_VECTOR 20 /* A 16-bit vector for selected queue notifications. */ #define VIRTIO_MSI_QUEUE_VECTOR 22 -/* Vector value used to disable MSI for queue */ -#define VIRTIO_MSI_NO_VECTOR 0xffff /* The remaining space is defined by each driver as the per-driver * configuration space */ @@ -94,4 +91,12 @@ /* The alignment to use between consumer and producer parts of vring. * x86 pagesize again. */ #define VIRTIO_PCI_VRING_ALIGN 4096 + +#endif /* VIRTIO_PCI_NO_LEGACY */ + +/* The bit of the ISR which indicates a device configuration change. */ +#define VIRTIO_PCI_ISR_CONFIG 0x2 +/* Vector value used to disable MSI for queue */ +#define VIRTIO_MSI_NO_VECTOR 0xffff + #endif -- cgit v1.2.3 From 5a77abf9a97a7ecc8fb0f6bf4ad411fb12b02f31 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 11 Dec 2014 17:04:15 +0200 Subject: IB/core: Add support for extended query device caps Add extensible query device capabilities verb to allow adding new features. ib_uverbs_ex_query_device is added and copy_query_dev_fields is used to copy capability fields to be used by both ib_uverbs_query_device and ib_uverbs_ex_query_device. Signed-off-by: Eli Cohen Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 124 +++++++++++++++++++++++----------- drivers/infiniband/core/uverbs_main.c | 3 +- include/rdma/ib_verbs.h | 5 +- include/uapi/rdma/ib_user_verbs.h | 14 +++- 5 files changed, 103 insertions(+), 44 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 643c08a025a5..b716b0815644 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); +IB_UVERBS_DECLARE_EX_CMD(query_device); #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5ba2a86aab6a..c7a43624c96b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -378,6 +378,52 @@ err: return ret; } +static void copy_query_dev_fields(struct ib_uverbs_file *file, + struct ib_uverbs_query_device_resp *resp, + struct ib_device_attr *attr) +{ + resp->fw_ver = attr->fw_ver; + resp->node_guid = file->device->ib_dev->node_guid; + resp->sys_image_guid = attr->sys_image_guid; + resp->max_mr_size = attr->max_mr_size; + resp->page_size_cap = attr->page_size_cap; + resp->vendor_id = attr->vendor_id; + resp->vendor_part_id = attr->vendor_part_id; + resp->hw_ver = attr->hw_ver; + resp->max_qp = attr->max_qp; + resp->max_qp_wr = attr->max_qp_wr; + resp->device_cap_flags = attr->device_cap_flags; + resp->max_sge = attr->max_sge; + resp->max_sge_rd = attr->max_sge_rd; + resp->max_cq = attr->max_cq; + resp->max_cqe = attr->max_cqe; + resp->max_mr = attr->max_mr; + resp->max_pd = attr->max_pd; + resp->max_qp_rd_atom = attr->max_qp_rd_atom; + resp->max_ee_rd_atom = attr->max_ee_rd_atom; + resp->max_res_rd_atom = attr->max_res_rd_atom; + resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; + resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; + resp->atomic_cap = attr->atomic_cap; + resp->max_ee = attr->max_ee; + resp->max_rdd = attr->max_rdd; + resp->max_mw = attr->max_mw; + resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; + resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; + resp->max_mcast_grp = attr->max_mcast_grp; + resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; + resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; + resp->max_ah = attr->max_ah; + resp->max_fmr = attr->max_fmr; + resp->max_map_per_fmr = attr->max_map_per_fmr; + resp->max_srq = attr->max_srq; + resp->max_srq_wr = attr->max_srq_wr; + resp->max_srq_sge = attr->max_srq_sge; + resp->max_pkeys = attr->max_pkeys; + resp->local_ca_ack_delay = attr->local_ca_ack_delay; + resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; +} + ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -398,47 +444,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, return ret; memset(&resp, 0, sizeof resp); - - resp.fw_ver = attr.fw_ver; - resp.node_guid = file->device->ib_dev->node_guid; - resp.sys_image_guid = attr.sys_image_guid; - resp.max_mr_size = attr.max_mr_size; - resp.page_size_cap = attr.page_size_cap; - resp.vendor_id = attr.vendor_id; - resp.vendor_part_id = attr.vendor_part_id; - resp.hw_ver = attr.hw_ver; - resp.max_qp = attr.max_qp; - resp.max_qp_wr = attr.max_qp_wr; - resp.device_cap_flags = attr.device_cap_flags; - resp.max_sge = attr.max_sge; - resp.max_sge_rd = attr.max_sge_rd; - resp.max_cq = attr.max_cq; - resp.max_cqe = attr.max_cqe; - resp.max_mr = attr.max_mr; - resp.max_pd = attr.max_pd; - resp.max_qp_rd_atom = attr.max_qp_rd_atom; - resp.max_ee_rd_atom = attr.max_ee_rd_atom; - resp.max_res_rd_atom = attr.max_res_rd_atom; - resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; - resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; - resp.atomic_cap = attr.atomic_cap; - resp.max_ee = attr.max_ee; - resp.max_rdd = attr.max_rdd; - resp.max_mw = attr.max_mw; - resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; - resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; - resp.max_mcast_grp = attr.max_mcast_grp; - resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; - resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; - resp.max_ah = attr.max_ah; - resp.max_fmr = attr.max_fmr; - resp.max_map_per_fmr = attr.max_map_per_fmr; - resp.max_srq = attr.max_srq; - resp.max_srq_wr = attr.max_srq_wr; - resp.max_srq_sge = attr.max_srq_sge; - resp.max_pkeys = attr.max_pkeys; - resp.local_ca_ack_delay = attr.local_ca_ack_delay; - resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; + copy_query_dev_fields(file, &resp, &attr); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -3253,3 +3259,39 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return ret ? ret : in_len; } + +int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ + struct ib_uverbs_ex_query_device_resp resp; + struct ib_uverbs_ex_query_device cmd; + struct ib_device_attr attr; + struct ib_device *device; + int err; + + device = file->device->ib_dev; + if (ucore->inlen < sizeof(cmd)) + return -EINVAL; + + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + if (cmd.reserved) + return -EINVAL; + + err = device->query_device(device, &attr); + if (err) + return err; + + memset(&resp, 0, sizeof(resp)); + copy_query_dev_fields(file, &resp.base, &attr); + resp.comp_mask = 0; + + err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); + if (err) + return err; + + return 0; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 71ab83fde472..974025028790 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, - [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, + [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device }; static void ib_uverbs_add_one(struct ib_device *device); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 470a011d6fa4..97a999f9e4d8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1662,7 +1662,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { - return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; + size_t copy_sz; + + copy_sz = min_t(size_t, len, udata->outlen); + return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0; } /** diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 26daf55ff76e..e8a96071e352 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -90,8 +90,9 @@ enum { }; enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_EX_CMD_DESTROY_FLOW + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, }; /* @@ -201,6 +202,17 @@ struct ib_uverbs_query_device_resp { __u8 reserved[4]; }; +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 reserved; +}; + struct ib_uverbs_query_port { __u64 response; __u8 port_num; -- cgit v1.2.3 From 860f10a799c83e38a69d5a69d80da5312a4c4106 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 11 Dec 2014 17:04:16 +0200 Subject: IB/core: Add flags for on demand paging support * Add a configuration option for enable on-demand paging support in the infiniband subsystem (CONFIG_INFINIBAND_ON_DEMAND_PAGING). In a later patch, this configuration option will select the MMU_NOTIFIER configuration option to enable mmu notifiers. * Add a flag for on demand paging (ODP) support in the IB device capabilities. * Add a flag to request ODP MR in the access flags to reg_mr. * Fail registrations done with the ODP flag when the low-level driver doesn't support this. * Change the conditions in which an MR will be writable to explicitly specify the access flags. This is to avoid making an MR writable just because it is an ODP MR. * Add a ODP capabilities to the extended query device verb. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 10 ++++++++++ drivers/infiniband/core/umem.c | 8 +++++--- drivers/infiniband/core/uverbs_cmd.c | 25 +++++++++++++++++++++++++ include/rdma/ib_verbs.h | 28 ++++++++++++++++++++++++++-- include/uapi/rdma/ib_user_verbs.h | 15 +++++++++++++++ 5 files changed, 81 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 77089399359b..089a2c2af329 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -38,6 +38,16 @@ config INFINIBAND_USER_MEM depends on INFINIBAND_USER_ACCESS != n default y +config INFINIBAND_ON_DEMAND_PAGING + bool "InfiniBand on-demand paging support" + depends on INFINIBAND_USER_MEM + default y + ---help--- + On demand paging support for the InfiniBand subsystem. + Together with driver support this allows registration of + memory regions without pinning their pages, fetching the + pages on demand instead. + config INFINIBAND_ADDR_TRANS bool depends on INFINIBAND diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 6f152628e0d2..c328e4693d14 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -107,13 +107,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->page_size = PAGE_SIZE; umem->pid = get_task_pid(current, PIDTYPE_PID); /* - * We ask for writable memory if any access flags other than - * "remote read" are set. "Local write" and "remote write" + * We ask for writable memory if any of the following + * access flags are set. "Local write" and "remote write" * obviously require write access. "Remote atomic" can do * things like fetch and add, which will modify memory, and * "MW bind" can change permissions by binding a window. */ - umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); + umem->writable = !!(access & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c7a43624c96b..f9326ccda4b5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -953,6 +953,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, goto err_free; } + if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { + struct ib_device_attr attr; + + ret = ib_query_device(pd->device, &attr); + if (ret || !(attr.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { + pr_debug("ODP support not available\n"); + ret = -EINVAL; + goto err_put; + } + } + mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, cmd.access_flags, &udata); if (IS_ERR(mr)) { @@ -3289,6 +3301,19 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, copy_query_dev_fields(file, &resp.base, &attr); resp.comp_mask = 0; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) { + resp.odp_caps.general_caps = attr.odp_caps.general_caps; + resp.odp_caps.per_transport_caps.rc_odp_caps = + attr.odp_caps.per_transport_caps.rc_odp_caps; + resp.odp_caps.per_transport_caps.uc_odp_caps = + attr.odp_caps.per_transport_caps.uc_odp_caps; + resp.odp_caps.per_transport_caps.ud_odp_caps = + attr.odp_caps.per_transport_caps.ud_odp_caps; + resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP; + } +#endif + err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); if (err) return err; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 97a999f9e4d8..a41bc5a39ebf 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -123,7 +123,8 @@ enum ib_device_cap_flags { IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), - IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) + IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), + IB_DEVICE_ON_DEMAND_PAGING = (1<<31), }; enum ib_signature_prot_cap { @@ -143,6 +144,27 @@ enum ib_atomic_cap { IB_ATOMIC_GLOB }; +enum ib_odp_general_cap_bits { + IB_ODP_SUPPORT = 1 << 0, +}; + +enum ib_odp_transport_cap_bits { + IB_ODP_SUPPORT_SEND = 1 << 0, + IB_ODP_SUPPORT_RECV = 1 << 1, + IB_ODP_SUPPORT_WRITE = 1 << 2, + IB_ODP_SUPPORT_READ = 1 << 3, + IB_ODP_SUPPORT_ATOMIC = 1 << 4, +}; + +struct ib_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; @@ -186,6 +208,7 @@ struct ib_device_attr { u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; + struct ib_odp_caps odp_caps; }; enum ib_mtu { @@ -1073,7 +1096,8 @@ enum ib_access_flags { IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), - IB_ZERO_BASED = (1<<5) + IB_ZERO_BASED = (1<<5), + IB_ACCESS_ON_DEMAND = (1<<6), }; struct ib_phys_buf { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index e8a96071e352..4275b961bf60 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -202,15 +202,30 @@ struct ib_uverbs_query_device_resp { __u8 reserved[4]; }; +enum { + IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0, +}; + struct ib_uverbs_ex_query_device { __u32 comp_mask; __u32 reserved; }; +struct ib_uverbs_odp_caps { + __u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_query_device_resp base; __u32 comp_mask; __u32 reserved; + struct ib_uverbs_odp_caps odp_caps; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From 5eea84f478537de769330079dc068414f9d417f4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 16 Dec 2014 15:05:01 +0200 Subject: if_tun: add TUNSETVNETLE/TUNGETVNETLE ifreq flags field is only 16 bit wide, so setting IFF_VNET_LE there has no effect: doesn't fit in two bytes. The tests passed apparently because they have an even number of bugs, all cancelling out. Luckily we didn't release a kernel with this flag, so it's not too late to fix this. Add TUNSETVNETLE/TUNGETVNETLE to really achieve the purpose of IFF_VNET_LE. This has an added benefit that if we ever want a BE flag, we won't have to deal with weird configurations like setting both LE and BE at the same time. IFF_VNET_LE will be dropped in a follow-up patch. Reported-by: Dan Carpenter Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 18b2403982f9..274630caa276 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -48,6 +48,8 @@ #define TUNSETQUEUE _IOW('T', 217, int) #define TUNSETIFINDEX _IOW('T', 218, unsigned int) #define TUNGETFILTER _IOR('T', 219, struct sock_fprog) +#define TUNSETVNETLE _IOW('T', 220, int) +#define TUNGETVNETLE _IOR('T', 221, int) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3 From 9c6ab1931fd6198eab61d9d59aff9a1014637ace Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 16 Dec 2014 15:05:23 +0200 Subject: if_tun: drop broken IFF_VNET_LE Everyone should use TUNSETVNETLE/TUNGETVNETLE instead. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 274630caa276..50ae24335444 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -59,7 +59,6 @@ #define IFF_ONE_QUEUE 0x2000 #define IFF_VNET_HDR 0x4000 #define IFF_TUN_EXCL 0x8000 -#define IFF_VNET_LE 0x10000 #define IFF_MULTI_QUEUE 0x0100 #define IFF_ATTACH_QUEUE 0x0200 #define IFF_DETACH_QUEUE 0x0400 -- cgit v1.2.3 From f2d27e4519e5373767815d2050293c628242e34f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 5 Dec 2014 10:02:47 -0300 Subject: [media] v4l2-mediabus.h: use two __u16 instead of two __u32 The ycbcr_enc and quantization fields do not need a __u32. Switch to two __u16 types, thus preserving alignment and avoiding holes in the struct. This makes one more __u32 available for future expansion. Suggested by Sakari Ailus. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/v4l2-mediabus.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h index 5a86d8ede09c..26db20647e6f 100644 --- a/include/uapi/linux/v4l2-mediabus.h +++ b/include/uapi/linux/v4l2-mediabus.h @@ -31,9 +31,9 @@ struct v4l2_mbus_framefmt { __u32 code; __u32 field; __u32 colorspace; - __u32 ycbcr_enc; - __u32 quantization; - __u32 reserved[5]; + __u16 ycbcr_enc; + __u16 quantization; + __u32 reserved[6]; }; #ifndef __KERNEL__ -- cgit v1.2.3 From 3875f15207f9ecb3f24a8e91e7ad196899139595 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Thu, 18 Dec 2014 12:57:14 -0500 Subject: uapi/linux/target_core_user.h: fix headers_install.sh badness scripts/headers_install.sh will transform __packed to __attribute__((packed)), so the #ifndef is not necessary. (and, in fact, it's problematic, because we'll end up with the header containing: #ifndef __attribute__((packed)) #define __attribu... and so forth.) Cc: stable@vger.kernel.org # 3.18 Signed-off-by: Kyle McMartin Signed-off-by: Nicholas Bellinger --- include/uapi/linux/target_core_user.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index 7dcfbe6771b1..b483d1909d3e 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -6,10 +6,6 @@ #include #include -#ifndef __packed -#define __packed __attribute__((packed)) -#endif - #define TCMU_VERSION "1.0" /* -- cgit v1.2.3 From 6d08acd2d32e3e877579315dc3202d7a5f336d98 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Sat, 20 Dec 2014 12:15:49 -0800 Subject: in6: fix conflict with glibc Resolve conflicts between glibc definition of IPV6 socket options and those defined in Linux headers. Looks like earlier efforts to solve this did not cover all the definitions. It resolves warnings during iproute2 build. Please consider for stable as well. Signed-off-by: Stephen Hemminger Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 3 ++- include/uapi/linux/libc-compat.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 74a2a1773494..79b12b004ade 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -149,7 +149,7 @@ struct in6_flowlabel_req { /* * IPV6 socket options */ - +#if __UAPI_DEF_IPV6_OPTIONS #define IPV6_ADDRFORM 1 #define IPV6_2292PKTINFO 2 #define IPV6_2292HOPOPTS 3 @@ -196,6 +196,7 @@ struct in6_flowlabel_req { #define IPV6_IPSEC_POLICY 34 #define IPV6_XFRM_POLICY 35 +#endif /* * Multicast: diff --git a/include/uapi/linux/libc-compat.h b/include/uapi/linux/libc-compat.h index c140620dad92..e28807ad17fa 100644 --- a/include/uapi/linux/libc-compat.h +++ b/include/uapi/linux/libc-compat.h @@ -69,6 +69,7 @@ #define __UAPI_DEF_SOCKADDR_IN6 0 #define __UAPI_DEF_IPV6_MREQ 0 #define __UAPI_DEF_IPPROTO_V6 0 +#define __UAPI_DEF_IPV6_OPTIONS 0 #else @@ -82,6 +83,7 @@ #define __UAPI_DEF_SOCKADDR_IN6 1 #define __UAPI_DEF_IPV6_MREQ 1 #define __UAPI_DEF_IPPROTO_V6 1 +#define __UAPI_DEF_IPV6_OPTIONS 1 #endif /* _NETINET_IN_H */ @@ -103,6 +105,7 @@ #define __UAPI_DEF_SOCKADDR_IN6 1 #define __UAPI_DEF_IPV6_MREQ 1 #define __UAPI_DEF_IPPROTO_V6 1 +#define __UAPI_DEF_IPV6_OPTIONS 1 /* Definitions for xattr.h */ #define __UAPI_DEF_XATTR 1 -- cgit v1.2.3 From b28e0506fafd9c987bba7a6a71ea02a37fcabdea Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 25 Dec 2014 16:03:27 +0200 Subject: virtio_ring: document alignment requirements Host needs to know vring element alignment requirements: simply doing alignof on structures doesn't work reliably: on some platforms gcc has alignof(uint32_t) == 2. Add macros for alignment as specified in virtio 1.0 cs01, export them to userspace as well. Acked-by: Rusty Russell Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/virtio_ring.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 61c818a7fe70..a3318f31e8e7 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -101,6 +101,13 @@ struct vring { struct vring_used *used; }; +/* Alignment requirements for vring elements. + * When using pre-virtio 1.0 layout, these fall out naturally. + */ +#define VRING_AVAIL_ALIGN_SIZE 2 +#define VRING_USED_ALIGN_SIZE 4 +#define VRING_DESC_ALIGN_SIZE 16 + /* The standard layout for the ring is a continuous chunk of memory which looks * like this. We assume num is a power of 2. * -- cgit v1.2.3 From b81c55db1053805866a242cd0bfbfb0c60c499b3 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 29 Dec 2014 15:24:25 +0200 Subject: drm/amdkfd: reformat IOCTL definitions to drm-style MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch reformats the ioctl definitions in kfd_ioctl.h to be similar to the drm ioctls definition style. v2: Renamed KFD_COMMAND_(START|END) to AMDKFD_... Signed-off-by: Oded Gabbay Acked-by: Christian König --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 15 +++++++------ include/uapi/linux/kfd_ioctl.h | 37 +++++++++++++++++++------------- 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 249f4921f4a8..6fbde9e411e0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -523,35 +523,36 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) switch (cmd) { - case KFD_IOC_GET_VERSION: + case AMDKFD_IOC_GET_VERSION: retcode = kfd_ioctl_get_version(filep, process, kdata); break; - case KFD_IOC_CREATE_QUEUE: + + case AMDKFD_IOC_CREATE_QUEUE: retcode = kfd_ioctl_create_queue(filep, process, kdata); break; - case KFD_IOC_DESTROY_QUEUE: + case AMDKFD_IOC_DESTROY_QUEUE: retcode = kfd_ioctl_destroy_queue(filep, process, kdata); break; - case KFD_IOC_SET_MEMORY_POLICY: + case AMDKFD_IOC_SET_MEMORY_POLICY: retcode = kfd_ioctl_set_memory_policy(filep, process, kdata); break; - case KFD_IOC_GET_CLOCK_COUNTERS: + case AMDKFD_IOC_GET_CLOCK_COUNTERS: retcode = kfd_ioctl_get_clock_counters(filep, process, kdata); break; - case KFD_IOC_GET_PROCESS_APERTURES: + case AMDKFD_IOC_GET_PROCESS_APERTURES: retcode = kfd_ioctl_get_process_apertures(filep, process, kdata); break; - case KFD_IOC_UPDATE_QUEUE: + case AMDKFD_IOC_UPDATE_QUEUE: retcode = kfd_ioctl_update_queue(filep, process, kdata); break; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7acef41fc209..af94f31e33ac 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -128,27 +128,34 @@ struct kfd_ioctl_get_process_apertures_args { uint32_t pad; }; -#define KFD_IOC_MAGIC 'K' +#define AMDKFD_IOCTL_BASE 'K' +#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) +#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) +#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type) +#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type) -#define KFD_IOC_GET_VERSION \ - _IOR(KFD_IOC_MAGIC, 1, struct kfd_ioctl_get_version_args) +#define AMDKFD_IOC_GET_VERSION \ + AMDKFD_IOR(0x01, struct kfd_ioctl_get_version_args) -#define KFD_IOC_CREATE_QUEUE \ - _IOWR(KFD_IOC_MAGIC, 2, struct kfd_ioctl_create_queue_args) +#define AMDKFD_IOC_CREATE_QUEUE \ + AMDKFD_IOWR(0x02, struct kfd_ioctl_create_queue_args) -#define KFD_IOC_DESTROY_QUEUE \ - _IOWR(KFD_IOC_MAGIC, 3, struct kfd_ioctl_destroy_queue_args) +#define AMDKFD_IOC_DESTROY_QUEUE \ + AMDKFD_IOWR(0x03, struct kfd_ioctl_destroy_queue_args) -#define KFD_IOC_SET_MEMORY_POLICY \ - _IOW(KFD_IOC_MAGIC, 4, struct kfd_ioctl_set_memory_policy_args) +#define AMDKFD_IOC_SET_MEMORY_POLICY \ + AMDKFD_IOW(0x04, struct kfd_ioctl_set_memory_policy_args) -#define KFD_IOC_GET_CLOCK_COUNTERS \ - _IOWR(KFD_IOC_MAGIC, 5, struct kfd_ioctl_get_clock_counters_args) +#define AMDKFD_IOC_GET_CLOCK_COUNTERS \ + AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args) -#define KFD_IOC_GET_PROCESS_APERTURES \ - _IOR(KFD_IOC_MAGIC, 6, struct kfd_ioctl_get_process_apertures_args) +#define AMDKFD_IOC_GET_PROCESS_APERTURES \ + AMDKFD_IOR(0x06, struct kfd_ioctl_get_process_apertures_args) -#define KFD_IOC_UPDATE_QUEUE \ - _IOW(KFD_IOC_MAGIC, 7, struct kfd_ioctl_update_queue_args) +#define AMDKFD_IOC_UPDATE_QUEUE \ + AMDKFD_IOW(0x07, struct kfd_ioctl_update_queue_args) + +#define AMDKFD_COMMAND_START 0x01 +#define AMDKFD_COMMAND_END 0x08 #endif -- cgit v1.2.3 From 75069f2b5bfb5164beafaf3da597279c25b5535a Mon Sep 17 00:00:00 2001 From: David Drysdale Date: Thu, 8 Jan 2015 14:32:29 -0800 Subject: vfs: renumber FMODE_NONOTIFY and add to uniqueness check Fix clashing values for O_PATH and FMODE_NONOTIFY on sparc. The clashing O_PATH value was added in commit 5229645bdc35 ("vfs: add nonconflicting values for O_PATH") but this can't be changed as it is user-visible. FMODE_NONOTIFY is only used internally in the kernel, but it is in the same numbering space as the other O_* flags, as indicated by the comment at the top of include/uapi/asm-generic/fcntl.h (and its use in fs/notify/fanotify/fanotify_user.c). So renumber it to avoid the clash. All of this has happened before (commit 12ed2e36c98a: "fanotify: FMODE_NONOTIFY and __O_SYNC in sparc conflict"), and all of this will happen again -- so update the uniqueness check in fcntl_init() to include __FMODE_NONOTIFY. Signed-off-by: David Drysdale Acked-by: David S. Miller Acked-by: Jan Kara Cc: Heinrich Schuchardt Cc: Alexander Viro Cc: Arnd Bergmann Cc: Stephen Rothwell Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 5 +++-- include/linux/fs.h | 2 +- include/uapi/asm-generic/fcntl.h | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/fs/fcntl.c b/fs/fcntl.c index 99d440a4a6ba..ee85cd4e136a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -740,14 +740,15 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE + __FMODE_EXEC | O_PATH | __O_TMPFILE | + __FMODE_NONOTIFY )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/include/linux/fs.h b/include/linux/fs.h index f90c0282c114..42efe13077b6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -135,7 +135,7 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_CAN_WRITE ((__force fmode_t)0x40000) /* File was opened by fanotify and shouldn't generate fanotify events */ -#define FMODE_NONOTIFY ((__force fmode_t)0x1000000) +#define FMODE_NONOTIFY ((__force fmode_t)0x4000000) /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 7543b3e51331..e063effe0cc1 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -5,7 +5,7 @@ /* * FMODE_EXEC is 0x20 - * FMODE_NONOTIFY is 0x1000000 + * FMODE_NONOTIFY is 0x4000000 * These cannot be used by userspace O_* until internal and external open * flags are split. * -Eric Paris -- cgit v1.2.3